<a href="https://colab.research.google.com/github/Shah03-rgb/diya_bati_ss/blob/main/4model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/383.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m327.7/383.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/231.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/231.9 kB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import optuna

optuna.__version__

'4.2.1'

In [15]:
# fine tuned SVM, R²>0
# adjusting (c, epsilon, gamma)
# faster than adjusting kernel, but kernel gives a better R² value for SVR


import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
file_path = "/content/Diabetic_Merged.xlsx"
df = pd.read_excel(file_path)

# Convert Age at Diagnosis to numerical
def age_to_numeric(x):
    if pd.isna(x):
        return np.nan
    mapping = {
        'Under 18': 9,
        '18-34': (18 + 34) / 2,
        '35-44': (35 + 44) / 2,
        '45-54': (45 + 54) / 2,
        '55 or older': 60
    }
    return mapping.get(x, np.nan)

df['Age_at_Diagnosis_numeric'] = df['Age at Diagnosis'].apply(age_to_numeric)

# Drop 'Age at Diagnosis' since we have converted it
X = df.drop(columns=['Age at Diagnosis', 'Age_at_Diagnosis_numeric'])
y = df['Age_at_Diagnosis_numeric']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Engineering
def create_features(data):
    data = data.copy()
    data['BMI'] = data['Weight'] / ((data['Height'] / 100) ** 2)
    data['Weight_Height_Ratio'] = data['Weight'] / data['Height']

    def hba1c_to_numeric(x):
        if pd.isna(x):
            return np.nan
        mapping = {
            'Below 5.7%': 5.35,
            '5.7%-6.4%': 6.05,
            '6.5%-7.5%': 7.0,
            '7.6%-9.0%': 8.3,
            '9.1% or higher': 9.5
        }
        return mapping.get(x, np.nan)

    data['HbA1c_numeric'] = data['HbA1c Levels'].apply(hba1c_to_numeric)
    data['BMI_HbA1c_Interaction'] = data['BMI'] * data['HbA1c_numeric']
    data.drop(columns=['Height'], inplace=True, errors='ignore')
    return data

X_train = create_features(X_train)
X_test = create_features(X_test)

# Identify feature types
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
manual_features = ['BMI', 'HbA1c_numeric', 'Weight_Height_Ratio', 'BMI_HbA1c_Interaction']
numeric_features = [f for f in numeric_features if f not in manual_features]

# Preprocessing
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('manual', 'passthrough', manual_features)
])

# Hyperparameter Optimization using Optuna for XGBoost
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0)
    }
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**params, random_state=42, eval_metric='mae'))
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=30)
best_xgb_params = study_xgb.best_params

# Hyperparameter Optimization using Optuna for SVR
def objective_svr(trial):
    params = {
        "C": trial.suggest_loguniform("C", 0.1, 100),
        "epsilon": trial.suggest_loguniform("epsilon", 0.01, 1.0),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"])
    }
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', SVR(**params))
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study_svr = optuna.create_study(direction="minimize")
study_svr.optimize(objective_svr, n_trials=30)
best_svr_params = study_svr.best_params

# Train Models
xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**best_xgb_params, random_state=42, eval_metric='mae'))
])
xgb_model.fit(X_train, y_train)

rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, random_state=42))
])
rf_model.fit(X_train, y_train)

dt_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth=10, random_state=42))
])
dt_model.fit(X_train, y_train)

svr_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(**best_svr_params))
])
svr_model.fit(X_train, y_train)


# Predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_svr = svr_model.predict(X_test)

# Averaging Ensemble
ensemble_pred_avg = (y_pred_xgb + y_pred_rf + y_pred_dt + y_pred_svr) / 4

# Stacking Ensemble
stack_train = np.column_stack((y_pred_xgb, y_pred_rf, y_pred_dt, y_pred_svr))
meta_model = Ridge()
meta_model.fit(stack_train, y_test)
stack_pred = meta_model.predict(stack_train)

# Evaluation
def evaluate_model(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{name} Model Performance:")
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}")

evaluate_model("XGBoost", y_test, y_pred_xgb)
evaluate_model("Random Forest", y_test, y_pred_rf)
evaluate_model("Decision Tree", y_test, y_pred_dt)
evaluate_model("Optimized SVR", y_test, y_pred_svr)
evaluate_model("Averaging Ensemble", y_test, ensemble_pred_avg)
evaluate_model("Stacking Ensemble", y_test, stack_pred)


[I 2025-04-03 21:13:37,006] A new study created in memory with name: no-name-c6ab9ebb-7103-4d4c-afc8-1a700a99d301
[I 2025-04-03 21:13:37,330] Trial 0 finished with value: 9.636896981133354 and parameters: {'n_estimators': 400, 'learning_rate': 0.019634583617124474, 'max_depth': 4, 'subsample': 0.8090982425928877, 'colsample_bytree': 0.904499866126869, 'reg_alpha': 0.2599691133702963, 'reg_lambda': 3.69388876368703, 'min_child_weight': 1, 'gamma': 0.606425025369442}. Best is trial 0 with value: 9.636896981133354.
[I 2025-04-03 21:13:37,569] Trial 1 finished with value: 9.805224100748697 and parameters: {'n_estimators': 200, 'learning_rate': 0.013456643796372333, 'max_depth': 8, 'subsample': 0.7784003975999652, 'colsample_bytree': 0.9756776010833097, 'reg_alpha': 0.038507301928503546, 'reg_lambda': 3.4508731673434845, 'min_child_weight': 8, 'gamma': 1.6964401475791187}. Best is trial 0 with value: 9.636896981133354.
[I 2025-04-03 21:13:37,819] Trial 2 finished with value: 9.8261970237449


XGBoost Model Performance:
MAE: 8.98, RMSE: 10.86, R²: 0.28

Random Forest Model Performance:
MAE: 9.06, RMSE: 10.57, R²: 0.32

Decision Tree Model Performance:
MAE: 10.25, RMSE: 13.10, R²: -0.04

Optimized SVR Model Performance:
MAE: 9.85, RMSE: 12.46, R²: 0.05

Averaging Ensemble Model Performance:
MAE: 9.27, RMSE: 11.06, R²: 0.26

Stacking Ensemble Model Performance:
MAE: 7.82, RMSE: 9.37, R²: 0.47


In [16]:
# fine tuned SVM, R²>0
# adjusting (c, epsilon, kernel)
# way slower than adjusting gamma, but gives way better R² value for SVR


import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
file_path = "/content/Diabetic_Merged.xlsx"
df = pd.read_excel(file_path)

# Convert Age at Diagnosis to numerical
def age_to_numeric(x):
    if pd.isna(x):
        return np.nan
    mapping = {
        'Under 18': 9,
        '18-34': (18 + 34) / 2,
        '35-44': (35 + 44) / 2,
        '45-54': (45 + 54) / 2,
        '55 or older': 60
    }
    return mapping.get(x, np.nan)

df['Age_at_Diagnosis_numeric'] = df['Age at Diagnosis'].apply(age_to_numeric)

# Drop 'Age at Diagnosis' since we have converted it
X = df.drop(columns=['Age at Diagnosis', 'Age_at_Diagnosis_numeric'])
y = df['Age_at_Diagnosis_numeric']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Engineering
def create_features(data):
    data = data.copy()
    data['BMI'] = data['Weight'] / ((data['Height'] / 100) ** 2)
    data['Weight_Height_Ratio'] = data['Weight'] / data['Height']

    def hba1c_to_numeric(x):
        if pd.isna(x):
            return np.nan
        mapping = {
            'Below 5.7%': 5.35,
            '5.7%-6.4%': 6.05,
            '6.5%-7.5%': 7.0,
            '7.6%-9.0%': 8.3,
            '9.1% or higher': 9.5
        }
        return mapping.get(x, np.nan)

    data['HbA1c_numeric'] = data['HbA1c Levels'].apply(hba1c_to_numeric)
    data['BMI_HbA1c_Interaction'] = data['BMI'] * data['HbA1c_numeric']
    data.drop(columns=['Height'], inplace=True, errors='ignore')
    return data

X_train = create_features(X_train)
X_test = create_features(X_test)

# Identify feature types
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
manual_features = ['BMI', 'HbA1c_numeric', 'Weight_Height_Ratio', 'BMI_HbA1c_Interaction']
numeric_features = [f for f in numeric_features if f not in manual_features]

# Preprocessing
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('manual', 'passthrough', manual_features)
])

# Hyperparameter Optimization using Optuna for XGBoost
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0)
    }
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**params, random_state=42, eval_metric='mae'))
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=30)
best_xgb_params = study_xgb.best_params

# Hyperparameter Optimization using Optuna for SVR
def objective_svr(trial):
    params = {
    "C": trial.suggest_loguniform("C", 0.1, 100),
    "epsilon": trial.suggest_loguniform("epsilon", 0.01, 1.0),
    "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf"]),
  }

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', SVR(**params))
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study_svr = optuna.create_study(direction="minimize")
study_svr.optimize(objective_svr, n_trials=30)
best_svr_params = study_svr.best_params

# Train Models
xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**best_xgb_params, random_state=42, eval_metric='mae'))
])
xgb_model.fit(X_train, y_train)

rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, random_state=42))
])
rf_model.fit(X_train, y_train)

dt_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth=10, random_state=42))
])
dt_model.fit(X_train, y_train)

svr_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(**best_svr_params))
])
svr_model.fit(X_train, y_train)


# Predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_svr = svr_model.predict(X_test)

# Averaging Ensemble
ensemble_pred_avg = (y_pred_xgb + y_pred_rf + y_pred_dt + y_pred_svr) / 4

# Stacking Ensemble
stack_train = np.column_stack((y_pred_xgb, y_pred_rf, y_pred_dt, y_pred_svr))
meta_model = Ridge()
meta_model.fit(stack_train, y_test)
stack_pred = meta_model.predict(stack_train)

# Evaluation
def evaluate_model(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{name} Model Performance:")
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}")

evaluate_model("XGBoost", y_test, y_pred_xgb)
evaluate_model("Random Forest", y_test, y_pred_rf)
evaluate_model("Decision Tree", y_test, y_pred_dt)
evaluate_model("Optimized SVR", y_test, y_pred_svr)
evaluate_model("Averaging Ensemble", y_test, ensemble_pred_avg)
evaluate_model("Stacking Ensemble", y_test, stack_pred)


[I 2025-04-03 21:19:19,320] A new study created in memory with name: no-name-1645898b-4bf7-492a-bf08-4fa0f3168f86
[I 2025-04-03 21:19:19,427] Trial 0 finished with value: 9.58360304655852 and parameters: {'n_estimators': 150, 'learning_rate': 0.04659543484234555, 'max_depth': 3, 'subsample': 0.7035374098908042, 'colsample_bytree': 0.8347971299947181, 'reg_alpha': 2.1799620804933215, 'reg_lambda': 3.2194692520547448, 'min_child_weight': 10, 'gamma': 2.167894094430678}. Best is trial 0 with value: 9.58360304655852.
[I 2025-04-03 21:19:19,664] Trial 1 finished with value: 10.091962920294868 and parameters: {'n_estimators': 350, 'learning_rate': 0.08062709560136978, 'max_depth': 8, 'subsample': 0.7455080268904979, 'colsample_bytree': 0.9826608106187368, 'reg_alpha': 0.016505573844816296, 'reg_lambda': 0.0650511168269808, 'min_child_weight': 10, 'gamma': 3.7086977101367014}. Best is trial 0 with value: 9.58360304655852.
[I 2025-04-03 21:19:20,115] Trial 2 finished with value: 9.458517992938


XGBoost Model Performance:
MAE: 9.15, RMSE: 10.92, R²: 0.27

Random Forest Model Performance:
MAE: 9.06, RMSE: 10.57, R²: 0.32

Decision Tree Model Performance:
MAE: 10.25, RMSE: 13.10, R²: -0.04

Optimized SVR Model Performance:
MAE: 9.09, RMSE: 11.04, R²: 0.26

Averaging Ensemble Model Performance:
MAE: 8.95, RMSE: 10.71, R²: 0.30

Stacking Ensemble Model Performance:
MAE: 7.96, RMSE: 9.63, R²: 0.44


In [20]:
#trying to get a positive R² value for the Decision Tree Regressor



import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
file_path = "/content/Diabetic_Merged.xlsx"
df = pd.read_excel(file_path)

# Convert Age at Diagnosis to numerical
def age_to_numeric(x):
    if pd.isna(x):
        return np.nan
    mapping = {
        'Under 18': 9,
        '18-34': (18 + 34) / 2,
        '35-44': (35 + 44) / 2,
        '45-54': (45 + 54) / 2,
        '55 or older': 60
    }
    return mapping.get(x, np.nan)

df['Age_at_Diagnosis_numeric'] = df['Age at Diagnosis'].apply(age_to_numeric)

# Drop 'Age at Diagnosis' since we have converted it
X = df.drop(columns=['Age at Diagnosis', 'Age_at_Diagnosis_numeric'])
y = df['Age_at_Diagnosis_numeric']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Engineering
def create_features(data):
    data = data.copy()
    data['BMI'] = data['Weight'] / ((data['Height'] / 100) ** 2)
    data['Weight_Height_Ratio'] = data['Weight'] / data['Height']

    def hba1c_to_numeric(x):
        if pd.isna(x):
            return np.nan
        mapping = {
            'Below 5.7%': 5.35,
            '5.7%-6.4%': 6.05,
            '6.5%-7.5%': 7.0,
            '7.6%-9.0%': 8.3,
            '9.1% or higher': 9.5
        }
        return mapping.get(x, np.nan)

    data['HbA1c_numeric'] = data['HbA1c Levels'].apply(hba1c_to_numeric)
    data['BMI_HbA1c_Interaction'] = data['BMI'] * data['HbA1c_numeric']
    data.drop(columns=['Height'], inplace=True, errors='ignore')
    return data

X_train = create_features(X_train)
X_test = create_features(X_test)

# Identify feature types
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
manual_features = ['BMI', 'HbA1c_numeric', 'Weight_Height_Ratio', 'BMI_HbA1c_Interaction']
numeric_features = [f for f in numeric_features if f not in manual_features]

# Preprocessing
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('manual', 'passthrough', manual_features)
])

# Hyperparameter Optimization using Optuna for XGBoost
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0)
    }
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**params, random_state=42, eval_metric='mae'))
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=30)
best_xgb_params = study_xgb.best_params

# Hyperparameter Optimization using Optuna for SVR
def objective_svr(trial):
    params = {
    "C": trial.suggest_loguniform("C", 0.1, 100),
    "epsilon": trial.suggest_loguniform("epsilon", 0.01, 1.0),
    "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf"]),
  }

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', SVR(**params))
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study_svr = optuna.create_study(direction="minimize")
study_svr.optimize(objective_svr, n_trials=30)
best_svr_params = study_svr.best_params

def objective_dt(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion": trial.suggest_categorical("criterion", ["squared_error", "absolute_error"])
    }
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', DecisionTreeRegressor(**params, random_state=42))
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study_dt = optuna.create_study(direction="minimize")
study_dt.optimize(objective_dt, n_trials=30)
best_dt_params = study_dt.best_params


# Train Models
xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**best_xgb_params, random_state=42, eval_metric='mae'))
])
xgb_model.fit(X_train, y_train)

rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, random_state=42))
])
rf_model.fit(X_train, y_train)

dt_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth=10, random_state=42))
])
dt_model.fit(X_train, y_train)

svr_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(**best_svr_params))
])
svr_model.fit(X_train, y_train)


# Predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_svr = svr_model.predict(X_test)

# Averaging Ensemble
ensemble_pred_avg = (y_pred_xgb + y_pred_rf + y_pred_dt + y_pred_svr) / 4

# Stacking Ensemble
stack_train = np.column_stack((y_pred_xgb, y_pred_rf, y_pred_dt, y_pred_svr))
meta_model = Ridge()
meta_model.fit(stack_train, y_test)
stack_pred = meta_model.predict(stack_train)

# Evaluation
def evaluate_model(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{name} Model Performance:")
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}")

evaluate_model("XGBoost", y_test, y_pred_xgb)
evaluate_model("Random Forest", y_test, y_pred_rf)
evaluate_model("Decision Tree", y_test, y_pred_dt)
evaluate_model("Optimized SVR", y_test, y_pred_svr)
evaluate_model("Averaging Ensemble", y_test, ensemble_pred_avg)
evaluate_model("Stacking Ensemble", y_test, stack_pred)


[I 2025-04-03 22:09:25,838] A new study created in memory with name: no-name-3d72d7d0-9ac7-48a8-9664-2f762c810880
[I 2025-04-03 22:09:27,001] Trial 0 finished with value: 9.304265481454355 and parameters: {'n_estimators': 450, 'learning_rate': 0.07402971525796094, 'max_depth': 10, 'subsample': 0.8333653552438517, 'colsample_bytree': 0.9159564827199469, 'reg_alpha': 2.772841779191908, 'reg_lambda': 0.023884784619795665, 'min_child_weight': 7, 'gamma': 0.17133619025293312}. Best is trial 0 with value: 9.304265481454355.
[I 2025-04-03 22:09:28,290] Trial 1 finished with value: 9.270616107516819 and parameters: {'n_estimators': 450, 'learning_rate': 0.024786387903279423, 'max_depth': 8, 'subsample': 0.6334145922810545, 'colsample_bytree': 0.6743515276657885, 'reg_alpha': 0.044074520977244146, 'reg_lambda': 0.015300812894273071, 'min_child_weight': 1, 'gamma': 0.25648056898698535}. Best is trial 1 with value: 9.270616107516819.
[I 2025-04-03 22:09:28,505] Trial 2 finished with value: 10.176


XGBoost Model Performance:
MAE: 9.09, RMSE: 10.72, R²: 0.30

Random Forest Model Performance:
MAE: 9.06, RMSE: 10.57, R²: 0.32

Decision Tree Model Performance:
MAE: 10.25, RMSE: 13.10, R²: -0.04

Optimized SVR Model Performance:
MAE: 9.14, RMSE: 11.08, R²: 0.25

Averaging Ensemble Model Performance:
MAE: 8.81, RMSE: 10.57, R²: 0.32

Stacking Ensemble Model Performance:
MAE: 7.86, RMSE: 9.78, R²: 0.42
