In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load dataset
drift = pd.read_csv('data/drift_data.csv')

# Prepare full dataset: every column except 'outcome' is a feature
X_full = drift.drop(columns=['outcome'])
y_full = drift['outcome']

# Split into train and test sets BEFORE scaling. Fitting the scaler on the
# full dataset would let the held-out rows influence the mean/variance used
# to transform the training data (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)

# Standardize the features with statistics learned from the training rows
# only, then apply that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Retained so existing references to this name keep working. It is now scaled
# with the training-set statistics rather than with statistics of the full data.
X_full_scaled = scaler.transform(X_full)

# Hand-picked candidate settings for XGBoost and Random Forest.
# Each entry is one complete configuration (these are explicit candidates,
# not a cross-product grid).
param_grid_xgb = [
    dict(learning_rate=0.01, max_depth=3, subsample=0.6, reg_alpha=0.1, reg_lambda=0.1),
    dict(learning_rate=0.05, max_depth=5, subsample=0.8, reg_alpha=0.1, reg_lambda=1),
    dict(learning_rate=0.1, max_depth=7, subsample=1.0, reg_alpha=0, reg_lambda=0.1),
    dict(learning_rate=0.2, max_depth=5, subsample=0.8, reg_alpha=1, reg_lambda=1),
    dict(learning_rate=0.3, max_depth=3, subsample=0.6, reg_alpha=0.1, reg_lambda=0.1),
]

param_grid_rf = [
    dict(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1),
    dict(n_estimators=100, max_depth=20, min_samples_split=5, min_samples_leaf=2),
    dict(n_estimators=200, max_depth=10, min_samples_split=2, min_samples_leaf=1),
    dict(n_estimators=200, max_depth=20, min_samples_split=5, min_samples_leaf=1),
    dict(n_estimators=150, max_depth=15, min_samples_split=3, min_samples_leaf=1),
]

# Baseline estimators with library-default hyperparameters, keyed by display name.
# The tuning loops later in this cell construct their own estimators rather
# than reading from this dict.
models = {}
models["XGBoost"] = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
models["Random Forest"] = RandomForestRegressor(random_state=42)

# Train models with different hyperparameters and store results
def _fit_and_score(model, model_name, params, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score one candidate configuration.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        A freshly constructed, unfitted estimator.
    model_name : str
        Label stored in the "Model" column.
    params : dict
        Hyperparameters the estimator was built with; stored unchanged.
    X_train, y_train, X_test, y_test
        The train/test split used for fitting and scoring.

    Returns
    -------
    dict
        One record with keys "Hyperparameters", "Train RMSE", "Test RMSE",
        "R²" (computed on the test split) and "Model".
    """
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    return {
        "Hyperparameters": params,
        "Train RMSE": np.sqrt(mean_squared_error(y_train, y_train_pred)),
        "Test RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
        "R²": r2_score(y_test, y_test_pred),
        "Model": model_name,
    }


# (label, estimator factory, candidate settings) per model family. A factory is
# used so that every candidate gets its own unfitted estimator instance.
model_families = [
    (
        "XGBoost",
        lambda params: xgb.XGBRegressor(objective="reg:squarederror", random_state=42, **params),
        param_grid_xgb,
    ),
    (
        "Random Forest",
        lambda params: RandomForestRegressor(random_state=42, **params),
        param_grid_rf,
    ),
]

# Row order matches the family order above: all XGBoost candidates, then all
# Random Forest candidates.
results = [
    _fit_and_score(build_model(params), model_name, params, X_train, y_train, X_test, y_test)
    for model_name, build_model, candidate_params in model_families
    for params in candidate_params
]

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\n🔧 Hyperparameter Tuning Results:")
results_df



🔧 Hyperparameter Tuning Results:


Unnamed: 0,Hyperparameters,Train RMSE,Test RMSE,R²,Model
0,"{'learning_rate': 0.01, 'max_depth': 3, 'subsa...",0.361312,0.369434,0.563701,XGBoost
1,"{'learning_rate': 0.05, 'max_depth': 5, 'subsa...",0.080846,0.126258,0.94904,XGBoost
2,"{'learning_rate': 0.1, 'max_depth': 7, 'subsam...",0.021335,0.112267,0.959709,XGBoost
3,"{'learning_rate': 0.2, 'max_depth': 5, 'subsam...",0.051304,0.092952,0.97238,XGBoost
4,"{'learning_rate': 0.3, 'max_depth': 3, 'subsam...",0.062154,0.082635,0.978171,XGBoost
5,"{'n_estimators': 100, 'max_depth': 10, 'min_sa...",0.088816,0.178725,0.897887,Random Forest
6,"{'n_estimators': 100, 'max_depth': 20, 'min_sa...",0.074677,0.173034,0.904287,Random Forest
7,"{'n_estimators': 200, 'max_depth': 10, 'min_sa...",0.088145,0.17885,0.897744,Random Forest
8,"{'n_estimators': 200, 'max_depth': 20, 'min_sa...",0.070744,0.172718,0.904636,Random Forest
9,"{'n_estimators': 150, 'max_depth': 15, 'min_sa...",0.065686,0.172339,0.905054,Random Forest


In [5]:
# Pick the configuration with the smallest "Test RMSE".
# NOTE(review): the winner is chosen on the same test split whose score is then
# reported, so the reported Test RMSE / R² are optimistic as a held-out estimate.
best_row_label = results_df["Test RMSE"].idxmin()
best_model_row = results_df.loc[best_row_label]
best_model_name = best_model_row["Model"]
best_model_params = best_model_row["Hyperparameters"]

# One-row summary table: summary column label -> results_df column it is read from
summary_sources = {
    "Best Model": "Model",
    "Best Hyperparameters": "Hyperparameters",
    "Train RMSE": "Train RMSE",
    "Test RMSE": "Test RMSE",
    "R²": "R²",
}
best_model_summary = pd.DataFrame(
    {label: [best_model_row[source]] for label, source in summary_sources.items()}
)

print("\n🏆 Best Model Summary:")
best_model_summary


🏆 Best Model Summary:


Unnamed: 0,Best Model,Best Hyperparameters,Train RMSE,Test RMSE,R²
0,XGBoost,"{'learning_rate': 0.3, 'max_depth': 3, 'subsam...",0.062154,0.082635,0.978171


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
drift_data = pd.read_csv('data/drift_data.csv')

# Split features & target: every column except 'outcome' is a feature
X = drift_data.drop(columns=['outcome'])
y = drift_data['outcome']

# Train-test split, done BEFORE scaling. Fitting the scaler on all rows would
# let the held-out rows influence the mean/variance applied to the training
# data (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features with statistics learned from the training rows only,
# then apply that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Retained so existing references to this name keep working. It is now scaled
# with the training-set statistics rather than with statistics of the full data.
X_scaled = scaler.transform(X)

# Hand-picked configurations for manual testing. The "model" entry selects the
# estimator family; the remaining entries are that estimator's hyperparameters.
manual_params = [
    # Random Forest candidates
    dict(model="Random Forest", n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1),
    dict(model="Random Forest", n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2),
    dict(model="Random Forest", n_estimators=150, max_depth=12, min_samples_split=3, min_samples_leaf=1),
    dict(model="Random Forest", n_estimators=250, max_depth=20, min_samples_split=2, min_samples_leaf=2),
    dict(model="Random Forest", n_estimators=300, max_depth=25, min_samples_split=5, min_samples_leaf=1),
    # XGBoost candidates
    dict(model="XGBoost", n_estimators=100, learning_rate=0.01, max_depth=3, subsample=0.8, reg_alpha=0, reg_lambda=1),
    dict(model="XGBoost", n_estimators=200, learning_rate=0.1, max_depth=5, subsample=1.0, reg_alpha=1, reg_lambda=1),
    dict(model="XGBoost", n_estimators=150, learning_rate=0.05, max_depth=4, subsample=0.9, reg_alpha=0.5, reg_lambda=1),
    dict(model="XGBoost", n_estimators=250, learning_rate=0.2, max_depth=6, subsample=0.8, reg_alpha=1, reg_lambda=2),
    dict(model="XGBoost", n_estimators=300, learning_rate=0.05, max_depth=7, subsample=0.7, reg_alpha=0.5, reg_lambda=2),
]

# One result row is collected per manual configuration
manual_results = []

# Hyperparameter names forwarded to each estimator's constructor
rf_keys = ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf")
xgb_keys = ("n_estimators", "learning_rate", "max_depth", "subsample", "reg_alpha", "reg_lambda")

for params in manual_params:
    if params["model"] != "Random Forest":
        # Any label other than "Random Forest" is treated as XGBoost
        model = xgb.XGBRegressor(
            objective="reg:squarederror",
            random_state=42,
            **{key: params[key] for key in xgb_keys},
        )
    else:
        model = RandomForestRegressor(
            random_state=42,
            **{key: params[key] for key in rf_keys},
        )

    # Fit on the training split, then predict on both splits
    model.fit(X_train, y_train)
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

    # RMSE on both splits; R² on the test split only
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    r2 = r2_score(y_test, y_test_pred)

    # The full params dict (including its "model" entry) is stored as the
    # "Hyperparameters" value; the label is repeated in the "Model" column.
    result_row = [params, train_rmse, test_rmse, r2, params["model"]]
    manual_results.append(result_row)

# Convert to DataFrame
result_columns = ["Hyperparameters", "Train RMSE", "Test RMSE", "R²", "Model"]
manual_results_df = pd.DataFrame(manual_results, columns=result_columns)
print("Manual Hyperparameter Testing Results:")
print(manual_results_df)

# ---- GridSearchCV ---- #
# Exhaustive search spaces (every combination of the listed values is tried).
# NOTE: these two names are also used earlier in the notebook for *lists* of
# candidate dicts; here they are rebound to GridSearchCV-style dicts of lists.
param_grid_rf = {  # 3 * 3 * 2 * 2 = 36 combinations
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

param_grid_xgb = {  # 3 * 3 * 3 * 3 * 3 * 2 = 486 combinations
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.05],
    "max_depth": [3, 5, 7],
    "subsample": [0.7, 0.8, 1.0],
    "reg_alpha": [0, 0.5, 1],
    "reg_lambda": [1, 2]
}

# model name -> {"Best Hyperparameters", "Test RMSE", "R²"}
grid_results = {}

for model_name, param_grid in [("Random Forest", param_grid_rf), ("XGBoost", param_grid_xgb)]:
    if model_name == "Random Forest":
        model = RandomForestRegressor(random_state=42)
    else:
        # GridSearchCV below already fans out across all cores (n_jobs=-1).
        # XGBoost's documentation warns that also letting the estimator use
        # its own threads causes thread contention, so it is pinned to one
        # thread here; parallelism comes from the search instead.
        model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, n_jobs=1)

    # 5-fold CV on the training split; with cv=5 each combination is fitted
    # five times (e.g. 486 * 5 = 2430 fits for the XGBoost grid).
    grid_search = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Score the best estimator found by the search on the test split
    best_model = grid_search.best_estimator_
    y_test_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    r2 = r2_score(y_test, y_test_pred)

    grid_results[model_name] = {
        "Best Hyperparameters": grid_search.best_params_,
        "Test RMSE": rmse,
        "R²": r2
    }

# Convert to DataFrame: one row per model. Building it row-oriented (instead of
# constructing column-wise and transposing) lets pandas infer a dtype per
# column, so "Test RMSE" and "R²" are numeric rather than object.
grid_results_df = pd.DataFrame.from_dict(grid_results, orient="index")
print("\nGridSearch Best Model Results:")
print(grid_results_df)

# ---- Comparison of Manual Best Model vs GridSearch Best Model ---- #
# Take the lowest-"Test RMSE" row from each results table
manual_ranked = manual_results_df.sort_values(by="Test RMSE")
grid_ranked = grid_results_df.sort_values(by="Test RMSE")
best_manual_model = manual_ranked.iloc[0]
best_grid_model = grid_ranked.iloc[0]

# Single side-by-side record: hyperparameters, test RMSE and test R² of each winner
comparison_record = {
    "Manual Best Model": best_manual_model["Hyperparameters"],
    "GridSearch Best Model": best_grid_model["Best Hyperparameters"],
    "Manual RMSE": best_manual_model["Test RMSE"],
    "GridSearch RMSE": best_grid_model["Test RMSE"],
    "Manual R²": best_manual_model["R²"],
    "GridSearch R²": best_grid_model["R²"],
}
comparison_df = pd.DataFrame([comparison_record])

print("\nComparison Between Manual and GridSearch Best Model:")
print(comparison_df)


Manual Hyperparameter Testing Results:
                                     Hyperparameters  Train RMSE  Test RMSE  \
0  {'model': 'Random Forest', 'n_estimators': 100...    0.088816   0.178725   
1  {'model': 'Random Forest', 'n_estimators': 200...    0.073816   0.172568   
2  {'model': 'Random Forest', 'n_estimators': 150...    0.070478   0.173639   
3  {'model': 'Random Forest', 'n_estimators': 250...    0.071083   0.172012   
4  {'model': 'Random Forest', 'n_estimators': 300...    0.070473   0.172220   
5  {'model': 'XGBoost', 'n_estimators': 100, 'lea...    0.362983   0.372160   
6  {'model': 'XGBoost', 'n_estimators': 200, 'lea...    0.041842   0.086032   
7  {'model': 'XGBoost', 'n_estimators': 150, 'lea...    0.073831   0.110824   
8  {'model': 'XGBoost', 'n_estimators': 250, 'lea...    0.030718   0.097688   
9  {'model': 'XGBoost', 'n_estimators': 300, 'lea...    0.029696   0.093286   

         R²          Model  
0  0.897887  Random Forest  
1  0.904801  Random Forest  
2  0

KeyboardInterrupt: 