# Model tuning for tree algorithms

## Hyperparameter tuning

> Tune hyperparemeters to optimize ${R^2}$ metric.

Define models and initialize param_grids with hyperparemeters to be tuned for each model.

In [None]:
# control if using sampled or original data
from config import TRAIN_TEST_SPLIT_ELECTRIC_FILE, TRAIN_TEST_SPLIT_ELECTRIC_FILE_SAMPLED

is_sampled = True

In [110]:
# from sklearn.ensemble._forest import RandomForestRegressor, ExtraTreesRegressor
# from sklearn.tree._classes ExtraTreeRegressor, DecisionTreeRegressor
import sklearn

# Define models
models = {  
    'ExtraTreeRegressor': sklearn.tree._classes.ExtraTreeRegressor(),
    'ExtraTreesRegressor': sklearn.ensemble._forest.ExtraTreesRegressor(),
    'RandomForestRegressor': sklearn.ensemble._forest.RandomForestRegressor(),
    'DecisionTreeRegressor': sklearn.tree._classes.DecisionTreeRegressor(),
}

# Hyperparameters to be tested for each model
param_grids = {
    'ExtraTreeRegressor': {
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2']
    },
    'ExtraTreesRegressor': {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2']
    },
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2']
    },
    'DecisionTreeRegressor': {
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2']
    }
}

# Dictionary for storing results
results = {}

In [None]:
# load data
import joblib

if is_sampled:
    X_train, X_test, y_train, y_test = joblib.load(TRAIN_TEST_SPLIT_ELECTRIC_FILE_SAMPLED)
else:
    X_train, X_test, y_train, y_test = joblib.load(TRAIN_TEST_SPLIT_ELECTRIC_FILE)

In [112]:
# temporary reduce the size of the dataset for testing
# X_train = X_train[:1000]
# y_train = y_train[:1000]
# X_test = X_test[:1000]
# y_test = y_test[:1000]

In [113]:
# example prints to validate correct output
X_train.head(2)

Unnamed: 0,member_state_AT,member_state_BE,member_state_BG,member_state_CY,member_state_CZ,member_state_DE,member_state_DK,member_state_EE,member_state_ES,member_state_FI,member_state_FR,member_state_GR,member_state_HR,member_state_HU,member_state_IE,member_state_IS,member_state_IT,member_state_LT,member_state_LU,member_state_LV,member_state_MT,member_state_NL,member_state_NO,member_state_PL,member_state_PT,member_state_RO,member_state_SE,member_state_SI,member_state_SK,manufacturer_name_eu_AUDI AG,manufacturer_name_eu_BMW AG,manufacturer_name_eu_DACIA,manufacturer_name_eu_FIAT GROUP,manufacturer_name_eu_FORD WERKE GMBH,manufacturer_name_eu_HYUNDAI,manufacturer_name_eu_HYUNDAI CZECH,manufacturer_name_eu_KIA,manufacturer_name_eu_MERCEDES-BENZ AG,manufacturer_name_eu_NISSAN AUTOMOTIVE EUROPE,manufacturer_name_eu_POLESTAR,...,commercial_name_Q4 SPORTBACK 35 E-TRON,commercial_name_Q4 SPORTBACK 40 E-TRON,commercial_name_Q4 SPORTBACK 50 E-TRON,commercial_name_Q8 50 E-TRON,commercial_name_Q8 55 E-TRON,commercial_name_Q8 SPORTBACK 55 E-TRON,commercial_name_RS E-TRON GT,commercial_name_SOLTERRA,commercial_name_SOUL,commercial_name_SPRING,commercial_name_TAYCAN,commercial_name_TAYCAN 4,commercial_name_TAYCAN 4S,commercial_name_TAYCAN GTS,commercial_name_TOYOTA BZ4X,commercial_name_TWINGO,commercial_name_Taycan 4,commercial_name_U5,commercial_name_UP!,commercial_name_XC40,commercial_name_ZOE,commercial_name_e-C4,commercial_name_e-tron 55,commercial_name_i3,commercial_name_i3s,commercial_name_i4 M50,commercial_name_i4 eDrive40,commercial_name_iX xDrive40,commercial_name_iX xDrive50,commercial_name_iX1 xDrive30,commercial_name_iX3,commercial_name_other,category_of_vehicle_M1,category_of_vehicle_M1.1,category_of_vehicle_M1G,category_of_vehicle_N1,mass_vehicle,engine_power,year,electric_range
981250,-0.176391,-0.187304,-0.032097,-0.01393,-0.060469,-0.73656,-0.181131,-0.025524,-0.172781,-0.120979,-0.467806,-0.041225,-0.031943,-0.05928,-0.106155,-0.066425,-0.209106,-0.034854,-0.075747,-0.029082,-0.024335,-0.266454,3.450575,-0.083684,-0.13556,-0.093594,-0.276377,-0.047164,-0.036562,4.58849,-0.28974,-0.191901,-0.102712,-0.115912,-0.170118,-0.154105,-0.209517,-0.26196,-0.137336,-0.11134,...,-0.03184,-0.057638,-0.041145,-0.029698,-0.04194,-0.032251,-0.02822,-0.02358,-0.054813,-0.191901,-0.046847,-0.036471,-0.048714,-0.030248,-0.071014,-0.137485,-0.026035,-0.028044,-0.128761,-0.143604,-0.174379,-0.025395,-0.022799,-0.07493,-0.065317,-0.05269,-0.045087,-0.063123,-0.043104,-0.021916,-0.086004,-0.234216,0.017013,-0.001813,-0.016322,-0.004442,2.069688,-0.044017,-0.229607,-0.198688
3356637,-0.176391,-0.187304,-0.032097,-0.01393,-0.060469,1.357663,-0.181131,-0.025524,-0.172781,-0.120979,-0.467806,-0.041225,-0.031943,-0.05928,-0.106155,-0.066425,-0.209106,-0.034854,-0.075747,-0.029082,-0.024335,-0.266454,-0.289807,-0.083684,-0.13556,-0.093594,-0.276377,-0.047164,-0.036562,-0.217937,-0.28974,-0.191901,-0.102712,-0.115912,-0.170118,-0.154105,-0.209517,-0.26196,-0.137336,-0.11134,...,-0.03184,-0.057638,-0.041145,-0.029698,-0.04194,-0.032251,-0.02822,-0.02358,-0.054813,-0.191901,-0.046847,-0.036471,-0.048714,-0.030248,-0.071014,-0.137485,-0.026035,-0.028044,-0.128761,-0.143604,-0.174379,-0.025395,-0.022799,-0.07493,-0.065317,-0.05269,-0.045087,-0.063123,-0.043104,-0.021916,-0.086004,-0.234216,0.017013,-0.001813,-0.016322,-0.004442,0.28425,0.065082,1.023576,1.333491


Execute hyperparameter search methods.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score


# Function to compare the hyperparameter search methods
def compare_search_methods(model_name, model, param_grid):
    search_methods = {
    # 'GridSearchCV': GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy'),
    'RandomizedSearchCV': RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=5, cv=5, random_state=42),
    # 'BayesSearchCV': BayesSearchCV(estimator=model, search_spaces=param_grid, n_iter=5, cv=5, scoring='accuracy', random_state=42)
    }

    results[model_name] = {}

    for search_name, search in search_methods.items():
        print(f"Perform {search_name} for model {model_name}")
        # Perform hyperparameter search
        search.fit(X_train, y_train)

        # Best score and hyperparameters found
        best_params = search.best_params_
        best_score = search.best_score_

        # Test on test data
        y_pred = search.predict(X_test)
        test_r2_score = r2_score(y_true=y_test, y_pred=y_pred)

        # Store results
        results[model_name][search_name] = {
            'best_params': best_params,
            'best_cv_score': best_score,
            'test_r2_score': test_r2_score
        }

In [115]:
models

{'ExtraTreeRegressor': ExtraTreeRegressor(),
 'ExtraTreesRegressor': ExtraTreesRegressor(),
 'RandomForestRegressor': RandomForestRegressor(),
 'DecisionTreeRegressor': DecisionTreeRegressor()}

In [116]:
# Run comparison for each model
for model_name, model in models.items():
    compare_search_methods(model_name, model, param_grids[model_name])

Evaluate results

In [117]:
# Print results
for model_name, model_results in results.items():
    print(f"Model: {model_name}")
    for search_name, search_results in model_results.items():
        print(f"  {search_name}:")
        print(f"    Best Params: {search_results['best_params']}")
        print(f"    Best CV Score: {search_results['best_cv_score']:2f}")
        print(f"    Test R Squared score: {search_results['test_r2_score']:.2f}")
    print("\n")

Model: ExtraTreeRegressor
  RandomizedSearchCV:
    Best Params: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 20}
    Best CV Score: 0.966285
    Test R Squared score: 0.96


Model: ExtraTreesRegressor
  RandomizedSearchCV:
    Best Params: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 10}
    Best CV Score: 0.894334
    Test R Squared score: 0.89


Model: RandomForestRegressor
  RandomizedSearchCV:
    Best Params: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 10}
    Best CV Score: 0.947039
    Test R Squared score: 0.70


Model: DecisionTreeRegressor
  RandomizedSearchCV:
    Best Params: {'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 20}
    Best CV Score: 0.978108
    Test R Squared score: 0.73




## Cross-Validation

> Analyze if models generalize well

In [118]:
# Insert your code here
from sklearn.model_selection import KFold

# Define models to test
models_cv = {
'ExtraTreeRegressor':  sklearn.tree._classes.ExtraTreeRegressor(**results["ExtraTreeRegressor"]["RandomizedSearchCV"]["best_params"]),
'ExtraTreesRegressor':  sklearn.ensemble._forest.ExtraTreesRegressor(**results["ExtraTreesRegressor"]["RandomizedSearchCV"]["best_params"]),
'RandomForestRegressor':  sklearn.ensemble._forest.RandomForestRegressor(**results["RandomForestRegressor"]["RandomizedSearchCV"]["best_params"]),
'DecisionTreeRegressor':  sklearn.tree._classes.DecisionTreeRegressor(**results["DecisionTreeRegressor"]["RandomizedSearchCV"]["best_params"]),
}

# Define the number of folds for KFold
n_splits = 5  # n-fold cross-validation
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [None]:
import pandas as pd
# merge X_train and and X_test
# X = pd.concat([X_train, X_test])
# y = pd.concat([y_train, y_test])

# TODO ask Romain if this is the correct approach (in exercise we used X before train_test_split)
# do KFold cross-validation only on train samples.
# This approach ensures that the test data remains a true holdout set, 
# providing an unbiased estimate of the model's performance on unseen data.
X = X_train
y = y_train

In [122]:
import numpy as np
# Store results for each model
results_cv = {}

# Loop through each model
for model_name, model in models_cv.items():
    fold_r2_scores = []  # Store r2 results for each fold

    # Perform KFold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model
        model.fit(X_train, y_train)

        # Predict on test set
        y_pred = model.predict(X_test)

        # Calculate r2
        r2 = r2_score(y_test, y_pred)
        fold_r2_scores.append(r2)

    # calc avg and variance of r2 scores
    avg_r2 = np.mean(fold_r2_scores)
    var_r2 = np.var(fold_r2_scores, ddof=1)  # ddof=1 for unbiased estimation (sample)

    results_cv[model_name] = {
        "mean_r2": avg_r2,
        "variance_r2": var_r2
    }

Display the average precision and the variance of the precision obtained for each model.

In [123]:
# Print results
for model_name, metrics in results_cv.items():
    print(f"Model : {model_name}")
    print(f"  - Average R squared : {metrics['mean_r2']:.2f}")
    print(f"  - R squared variance : {metrics['variance_r2']:.2f}")

Model : ExtraTreeRegressor
  - Average R squared : 0.95
  - R squared variance : 0.00
Model : ExtraTreesRegressor
  - Average R squared : 0.89
  - R squared variance : 0.00
Model : RandomForestRegressor
  - Average R squared : 0.95
  - R squared variance : 0.00
Model : DecisionTreeRegressor
  - Average R squared : 0.98
  - R squared variance : 0.00
