# Hyperparameter tuning to get model's best configuration

## Load data

In [35]:
import pandas as pd

path = '../../../data/default_credit_card/output/simplified_features.csv'
df = pd.read_csv(path)
df

Unnamed: 0,Gender,Age,Married,YearsEmployed,Income,Approved
0,1,30,1,1.25,0,1
1,0,58,1,3.04,560,1
...,...,...,...,...,...,...
688,1,17,1,0.04,750,0
689,1,35,1,8.29,0,0


## Feature selection

In [36]:
target = 'Approved'

y = df[target]
X = df.drop(columns=target)

## Train test split

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

## Grid search

### Import model

In [38]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

### See hyperparameters

From the model.

In [39]:
model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

### Hyperparameters grid

To define the possible configurations the model will try in order to find the best.

> There isn't an exact science to define the values to try on the grid. It depends on the model you choose and the dataset you have. If you ask ChatGPT to propose a `param_grid` given a model and a dataset, it will give you a good starting point, based on the literature and the model's documentation.

In [40]:
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_leaf': [50, 100, 200, 500],
    'criterion': ['gini', 'entropy']
}

### Import grid search

In [41]:
from sklearn.model_selection import GridSearchCV
model_grid = GridSearchCV(model, param_grid, cv=3, verbose=1)

### Fit best model

In [42]:
model_grid.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [43]:
model_grid.best_estimator_

### Evaluation

In [44]:
df_results = pd.DataFrame(model_grid.cv_results_)

(df_results
 .loc[:, ['params', 'mean_test_score', 'rank_test_score']]
 .sort_values(by='rank_test_score')
 .style
 )

Unnamed: 0,params,mean_test_score,rank_test_score
20,"{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 50}",0.724638,1
16,"{'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 50}",0.724638,1
28,"{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 50}",0.724638,1
24,"{'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 50}",0.724638,1
12,"{'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 50}",0.722567,5
8,"{'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 50}",0.722567,5
0,"{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 50}",0.722567,5
4,"{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 50}",0.722567,5
5,"{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 100}",0.681159,9
21,"{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 100}",0.681159,9


In [45]:
model_grid.best_score_

0.7246376811594203

In [46]:
model_grid.score(X_test, y_test)

0.7053140096618358

## Multiple models with grid search

### Preprocess data

Some models require different preprocessing steps because their algorithms optimize the math equation numbers by calculating distances between data points.

### Models and param grids

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'max_depth': [3, 5, 7, 9],
        'min_samples_leaf': [50, 100, 200, 500],
        'criterion': ['gini', 'entropy']
    }),
    'Random Forest': (RandomForestClassifier(), {
        'max_depth': [3, 5, 7, 9],
        'min_samples_leaf': [50, 100, 200, 500],
        'criterion': ['gini', 'entropy']
    })
}

### Perform GridSearchCV for each model

In [57]:
results = {}

for name, (model, param_grid) in models.items():
    print(f"Performing grid search for {name}...")
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Store results
    results[name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'test_score': grid_search.score(X_test, y_test)
    }
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
    print(f"Test set score: {grid_search.score(X_test, y_test):.4f}")
    print()

Performing grid search for Logistic Regression...
Best parameters: {'C': 10}
Best cross-validation score: 0.7226
Test set score: 0.7101

Performing grid search for Decision Tree...
Best parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 50}
Best cross-validation score: 0.7246
Test set score: 0.7053

Performing grid search for Random Forest...
Best parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 50}
Best cross-validation score: 0.7640
Test set score: 0.6908



### Evaluation

In [58]:
pd.DataFrame(results).T.sort_values(by='test_score', ascending=False)

Unnamed: 0,best_params,best_score,test_score
Logistic Regression,{'C': 10},0.722567,0.710145
Decision Tree,"{'criterion': 'entropy', 'max_depth': 3, 'min_...",0.724638,0.705314
Random Forest,"{'criterion': 'entropy', 'max_depth': 5, 'min_...",0.763975,0.690821


### Retrain with best model

In [59]:
best_model_name = max(results, key=lambda x: results[x]['test_score'])
best_model_name

'Logistic Regression'

In [60]:
best_model_params = results[best_model_name]['best_params']
best_model_params

{'C': 10}

In [61]:
model = models[best_model_name][0]
model.set_params(**best_model_params)

In [62]:
model.fit(X_train, y_train)

In [63]:
model.score(X_train, y_train)

0.7287784679089027

In [64]:
model.score(X_test, y_test)

0.7101449275362319