# Tuning of hyperparameters through Grid Search and Cross Validation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

Steps:
1. Create an initial training/test split
2. Do cross validation on the training data for model/parameter selection
3. Save the hold-out test set for final model evaluation

1. Create an initial training/test split: This is the same for all the teams. You can't change this code.

In [None]:
# Import data: read the CSV, using its "id" column as the row index.
dataset = pd.read_csv("assets/dataset.csv",index_col="id")
# Columns used as model inputs; "engagement" is the prediction target.
x_names = ["title_word_count","document_entropy","freshness","easiness","fraction_stopword_presence","speaker_speed","silent_period_rate"]
X = dataset[x_names]
y = dataset["engagement"]

# 80/20 train/test split with a fixed random_state so every run (and every
# team) gets the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,train_size=0.80,test_size=0.20)

# Transformation (min max scaling)
# The scaler is fitted on the training split only; the test split is then
# transformed with that same fitted scaler.
min_max_scaler = MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)

2. Do cross validation on the training data for model/parameter selection: The team can decide:  
- To change the grid_values of its model (add more hyperparameters and/or more values) 
- To work with (X_train_minmax, X_test_minmax) or without (X_train, X_test) min-max scaling.

What is provided is just an example. Your goal is to maximize the area under the ROC curve in the hold-out test set.

Logistic regression

In [None]:
# Baseline logistic regression: grid-search the penalty type and C on the
# min-max-scaled training data, selecting candidates by cross-validated ROC AUC.
lr = LogisticRegression(solver='liblinear')
grid_values = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 15, 20, 25, 30],
}
grid_lr_auc = GridSearchCV(
    lr,
    param_grid=grid_values,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_lr_auc.fit(X_train_minmax, y_train)

# Per-candidate results, one array per line: the C and penalty tried, the mean
# cross-validation score, and the resulting rank.
print(
    *(
        grid_lr_auc.cv_results_[column]
        for column in ('param_C', 'param_penalty', 'mean_test_score', 'rank_test_score')
    ),
    sep="\n",
)

# Winning configuration and its cross-validation score.
print(
    "Best logistic regression",
    grid_lr_auc.best_params_,
    grid_lr_auc.best_score_,
    sep="\n",
)

# Score the hold-out test set with the best estimator and compute the ROC AUC
# from its decision-function values.
y_lr_df = grid_lr_auc.best_estimator_.decision_function(X_test_minmax)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_lr_df)
roc_auc_lr = auc(fpr_lr, tpr_lr)
print("AUC in the hold out dataset of the cross validation: ", roc_auc_lr)


Decision tree

In [None]:
# Baseline decision tree: grid-search depth and minimum leaf size on the
# unscaled training data, selecting candidates by cross-validated ROC AUC.
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=8, random_state=0)
grid_values = {
    'max_depth': list(range(2, 10)),          # 2, 3, ..., 9
    'min_samples_leaf': list(range(5, 20)),   # 5, 6, ..., 19
}
grid_dt_auc = GridSearchCV(
    dt,
    param_grid=grid_values,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_dt_auc.fit(X_train, y_train)

# Winning configuration and its cross-validation score.
print("Best Decision Tree", grid_dt_auc.best_params_, grid_dt_auc.best_score_, sep="\n")

# Hold-out evaluation: column 1 of predict_proba is used as the ROC score.
y_dt_df = grid_dt_auc.best_estimator_.predict_proba(X_test)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_dt_df[:, 1])
roc_auc_dt = auc(fpr_dt, tpr_dt)
print("AUC in the hold out dataset of the cross validation: ", roc_auc_dt)

Multilayer perceptron neural network

In [None]:
# Baseline multilayer perceptron with two hidden layers of 5 units each:
# grid-search alpha and the activation function on the min-max-scaled training
# data, selecting candidates by cross-validated ROC AUC.
mlp = MLPClassifier(hidden_layer_sizes=[5, 5], random_state=0)
grid_values = {
    # 0.05, 0.10, ..., 0.95 (rounded to two decimals)
    'alpha': [round(step * 0.05, 2) for step in range(1, 20)],
    'activation': ['logistic', 'tanh', 'relu'],
}
grid_mlp_auc = GridSearchCV(
    mlp,
    param_grid=grid_values,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_mlp_auc.fit(X_train_minmax, y_train)

# Winning configuration and its cross-validation score.
print("Best MLP", grid_mlp_auc.best_params_, grid_mlp_auc.best_score_, sep="\n")

# Hold-out evaluation: column 1 of predict_proba is used as the ROC score.
y_mlp_df = grid_mlp_auc.best_estimator_.predict_proba(X_test_minmax)
fpr_mlp, tpr_mlp, _ = roc_curve(y_test, y_mlp_df[:, 1])
roc_auc_mlp = auc(fpr_mlp, tpr_mlp)
print("AUC in the hold out dataset of the cross validation: ", roc_auc_mlp)

3. Save the hold-out test set for final model evaluation: 
- Team with the highest AUC: 10
- Team with the second highest AUC: 9
- Team with the third highest AUC: 8


4. Upload your code to the forum.

In [None]:
# Jorge Adrián
# MLP search that also varies the network architecture, with a raised
# iteration cap (max_iter=500) and an explicit 5-fold cross-validation.
mlp = MLPClassifier(random_state=0, max_iter=500)

grid_values = {
    'hidden_layer_sizes': [
        (5, 5),
        (10,),
        (10, 10),
        (20, 10),
        (20, 20),
        (30, 20, 10),
    ],
    # 0.05, 0.10, ..., 0.95 (rounded to two decimals)
    'alpha': [round(step * 0.05, 2) for step in range(1, 20)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
}

grid_mlp_auc = GridSearchCV(
    mlp,
    param_grid=grid_values,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
)
grid_mlp_auc.fit(X_train_minmax, y_train)

# Report the winning configuration and its cross-validation score.
print("Best MLP parameters:", grid_mlp_auc.best_params_)
print("Best AUC score:", grid_mlp_auc.best_score_)

# Hold-out evaluation: column 1 of predict_proba is used as the ROC score.
y_mlp_df = grid_mlp_auc.best_estimator_.predict_proba(X_test_minmax)
fpr_mlp, tpr_mlp, _ = roc_curve(y_test, y_mlp_df[:, 1])
roc_auc_mlp = auc(fpr_mlp, tpr_mlp)

print("AUC in the hold-out dataset:", roc_auc_mlp)

In [None]:
# María Luisa
# Class-weighted decision tree with depth and leaf size pinned to single
# values; the search varies only the split threshold and the split criterion,
# using 10-fold cross-validation on the unscaled training data.
dt = DecisionTreeClassifier(random_state=10, class_weight='balanced')

# NOTE(review): with min_samples_leaf fixed at 19, sweeping min_samples_split
# over 2..20 presumably cannot change the fitted tree (a split needs at least
# 2 * 19 samples to leave two valid leaves), and 'entropy' / 'log_loss' appear
# to be the same criterion in scikit-learn - confirm, and consider trimming
# the grid.
grid_values = {
    'max_depth': [5],
    'min_samples_leaf': [19],
    'min_samples_split': [*range(2, 21)],
    'criterion': ['gini', 'entropy', 'log_loss'],
}
grid_dt_auc = GridSearchCV(dt, param_grid=grid_values, scoring='roc_auc', n_jobs=-1, cv=10)
grid_dt_auc.fit(X_train, y_train)

# Report the winning configuration and its cross-validation score.
print("Best Decision Tree parameters from cross-validation:")
print(grid_dt_auc.best_params_)
print("Best cross-validation AUC score on training data:", grid_dt_auc.best_score_)

# Hold-out evaluation with the best estimator found by the search; column 1 of
# predict_proba is used as the ROC score.
best_dt = grid_dt_auc.best_estimator_
y_dt_df = best_dt.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_dt_df)
roc_auc_dt = auc(fpr_dt, tpr_dt)
print("AUC on the hold-out test set: ", roc_auc_dt)

In [None]:
# Guillermo
# Logistic regression search over penalty, solver, tolerance and C on the
# unscaled training data, selecting candidates by cross-validated ROC AUC.
lr = LogisticRegression()

# Settings swept for every solver/penalty pairing.
lr_shared_grid = {
    'tol': [1e-4, 1e-5, 1e-6],  # lower tolerance values for higher precision
    'C': [0.001, 0.01, 0.1, 0.5, 1, 5],
}

# A single flat dict would cross every penalty with every solver and every
# l1_ratio (486 candidates). Many of those pairings are rejected by
# scikit-learn ('l1' or 'elasticnet' with 'newton-cg', 'elasticnet' with
# 'liblinear'), so their fits fail, and l1_ratio is only applicable to the
# 'elasticnet' penalty, so sweeping it elsewhere just repeats the same model.
# Listing one sub-grid per valid pairing keeps only meaningful candidates (144).
grid_values = [
    # liblinear and saga both accept the l1 and l2 penalties.
    {**lr_shared_grid, 'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2']},
    # newton-cg only accepts the l2 penalty from this set.
    {**lr_shared_grid, 'solver': ['newton-cg'], 'penalty': ['l2']},
    # elasticnet is only available with saga and is the only penalty that
    # uses l1_ratio.
    {
        **lr_shared_grid,
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.1, 0.5, 0.9],
    },
]

grid_lr_auc = GridSearchCV(lr, param_grid=grid_values, scoring='roc_auc', n_jobs=-1)
grid_lr_auc.fit(X_train, y_train)

# Report the winning configuration and its cross-validation score.
print("Best logistic regression")
print(grid_lr_auc.best_params_)
print(grid_lr_auc.best_score_)

# Hold-out evaluation: ROC AUC computed from the best estimator's
# decision-function values on the test split.
y_lr_df = grid_lr_auc.best_estimator_.decision_function(X_test)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_lr_df)
roc_auc_lr = auc(fpr_lr, tpr_lr)
print("AUC in the hold out dataset of the cross validation: ", roc_auc_lr)

###
# Output recorded from a run of the earlier flat grid, which also swept
# l1_ratio for the 'l2' penalty (where it is not applicable):
# Best logistic regression
# {'C': 0.5, 'l1_ratio': 0.1, 'penalty': 'l2', 'solver': 'newton-cg', 'tol': 1e-06}
# 0.8466227812321454
# AUC in the hold out dataset of the cross validation:  0.8304825685721611