<a href="https://colab.research.google.com/github/kessingtonosazee/GCP_Project_1/blob/master/mlc_2324_w5_cv_grid_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Cross Validation and Grid Search

* Machine Learning Concepts (MLC)
* Week 5, 2023/24
* Luciano Gerber

## Preamble: Importing and Configuring Essential Packages


In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(
    { "figure.figsize": (6, 4) },
    style='ticks',
    color_codes=True,
    font_scale=0.8
)
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import sys
if 'google.colab' in sys.modules:
    !pip install -q dtreeviz
import dtreeviz

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/91.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/91.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m61.4/91.8 kB[0m [31m806.0 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.8/91.8 kB[0m [31m998.4 kB/s[0m eta [36m0:00:00[0m
[?25h

## Use Case: Diabetes (PIMA) Dataset

In [None]:
# PIMA diabetes data, read straight from the module's GitHub repository
DIABETES_CSV_URL = 'https://raw.githubusercontent.com/gerberl/6G7V0015-2324/main/datasets/diabetes.csv'
dbt = pd.read_csv(DIABETES_CSV_URL)

In [None]:
# peek at the first record to see the columns and the kind of values they hold
dbt.head(1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1


## Pre-Processing

In [None]:
# just to get things going, a broad-swipe-fillna: in the Glucose..Age columns
# a zero is treated as a missing reading and replaced by the column mean
glc_age_cols = dbt.loc[:, 'Glucose':'Age'].columns
# mask the zeros first, so the placeholders do not drag the column means down
dbt_glc_age = dbt[glc_age_cols].replace(0, np.nan)
# assign whole columns (not `.loc[:, ...] = ...`) so that integer columns are
# replaced by float ones instead of having fractional means forced into them
# NOTE(review): the means use every row, i.e. they are computed before the
# train/test split - fine for a first pass, but it leaks test information.
dbt[glc_age_cols] = dbt_glc_age.fillna(dbt_glc_age.mean())

In [None]:
# how many zeros remain in each column after the replacement
dbt.eq(0).sum()

Pregnancies                 111
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [None]:
# target: the Outcome column; features: every other column
y = dbt['Outcome']
X = dbt.drop(columns='Outcome')

In [None]:
# hold out a test set; the fixed seed keeps the partition the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=20231020
)

## A First Decision Tree for Illustration

In [None]:
# Seed the tree too (same seed as the split): without `random_state` the
# fitted tree, and every score derived from it, can change from run to run.
dt = DecisionTreeClassifier(max_depth=5, random_state=20231020)
dt.fit(X_train, y_train)

In [None]:
# balanced accuracy on the held-out test set
y_test_pred = dt.predict(X_test)
balanced_accuracy_score(y_test, y_test_pred)

0.6883366313583983

In [None]:
# same metric on the data the tree was fitted on, to compare against the
# test figure
y_train_pred = dt.predict(X_train)
balanced_accuracy_score(y_train, y_train_pred)

0.8194085423837778

<!-- ![5-Fold Cross Validation](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png) -->

[sklearn's documentation on cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html): many options (we'll discuss a couple in MLC/AML).

## Cross-Validation with K-Fold

<!-- ![K-Fold](https://github.com/gerberl/6G7V0015-2324/raw/main/figures/kfold_cv.png)

<img src="https://github.com/gerberl/6G7V0015-2324/raw/main/figures/kfold_cv.png" style="width:800px;height:600px;"/> -->

<img src="https://github.com/gerberl/6G7V0015-2324/raw/main/figures/kfold_cv.png" width="1600" height="200"/>



In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the tree configuration over the full dataset
# NOTE(review): no `scoring=` is passed, so these are presumably plain accuracy
# (the classifier default) rather than the balanced accuracy used earlier -
# confirm that is intended before comparing the numbers.
scores = cross_val_score(estimator=dt, X=X, y=y, cv=5)

In [None]:
# one score per fold
scores

array([0.74675325, 0.68831169, 0.75974026, 0.79084967, 0.75163399])

In [None]:
# summarise the folds: average score and its spread
np.mean(scores), np.std(scores)

(0.7474577709871827, 0.03331640117924166)

In [None]:
# same tree configuration, finer partition: 10 folds instead of 5
# NOTE(review): no `scoring=` is passed, so this is presumably plain accuracy.
scores = cross_val_score(estimator=dt, X=X, y=y, cv=10)
np.mean(scores), np.std(scores)

(0.7213773069036227, 0.032823110450359565)

In [None]:
# A shallower tree for comparison. It is seeded (same seed as the split) so
# that repeated runs build the same tree and report the same scores.
dt = DecisionTreeClassifier(max_depth=3, random_state=20231020)
dt.fit(X_train, y_train)

In [None]:
# 10-fold cross-validation again, now for the max_depth=3 tree
# NOTE(review): no `scoring=` is passed, so this is presumably plain accuracy.
scores = cross_val_score(estimator=dt, X=X, y=y, cv=10)
np.mean(scores), np.std(scores)

(0.7343814080656187, 0.03546347617090141)

## Scores/Losses
    
- score: the higher, the better. A "got-things-right" aggregate. Examples are balanced accuracy (classification) and the coefficient of determination $R^2$, i.e., the proportion of variance explained (regression).

- loss: the lower, the better. A "got-things-wrong" aggregate. An example is mean absolute error (regression).

## Hyper-Parameter Search with Grid Search

* https://scikit-learn.org/stable/modules/grid_search.html#grid-search
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV

In [None]:
# candidate values for each hyper-parameter; the grid is their cross product
param_grid = dict(
    max_depth=[1, 3, 4, 8],
    min_samples_leaf=[10, 20, 30],
)

In [None]:
from sklearn.model_selection import ParameterGrid
# Expand the grid above into every combination of the two hyper-parameters:
# 4 max_depth values x 3 min_samples_leaf values = 12 configurations.
[*ParameterGrid(param_grid)]

[{'max_depth': 1, 'min_samples_leaf': 10},
 {'max_depth': 1, 'min_samples_leaf': 20},
 {'max_depth': 1, 'min_samples_leaf': 30},
 {'max_depth': 3, 'min_samples_leaf': 10},
 {'max_depth': 3, 'min_samples_leaf': 20},
 {'max_depth': 3, 'min_samples_leaf': 30},
 {'max_depth': 4, 'min_samples_leaf': 10},
 {'max_depth': 4, 'min_samples_leaf': 20},
 {'max_depth': 4, 'min_samples_leaf': 30},
 {'max_depth': 8, 'min_samples_leaf': 10},
 {'max_depth': 8, 'min_samples_leaf': 20},
 {'max_depth': 8, 'min_samples_leaf': 30}]

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# GridSearchCV follows the estimator interface: instantiate and configure it
# here, then call `fit` on the data - the search itself only runs at `fit`.
# It is given the model and the parameter grid; train scores are switched on
# as well, since comparing them with the test scores helps to spot
# over/underfitting.
clf = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    return_train_score=True,
)

In [None]:
# run the grid search on the training data only; the test split stays untouched
# `gs_results` is the fitted search: its `cv_results_` (a dict that is turned
# into a DataFrame below) and `best_params_` are what the following cells read
gs_results = clf.fit(X_train, y_train)

In [None]:
# tabulate the search results: one row per configuration in the grid
gs_df = pd.DataFrame.from_dict(gs_results.cv_results_)

In [None]:
# what the search recorded: timings, parameter values, and per-split,
# mean and std train/test scores
gs_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_min_samples_leaf', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [None]:
# a single row, i.e. everything recorded for one configuration
gs_df.head(1)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.003339,0.00098,0.002089,0.000174,1,10,"{'max_depth': 1, 'min_samples_leaf': 10}",0.732759,0.643478,0.765217,...,0.718726,0.040415,10,0.75,0.759219,0.72885,0.737527,0.739696,0.743059,0.010518


In [None]:
# keep the parameter values and the aggregate train/test scores,
# best-ranked configuration first
summary_cols = [
    'param_max_depth', 'param_min_samples_leaf',
    'mean_train_score', 'std_train_score',
    'mean_test_score', 'std_test_score', 'rank_test_score'
]
gs_df.loc[:, summary_cols].sort_values(by='rank_test_score')

Unnamed: 0,param_max_depth,param_min_samples_leaf,mean_train_score,std_train_score,mean_test_score,std_test_score,rank_test_score
6,4,10,0.803814,0.010784,0.767346,0.013014,1
8,4,30,0.795137,0.004105,0.765622,0.036072,2
11,8,30,0.79905,0.006935,0.765622,0.036072,2
7,4,20,0.801645,0.008952,0.762144,0.016235,4
10,8,20,0.81684,0.008537,0.758666,0.020287,5
9,8,10,0.844176,0.0113,0.755202,0.038289,6
5,3,30,0.773861,0.014848,0.741334,0.030674,7
4,3,20,0.773427,0.01473,0.739595,0.030502,8
3,3,10,0.771692,0.014768,0.73958,0.030634,9
0,1,10,0.743059,0.010518,0.718726,0.040415,10


In [None]:
# one can look at the winning configuration (in this case, the best mean test score)
# NOTE(review): the saved output of this cell (min_samples_leaf=30) disagrees
# with the ranking table above, where rank 1 is max_depth=4 with
# min_samples_leaf=10 - the two outputs look like they come from different
# runs; re-run the notebook top to bottom to bring them back in line.
gs_results.best_params_

{'max_depth': 4, 'min_samples_leaf': 30}

In [None]:
# ...and obtain the best model (the one fitted on that winning configuration)
# presumably refitted on all of X_train, as `refit` was left at its default
# when the search was created - confirm against the GridSearchCV docs
gs_results.best_estimator_

With access to the ranking, one can pick one or more preferred configurations based on quantitative and qualitative criteria (e.g., mean test score, size of tree, gap between train and test).