# Model Optimization

Example notebook for exploring model optimization with scikit-learn, using GridSearchCV for hyperparameter tuning.

Ricardo Almeida, based on [GridSearchCV Example](https://www.kaggle.com/code/dskagglemt/gridsearchcv-example) by Piotr Płoński

In [None]:
import pandas as pd
import numpy as np

In [None]:
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Seed shared by every stochastic step so that results are reproducible.
RANDOM_SEED = 7657

# Fraction of the samples held out as the test split.
TEST_SIZE = 0.05

Prepare the data

In [None]:
# Load the iris dataset shipped with scikit-learn and keep handy aliases for
# the feature matrix (X) and the class labels (y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [None]:
# Dimensions of the feature matrix X.
X.shape

In [None]:
# Hold out a TEST_SIZE fraction of the samples; the fixed seed makes the
# split reproducible. X and y are the iris.data / iris.target aliases.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

In [None]:
# Hand-picked hyperparameters for the baseline (untuned) decision tree.
MAX_DEPTH = 6          # passed as max_depth
MIN_SAMPLES_LEAF = 8   # passed as min_samples_leaf

Fit the model

In [None]:
# Baseline decision tree built from the hand-picked hyperparameters above.
clf = DecisionTreeClassifier(
    max_depth=MAX_DEPTH,
    min_samples_leaf=MIN_SAMPLES_LEAF,
    random_state=RANDOM_SEED,
)

# Train on the training split only; the fitted estimator is kept as `model`.
model = clf.fit(X_train, y_train)

Check performance

In [None]:
# Training accuracy alone is optimistic because the tree has already seen these
# rows, so report the accuracy on the held-out test split next to it.
print("Accuracy on train set:  {:.1f}%".format(model.score(X_train, y_train)*100))
print("Accuracy on test set:   {:.1f}%".format(model.score(X_test, y_test)*100))

## Hyperparameter tuning with GridSearchCV

For more details on GridSearchCV, refer to the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html?highlight=gridsearchcv#sklearn.model_selection.GridSearchCV).

In [None]:
# Search space for the grid search: every combination of the values below is
# evaluated. The previous `[...]` placeholders were lists containing Ellipsis,
# which is not a valid value for any of these hyperparameters, so the search
# could not run. The values are starting points and can be tuned freely.
params = {
    # Maximum depth of the tree; None lets the tree grow without a depth limit.
    'max_depth': [2, 3, 4, 5, 6, None],
    # Number of features considered at each split (iris has 4 features).
    'max_features': [1, 2, 3, 4],
    # Upper bound on the number of leaves; None means no limit.
    'max_leaf_nodes': [3, 5, 8, 12, None],
    }

In [None]:
# Base estimator handed to the grid search; only the random seed is fixed here.
# NOTE(review): this rebinds `tree`, shadowing the `sklearn.tree` module
# imported at the top of the notebook.
tree = DecisionTreeClassifier(random_state=RANDOM_SEED)

In [None]:
# Grid search over `params` for the base tree, scored by accuracy with
# 3-fold cross-validation, logging progress and running on all processors.
clf = GridSearchCV(
    tree,
    params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=True,
)

In [None]:
# cv --> Determines the cross-validation splitting strategy
# verbose --> Controls the verbosity. "Verbose" is a general programming term for producing lots of logging output. You can think of it as asking the program to "tell me everything about what you are doing all the time".
# n_jobs --> Number of jobs to run in parallel. `-1` means using all processors. 

In [None]:
# Run the search on the training split only, so that X_test / y_test remain
# unseen data for evaluating the selected model. Fitting on the full (X, y)
# would leak the held-out rows into model selection.
clf.fit(X_train, y_train)

In [None]:
# Estimator chosen by the search, i.e. the one that gave the highest score
# (or the smallest loss, if specified) on the left-out data.
best_model = clf.best_estimator_

In [None]:
# Display the estimator selected by the search.
best_model

In [None]:
# Accuracy of the best model found by the search (best_estimator_.score).
# Both splits are reported so the tuned model can be compared with the baseline.
# The test-set figure is an unbiased estimate only if the search was fitted
# without the X_test rows.
print("Accuracy on train set:  {:.1f}%".format(best_model.score(X_train, y_train)*100))
print("Accuracy on test set:   {:.1f}%".format(best_model.score(X_test, y_test)*100))