# Grid Search

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Simple Grid Search

In [None]:
# naive grid search implementation
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load iris and hold out a test set; random_state pins the shuffle.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0)

print(f"Size of training set: {X_train.shape[0]} size of test set: {X_test.shape[0]}")

# gamma and C are both searched over the same six values
candidate_values = [0.001, 0.01, 0.1, 1, 10, 100]

best_score = 0
for gamma in candidate_values:
    for C in candidate_values:
        # fit one SVC per (gamma, C) combination
        svm = SVC(gamma=gamma, C=C).fit(X_train, y_train)
        # NOTE: the test set is used to pick the parameters here, which is
        # what makes this search "naive"
        score = svm.score(X_test, y_test)
        # keep the combination only if it strictly beats the best so far
        if score > best_score:
            best_score, best_parameters = score, {'C': C, 'gamma': gamma}

print(f"Best score: {best_score:.2f}")
print(f"Best parameters: {best_parameters}")

## The Danger of Overfitting the Parameters and the Validation Set

In [None]:
from helpers.plot_cross_validation import plot_threefold_split

# Project helper defined outside this file; presumably draws the
# training / validation / test split used in this section — confirm.
plot_threefold_split()

In [None]:
from sklearn.svm import SVC

# split data into train+validation set and test set
# (same arguments and random_state as the very first split of this notebook)
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)

# split train+validation set into training and validation sets;
# models are fitted on the training part and compared on the validation part
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)

# report the size of each of the three parts
print(f"Size of training set: {X_train.shape[0]} size of validation set: {X_valid.shape[0]} size of test set: {X_test.shape[0]}\n")

In [None]:
# gamma and C are both searched over the same six values
candidate_values = [0.001, 0.01, 0.1, 1, 10, 100]

best_score = 0
for gamma in candidate_values:
    for C in candidate_values:
        # fit one SVC per (gamma, C) combination on the training part only
        svm = SVC(gamma=gamma, C=C).fit(X_train, y_train)
        # evaluate the SVC on the validation set (the test set stays untouched)
        score = svm.score(X_valid, y_valid)
        # keep the combination only if it strictly beats the best so far
        if score > best_score:
            best_score, best_parameters = score, {'C': C, 'gamma': gamma}

In [None]:
# Refit with the selected parameters on training + validation data combined,
# then score that model once on the test set.
svm = SVC(**best_parameters).fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)

# best_score is the validation-set score found during the search;
# test_score is the score of the refitted model on the test set
print(f"Best score on validation set: {best_score:.2f}")
print(f"Best parameters: {best_parameters}")
print(f"Test set score with best parameters: {test_score:.2f}")

## Grid Search with Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Manual grid search where every (gamma, C) combination is scored by the mean
# accuracy of a 5-fold cross-validation on the train+validation data.

# Reset the running best before searching. Without this, best_score still
# holds the value left over from the earlier validation-set search; if that
# value is higher than every cross-validation mean, no combination is ever
# accepted and best_parameters silently keeps the old result.
best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters,
        # train an SVC
        svm = SVC(gamma=gamma, C=C)
        # perform cross-validation
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # compute mean cross-validation accuracy
        score = np.mean(scores)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# rebuild a model on the combined training and validation set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)

In [None]:
from helpers.plot_grid_search import plot_cross_val_selection

# Project helper defined outside this file; presumably visualises how the
# parameters are selected from the cross-validation scores — confirm.
plot_cross_val_selection()
plt.show()

In [None]:
from helpers.plot_grid_search import plot_grid_search_overview

# Project helper defined outside this file; presumably draws an overview of
# the grid-search workflow — confirm.
plot_grid_search_overview()

In [None]:
# Search space handed to GridSearchCV below: each key names an SVC parameter
# and maps to the list of values to try (6 values of C x 6 values of gamma).
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

print(f"Parameter grid:\n{param_grid}")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Grid search over param_grid for an SVC, using 5-fold cross-validation (cv=5).
# Nothing is fitted yet; that happens when .fit() is called.
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

In [None]:
# Re-create the plain train/test split: the search below is fitted on the
# training part and the test part is only used for the final score.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

In [None]:
# Run the cross-validated search on the training data only.
grid_search.fit(X_train, y_train)

In [None]:
# Score the fitted search object on the test set, which was not passed to fit().
print(f"Test set score: {grid_search.score(X_test, y_test):.2f}")

In [None]:
# Selected parameter combination and its cross-validation score
# (a score from the search itself, not the test-set score).
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")

In [None]:
# Show the estimator stored by the search in best_estimator_.
print(f"Best estimator:\n{grid_search.best_estimator_}")

### Analyzing the result of cross-validation

In [None]:
# collect the search's cv_results_ in a DataFrame for easier inspection
results = pd.DataFrame(grid_search.cv_results_)

# show the first 5 rows
results.head()

In [None]:
# Arrange the 36 mean cross-validation scores as a 6 x 6 array to match the
# 6 x 6 parameter grid; the heatmap below labels rows as C and columns as gamma.
scores = np.array(results["mean_test_score"]).reshape(6, 6)

In [None]:
import seaborn as sns

# Heatmap of the mean cross-validation scores: one row per C value, one
# column per gamma value.
fig, ax = plt.subplots(figsize=(7, 5))

# The rows are reversed so that the largest C ends up in the top row
# (seaborn draws the first row of the data at the top of the plot).
sns.heatmap(scores[::-1], vmin=0, vmax=1, cmap="viridis", square=True, annot=True, ax=ax)

# axis labels
plt.xlabel('gamma')
plt.ylabel('C')

# xticks: one label per column, centred on the cell
xticks_labels = param_grid['gamma']
plt.xticks(np.arange(len(xticks_labels)) + .5, labels=xticks_labels)

# y ticks: the data rows were reversed above, so the labels have to be
# reversed too; otherwise every row is annotated with the wrong C value
yticks_labels = param_grid['C'][::-1]
plt.yticks(np.arange(len(yticks_labels)) + .5, labels=yticks_labels)

plt.show()

In [None]:
from helpers.tools import heatmap as heatmap_tools

# Compare three differently chosen search grids side by side, one heatmap of
# mean cross-validation scores per grid.
fig, axes = plt.subplots(1, 3, figsize=(13, 5))

# linear spacing for both parameters, log spacing for gamma only, and log
# spacing for both parameters with gamma restricted to 1e-7 .. 1e-2
param_grid_linear = {'C': np.linspace(1, 2, 6), 'gamma': np.linspace(1, 2, 6)}
param_grid_one_log = {'C': np.linspace(1, 2, 6), 'gamma': np.logspace(-3, 2, 6)}
param_grid_range = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-7, -2, 6)}


for param_grid, ax in zip([param_grid_linear, param_grid_one_log, param_grid_range], axes):
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    # take the array shape from the grid itself instead of hard-coding 6 x 6,
    # so changing the number of candidate values cannot break the reshape
    scores = grid_search.cv_results_['mean_test_score'].reshape(
        len(param_grid['C']), len(param_grid['gamma']))
    # plot the mean cross-validation scores
    scores_image = heatmap_tools(scores, xlabel='gamma',
                                 ylabel='C', xticklabels=param_grid['gamma'],
                                 yticklabels=param_grid['C'], cmap="viridis", ax=ax)

# NOTE(review): the colorbar is built from the last panel's image only;
# whether all three panels share one colour scale depends on heatmap_tools,
# which is defined outside this file — confirm.
plt.colorbar(scores_image, ax=axes.tolist())
plt.show()

### Search over spaces that are not grids

In [None]:
# A list of two grids instead of one: the rbf kernel is combined with every
# C and gamma value, while the linear kernel is combined with C only.
param_grid = [{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
            {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]

In [None]:
# display the list of grids defined above
param_grid

In [None]:
# Same 5-fold search as before, now over the list of grids (rbf and linear).
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# best_params_ includes the selected kernel, since 'kernel' is part of the grids
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")

In [None]:
# Put the search results in a DataFrame and display it transposed.
results = pd.DataFrame(grid_search.cv_results_)
results.T

### Using different cross-validation strategies with grid search

### Nested cross-validation

In [None]:
# inner loop: a 5-fold grid search that selects the SVC parameters
inner_search = GridSearchCV(SVC(), param_grid, cv=5)
# outer loop: 5-fold cross-validation of that whole search on the full iris data
scores = cross_val_score(inner_search, iris.data, iris.target, cv=5)

print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())

### Parallelizing cross-validation and grid search