## Grid Search CV: Visual Digits

### Preface

**Summary**
- KFold splits the baseline data into various train-test folds 
- Cross Validation (cross_val_score) calculates the score of a model/algorithm on each of the folds
- Grid Search calculates score of a model/algorithm for various parameters _and_ folds
- Grid Search is costly because it calculates a score for every combination of the supplied parameter values, on every fold
- Random Search is a cheaper alternative: it calculates the score for only a random sample of the parameter combinations, each still evaluated across the folds

**Acknowledgements**
- TBD 

### Initialization

**Packages**

In [1]:
import pandas as pkg_pandas
import math as pkg_math
import seaborn as pkg_seaborn
import warnings as pkg_warnings
from matplotlib import pyplot as pkg_plot
from sklearn import linear_model as pkg_linear_model
from sklearn import model_selection as pkg_model_selection
from sklearn import preprocessing as pkg_preprocessing
from sklearn import tree as pkg_tree
from sklearn import metrics as pkg_metrics
from sklearn import datasets as pkg_datasets
from sklearn import ensemble as pkg_ensemble
from sklearn import svm as pkg_svm
from sklearn import naive_bayes as pkg_naive_bayes


**Common**

In [2]:
# Render matplotlib figures inline, directly beneath the cell that creates them.
%matplotlib inline
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised while the
# search candidates below are being fitted - confirm that is intended.
pkg_warnings.filterwarnings('ignore')

**Load Data**

In [3]:
# Load the digits dataset that ships with scikit-learn.
dataset = pkg_datasets.load_digits()
# List the attributes exposed by the returned object.
dir(dataset)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [4]:
# Show the feature column names and the class labels carried by the dataset.
print("\nFeature Names = {}\nTarget Names = {}".format(dataset.feature_names, dataset.target_names))


Feature Names = ['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']
Target Names = [0 1 2 3 4 5 6 7 8 9]


In [5]:
# Assemble features and labels into one frame: a column per feature name,
# followed by a trailing 'target' column holding the label of each row.
dataset_df = pkg_pandas.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
).assign(target=dataset.target)
dataset_df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [6]:
# Name of the label column inside dataset_df.
output_column_name = 'target'
# Feature matrix: every column except the label, converted to a NumPy array.
baseline_inputs = dataset_df.drop(columns=output_column_name).to_numpy()
# Label vector, kept as the pandas column taken from dataset_df.
baseline_outputs = dataset_df[output_column_name]

### Process

**Common**

In [7]:
# A candidate's mean test score must be strictly greater than this value for
# the candidate to appear in a search summary.
min_score = 0.60
# Maximum number of candidate rows kept in a search summary.
max_results = 5

In [8]:
def retrieve_top_classifier_results(classifier, score_threshold=None, limit=None):
    """Summarise the best-scoring candidates of a fitted search object.

    Parameters
    ----------
    classifier : object
        Any object exposing a ``cv_results_`` mapping that contains at least
        the keys ``"params"``, ``"mean_test_score"`` and ``"rank_test_score"``.
    score_threshold : float, optional
        Only candidates whose mean test score is strictly greater than this
        value are kept. Defaults to the notebook-level ``min_score``.
    limit : int, optional
        Maximum number of rows to return. Defaults to the notebook-level
        ``max_results``.

    Returns
    -------
    pandas.DataFrame
        Columns ``params``, ``mean_test_score`` and ``rank_test_score``,
        ordered by descending mean test score, at most ``limit`` rows.
    """
    # Fall back to the notebook-wide settings only when the caller did not
    # supply explicit values, so existing single-argument calls are unchanged.
    if score_threshold is None:
        score_threshold = min_score
    if limit is None:
        limit = max_results

    summary_columns = ["params", "mean_test_score", "rank_test_score"]
    results_df = pkg_pandas.DataFrame(classifier.cv_results_)[summary_columns]
    # Sort into a new frame rather than mutating a derived frame in place.
    results_df = results_df.sort_values(by="mean_test_score", ascending=False)
    # The comparison is False for NaN, so candidates without a score are dropped.
    results_df = results_df[results_df["mean_test_score"] > score_threshold]
    return results_df.head(limit)

In [9]:
def perform_grid_search(model, model_params, X_baseline, y_baseline, num_folds=5):
    """Fit a GridSearchCV over ``model_params`` and summarise its top candidates.

    Parameters
    ----------
    model : estimator
        Estimator instance handed to ``GridSearchCV`` as ``estimator``.
    model_params : dict or list of dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    X_baseline, y_baseline
        Inputs and outputs the search is fitted on.
    num_folds : int, optional
        Value passed as ``cv`` to ``GridSearchCV`` (default 5, the value that
        was previously hard-coded).

    Returns
    -------
    tuple
        ``(classifier, results_df)``: the fitted search object and the frame
        produced by ``retrieve_top_classifier_results`` for it.
    """
    classifier = pkg_model_selection.GridSearchCV(
        estimator=model,
        param_grid=model_params,
        cv=num_folds,
        return_train_score=False,
    )
    classifier.fit(X=X_baseline, y=y_baseline)
    results_df = retrieve_top_classifier_results(classifier)
    return classifier, results_df
    

In [10]:
def perform_random_search(model, model_params, X_baseline, y_baseline, num_iterations,
                          num_folds=5, random_state=None):
    """Fit a RandomizedSearchCV over ``model_params`` and summarise its top candidates.

    Parameters
    ----------
    model : estimator
        Estimator instance handed to ``RandomizedSearchCV`` as ``estimator``.
    model_params : dict or list of dict
        Handed to ``RandomizedSearchCV`` as ``param_distributions``.
    X_baseline, y_baseline
        Inputs and outputs the search is fitted on.
    num_iterations : int
        Number of parameter settings to sample (``n_iter``).
    num_folds : int, optional
        Value passed as ``cv`` (default 5, the value that was previously
        hard-coded).
    random_state : int or None, optional
        Forwarded to ``RandomizedSearchCV`` so the sampled candidates can be
        made repeatable between runs. The default ``None`` keeps the original
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(classifier, results_df)``: the fitted search object and the frame
        produced by ``retrieve_top_classifier_results`` for it.
    """
    classifier = pkg_model_selection.RandomizedSearchCV(
        estimator=model,
        param_distributions=model_params,
        n_iter=num_iterations,
        cv=num_folds,
        return_train_score=False,
        random_state=random_state,
    )
    classifier.fit(X=X_baseline, y=y_baseline)
    results_df = retrieve_top_classifier_results(classifier)
    return classifier, results_df
    

**Models**

In [11]:
# One entry per model to evaluate:
#   "name"     - label used in the summary tables
#   "instance" - unfitted estimator handed to the search
#   "params"   - parameter grid (a dict, or a list of dicts when only some
#                combinations of values are valid together)
model_config = [
    {
        "name" : "SVM",
        "instance" : pkg_svm.SVC(),
        "params" : {
            "C": [1, 10, 20],
            "kernel" : ['linear','poly','rbf','sigmoid'],
            "gamma": ['scale', 'auto'],
            "decision_function_shape" : ['ovo', 'ovr']
        }
    },
    {
        "name" : "DecisionTree",
        "instance" : pkg_tree.DecisionTreeClassifier(), 
        "params": {
            "criterion" : ["gini", "entropy", "log_loss"],
            "splitter" : ["best", "random"],
            "max_depth": [3, 6, 9, 12, 18],
            "max_features" : ["sqrt", "log2"],
            "random_state": [None, 1]
        }
    },
    {
        "name" : "RandomForest",
        "instance" : pkg_ensemble.RandomForestClassifier(), 
        "params": {
            "n_estimators": [100, 120, 150],
            "criterion" : ["gini", "entropy", "log_loss"],
            "max_depth": [3, 6, 9, 12, 18],
            "max_features" : ["sqrt", "log2"],
            "random_state": [None, 1],
            "class_weight" : ["balanced", "balanced_subsample"]
        }
    },
    {
        # NOTE(review): LinearRegression is a regressor, so the score reported
        # for it is not a classification accuracy and is not directly
        # comparable with the other rows - confirm it belongs in this list.
        "name" : "LinearRegression",
        "instance" : pkg_linear_model.LinearRegression(),
        "params": { 
            "fit_intercept" : [False, True],
            "positive" : [False, True]
        }
    },
    {
        "name" : "LogisticRegression",
        "instance" : pkg_linear_model.LogisticRegression(),
        # Each penalty is paired only with solvers that support it. Crossing
        # every penalty with every solver produces candidates that cannot be
        # fitted, and those failures go unnoticed while warnings are silenced.
        "params": [
            {
                "penalty" : ['l2'],
                "solver" : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                "C": [1, 10, 20],
                "random_state": [None, 1],
                "max_iter" : [100, 200]
            },
            {
                "penalty" : ['l1'],
                "solver" : ['liblinear', 'saga'],
                "C": [1, 10, 20],
                "random_state": [None, 1],
                "max_iter" : [100, 200]
            },
            {
                # elasticnet is only available with saga and needs l1_ratio.
                "penalty" : ['elasticnet'],
                "solver" : ['saga'],
                "l1_ratio" : [0.5],
                "C": [1, 10, 20],
                "random_state": [None, 1],
                "max_iter" : [100, 200]
            }
        ]
    },
    {
        "name" : "GaussianNaiveBayes",
        "instance" : pkg_naive_bayes.GaussianNB(),
        "params": { }
    },
    {
        "name" : "MultinomialNaiveBayes",
        "instance" : pkg_naive_bayes.MultinomialNB(), 
        "params": {
            "alpha": [1e-10, 1e-6, 1e-3, 0.1, 1.0],
            "fit_prior" : [ True, False]
        }
    }
]

In [12]:
# Run the exhaustive grid search for every configured model and record the
# best candidate (name, best mean CV score, best parameters) of each.
grid_search_records = []

for model_entry in model_config:
    # results_df holds the top candidates of the most recent search and is
    # left available for ad-hoc inspection; only the best one is recorded.
    classifier, results_df = perform_grid_search(
        model=model_entry["instance"],
        model_params=model_entry["params"],
        X_baseline=baseline_inputs,
        y_baseline=baseline_outputs,
    )
    grid_search_records.append({
        "name": model_entry["name"],
        "score": classifier.best_score_,
        "params": classifier.best_params_,
    })

# Build the summary frame once from the collected records instead of growing
# it one row at a time inside the loop.
best_results_df = pkg_pandas.DataFrame(grid_search_records, columns=["name", "score", "params"])

best_results_df

Unnamed: 0,name,score,params
0,SVM,0.97385,"{'C': 10, 'decision_function_shape': 'ovo', 'g..."
1,DecisionTree,0.778553,"{'criterion': 'gini', 'max_depth': 12, 'max_fe..."
2,RandomForest,0.946037,"{'class_weight': 'balanced', 'criterion': 'gin..."
3,LinearRegression,0.506557,"{'fit_intercept': True, 'positive': False}"
4,LogisticRegression,0.928234,"{'C': 1, 'max_iter': 100, 'penalty': 'l1', 'ra..."
5,GaussianNaiveBayes,0.806928,{}
6,MultinomialNaiveBayes,0.870907,"{'alpha': 1e-06, 'fit_prior': True}"


In [13]:
# Run the randomized search for every configured model and record the best
# candidate (name, best mean CV score, best parameters) of each.
# NOTE: this rebinds best_results_df, replacing any frame of the same name
# created by an earlier cell.
random_search_records = []

for model_entry in model_config:
    # results_df holds the top candidates of the most recent search and is
    # left available for ad-hoc inspection; only the best one is recorded.
    classifier, results_df = perform_random_search(
        model=model_entry["instance"],
        model_params=model_entry["params"],
        X_baseline=baseline_inputs,
        y_baseline=baseline_outputs,
        num_iterations=5,  # at most five sampled parameter settings per model
    )
    random_search_records.append({
        "name": model_entry["name"],
        "score": classifier.best_score_,
        "params": classifier.best_params_,
    })

# Build the summary frame once from the collected records instead of growing
# it one row at a time inside the loop.
best_results_df = pkg_pandas.DataFrame(random_search_records, columns=["name", "score", "params"])

best_results_df

Unnamed: 0,name,score,params
0,SVM,0.97385,"{'kernel': 'rbf', 'gamma': 'scale', 'decision_..."
1,DecisionTree,0.744627,"{'splitter': 'random', 'random_state': 1, 'max..."
2,RandomForest,0.937693,"{'random_state': 1, 'n_estimators': 150, 'max_..."
3,LinearRegression,0.506557,"{'positive': False, 'fit_intercept': True}"
4,LogisticRegression,0.91822,"{'solver': 'liblinear', 'random_state': None, ..."
5,GaussianNaiveBayes,0.806928,{}
6,MultinomialNaiveBayes,0.870907,"{'fit_prior': True, 'alpha': 0.1}"
