## Grid Search CV: Iris Flowers

### Preface

**Summary**
- KFold splits the baseline data into various train-test folds 
- Cross Validation (cross_val_score) calculates score of an model/algorithm for various folds
- Grid Search calculates score of a model/algorithm for various parameters _and_ folds
- Grid Search is costly as it calculates score for every permutation and combination
- Random Search is an alternative, calculate score for a random combination of parameters _and_ folds

**Acknowledgements**
- TBD 

### Initialization

**Packages**

In [1]:
import pandas as pkg_pandas
import math as pkg_math
from matplotlib import pyplot as pkg_plot
from sklearn import linear_model as pkg_linear_model
from sklearn import model_selection as pkg_model_selection
from sklearn import preprocessing as pkg_preprocessing
from sklearn import tree as pkg_tree
from sklearn import metrics as pkg_metrics
from sklearn import datasets as pkg_datasets
from sklearn import ensemble as pkg_ensemble
from sklearn import svm as pkg_svm
from sklearn import naive_bayes as pkg_naive_bayes
import seaborn as pkg_seaborn

**Common**

In [2]:
%matplotlib inline

**Load Data**

In [3]:
dataset = pkg_datasets.load_iris()
dir(dataset)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [4]:
column_names = []
for feature_name in dataset.feature_names:
    column_name = feature_name.replace(' ', '_').replace('(','').replace(')','')
    column_names.append(column_name)

print("\nFeature Names = {}\nTarget Names = {}\nColumn Names = {}".format(\
    dataset.feature_names, dataset.target_names, column_names))


Feature Names = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target Names = ['setosa' 'versicolor' 'virginica']
Column Names = ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm']


In [5]:
dataset_df = pkg_pandas.DataFrame(dataset.data, columns=column_names)
dataset_df['flower_number'] = dataset.target
dataset_df['flower_name'] = dataset_df['flower_number'].apply(lambda fnum: dataset.target_names[fnum])
dataset_df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,flower_number,flower_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [6]:
output_column_name = 'flower_number'
baseline_outputs = dataset_df[output_column_name]
baseline_inputs = dataset_df.drop(columns=[output_column_name, 'flower_name']).to_numpy()

### Process

**Common**

In [7]:
min_score = 0.60
max_results = 5

In [8]:
def retrieve_top_classifier_results(classifier):
    results_df = pkg_pandas.DataFrame(classifier.cv_results_)
    results_df = results_df[["params", "mean_test_score", "rank_test_score"]]
    results_df.sort_values(by=["mean_test_score"], ascending=False, inplace=True)
    results_df = results_df[results_df["mean_test_score"] > min_score]
    return results_df.head(max_results)

In [9]:
def perform_grid_search(model, model_params, X_baseline, y_baseline):
    classifier = pkg_model_selection.GridSearchCV(estimator=model,\
        param_grid=model_params, cv=5, return_train_score=False)
    classifier.fit(X=X_baseline, y=y_baseline)
    results_df = retrieve_top_classifier_results(classifier)
    return classifier, results_df
    

In [10]:
def perform_random_search(model, model_params, X_baseline, y_baseline, num_iterations):
    classifier = pkg_model_selection.RandomizedSearchCV(estimator=model, \
        param_distributions=model_params, n_iter=num_iterations, cv=5, return_train_score=False)
    classifier.fit(X=X_baseline, y=y_baseline)
    results_df = retrieve_top_classifier_results(classifier)
    return classifier, results_df
    

**Models**

In [11]:
model_config = [
    {
        "name" : "SVM",
        "instance" : pkg_svm.SVC(),
        "params" : {
            "C": [1, 10, 20],
            "kernel" : ['linear','poly','rbf','sigmoid'],
            "gamma": ['scale', 'auto'],
            "decision_function_shape" : ['ovo', 'ovr']
        }
    },
    {
        "name" : "DecisionTree",
        "instance" : pkg_tree.DecisionTreeClassifier(), 
        "params": {
            "criterion" : ["gini", "entropy", "log_loss"],
            "splitter" : ["best", "random"],
            "max_depth": [3, 6, 9, 12, 18],
            "max_features" : ["sqrt", "log2"],
            "random_state": [None, 1]
        }
    },
    {
        "name" : "RandomForest",
        "instance" : pkg_ensemble.RandomForestClassifier(), 
        "params": {
            "n_estimators": [100, 120, 150],
            "criterion" : ["gini", "entropy", "log_loss"],
            "max_depth": [3, 6, 9, 12, 18],
            "max_features" : ["sqrt", "log2"],
            "random_state": [None, 1],
            "class_weight" : ["balanced", "balanced_subsample"]
        }
    },
    {
        "name" : "LinearRegression",
        "instance" : pkg_linear_model.LinearRegression(),
        "params": { 
            "fit_intercept" : [False, True],
            "positive" : [False, True]
        }
    },
    {
        "name" : "LogisticRegression",
        "instance" : pkg_linear_model.LogisticRegression(),
        "params": { 
            "penalty" : ['l1', 'l2', 'elasticnet'],
            "dual" : [False, True],
            "tol" : [1e-3, 1e-4],
            "C": [1, 10, 20],
            "fit_intercept" : [False, True],
            "random_state": [None, 1],
            "solver" : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            "max_iter" : [50, 90, 150, 230]
        }
    },
    {
        "name" : "GaussianNaiveBayes",
        "instance" : pkg_naive_bayes.GaussianNB(),
        "params": { }
    },
    {
        "name" : "MultinomialNaiveBayes",
        "instance" : pkg_naive_bayes.MultinomialNB(), 
        "params": {
            "alpha": [1e-10, 1e-6, 1e-3, 0.1, 1.0],
            "fit_prior" : [ True, False]
        }
    }
]

In [12]:
best_results_df = pkg_pandas.DataFrame(columns=["name", "score", "params"])

for mc in model_config:
    #print("DEBUG:: Model Config: {}".format(mc))
    classifier, results_df = perform_grid_search(model=mc["instance"], \
        model_params=mc["params"], X_baseline=baseline_inputs, y_baseline=baseline_outputs)
    best_results_df.loc[len(best_results_df)] = [mc["name"], classifier.best_score_, classifier.best_params_]

best_results_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,name,score,params
0,SVM,0.98,"{'C': 1, 'decision_function_shape': 'ovo', 'ga..."
1,DecisionTree,0.973333,"{'criterion': 'gini', 'max_depth': 6, 'max_fea..."
2,RandomForest,0.966667,"{'class_weight': 'balanced', 'criterion': 'gin..."
3,LinearRegression,0.322697,"{'fit_intercept': False, 'positive': False}"
4,LogisticRegression,0.986667,"{'C': 1, 'dual': False, 'fit_intercept': True,..."
5,GaussianNaiveBayes,0.953333,{}
6,MultinomialNaiveBayes,0.953333,"{'alpha': 1e-10, 'fit_prior': True}"


In [13]:
best_results_df = pkg_pandas.DataFrame(columns=["name", "score", "params"])

for mc in model_config:
    classifier, results_df = perform_random_search(model=mc["instance"], \
        model_params=mc["params"], X_baseline=baseline_inputs, y_baseline=baseline_outputs, num_iterations=5)
    best_results_df.loc[len(best_results_df)] = [mc["name"], classifier.best_score_, classifier.best_params_]

best_results_df

15 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/raooruga/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/raooruga/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1101, in fit
    raise ValueError(
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/raooruga/.local/lib/python3.8/site-pack

Unnamed: 0,name,score,params
0,SVM,0.98,"{'kernel': 'poly', 'gamma': 'scale', 'decision..."
1,DecisionTree,0.953333,"{'splitter': 'best', 'random_state': None, 'ma..."
2,RandomForest,0.966667,"{'random_state': 1, 'n_estimators': 150, 'max_..."
3,LinearRegression,0.322697,"{'positive': False, 'fit_intercept': False}"
4,LogisticRegression,0.98,"{'tol': 0.0001, 'solver': 'newton-cg', 'random..."
5,GaussianNaiveBayes,0.953333,{}
6,MultinomialNaiveBayes,0.953333,"{'fit_prior': False, 'alpha': 1.0}"
