# Finding the best model and tuning hyperparameters using GridSearchCV
#### For the iris flower dataset in the sklearn library, we are going to find the best model and the best hyperparameters using GridSearchCV

In [1]:
# Load the iris flower dataset that ships with scikit-learn.
# Later cells read the features from `iris.data` and the class labels from `iris.target`.
from sklearn import svm, datasets
iris = datasets.load_iris()

## Approach 1: Use train_test_split and manually tune parameters by trial and error

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. A fixed random_state makes the split,
# and therefore the reported score, repeatable across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

# Hand-picked hyperparameters: this is the trial-and-error baseline.
model = svm.SVC(kernel='rbf', C=30, gamma='auto')
model.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
model.score(X_test, y_test)

0.9333333333333333

## Approach 2: Use K Fold Cross validation and loop through Hyper parameters

In [7]:
# Score several hand-picked hyperparameter combinations with 5-fold cross validation
from sklearn.model_selection import cross_val_score
import numpy as np

# Writing one call per combination, e.g.
#   cross_val_score(svm.SVC(kernel='linear', C=10, gamma='auto'), iris.data, iris.target, cv=5)
# gets repetitive quickly, so sweep every kernel/C pair instead.
kernels = ['rbf', 'linear']
C = [1,10,20]

# Key: '<kernel>_<C>'; value: average of the 5 fold scores for that combination.
avg_scores = {
    '{}_{}'.format(kernel, c_value): np.average(
        cross_val_score(
            svm.SVC(kernel=kernel, C=c_value, gamma='auto'),
            iris.data, iris.target, cv=5
        )
    )
    for kernel in kernels
    for c_value in C
}

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

## Approach 3: Use GridSearchCV
#### GridSearchCV does exactly the same thing as the for loop above, but in a single line of code

In [17]:
# Use GridSearchCV to cross-validate every combination in a hyperparameter grid
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Same kernel/C candidates as the manual loop in Approach 2.
param_grid = {
    'C': [1,10,20],
    'kernel': ['rbf','linear']
}
clf = GridSearchCV(svm.SVC(gamma='auto'), param_grid, cv=5, return_train_score=False)
clf.fit(iris.data, iris.target)
print("clf.cv_results_: ",clf.cv_results_)

# Load cv_results_ into a DataFrame to view it as a table.
df = pd.DataFrame(clf.cv_results_)
print("df: ",df.head())
summary_columns = ['param_C','param_kernel','mean_test_score']
print("Reduced df: ",df[summary_columns])

# Best parameter combination and its mean cross-validated score.
for label, value in (
    ("clf.best_params_", clf.best_params_),
    ("clf.best_score_", clf.best_score_),
):
    print(label, value)
print("methods in clf: ",dir(clf))


clf.cv_results_:  {'mean_fit_time': array([0.00119753, 0.00119681, 0.00059843, 0.00059805, 0.00079765,
       0.00100598]), 'std_fit_time': array([3.98898477e-04, 7.46684518e-04, 4.88616597e-04, 4.88305129e-04,
       3.98826628e-04, 1.53646670e-05]), 'mean_score_time': array([0.00099711, 0.00039907, 0.00039897, 0.0001996 , 0.00019956,
       0.        ]), 'std_score_time': array([8.84401178e-07, 4.88753305e-04, 4.88636085e-04, 3.99208069e-04,
       3.99112701e-04, 0.00000000e+00]), 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1, 'kernel': 'rbf'}, {'C': 1, 'kernel': 'linear'}, {'C': 10, 'kernel': 'rbf'}, {'C': 10, 'kernel': 'linear'}, {'C': 20, 'kernel': 

## Approach 4: Use RandomizedSearchCV
#### RandomizedSearchCV reduces the number of iterations by trying only a random subset of the parameter combinations. This is useful when you have too many parameters to try and training takes a long time, because it reduces the cost of computation

In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Try only n_iter=2 of the 6 possible C/kernel combinations instead of all of them.
# random_state pins which combinations are drawn, so the result is the same on every run.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1,10,20],
        'kernel': ['rbf','linear']
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42
)
rs.fit(iris.data, iris.target)

# Show just the sampled parameters and their mean cross-validated score.
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.966667
1,1,rbf,0.98


# Tuning hyperparameters for multiple models using a Python dictionary
#### Using a Python dictionary to supply the model and parameter grid to RandomizedSearchCV and GridSearchCV automatically

In [21]:
# Choosing best out of 3 models
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate models, declared as a plain Python dictionary:
#   name -> {'model': estimator instance, 'params': hyperparameter grid to search}
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        # Seeded so the forest, and hence its cross-validated score, is repeatable.
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        # NOTE(review): `multi_class` is deprecated in newer scikit-learn releases;
        # confirm against the installed version before upgrading.
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

# Columns of the summary table; also the keys of each per-model record.
score_columns = ['model','best_score','best_params']

# One record per candidate model, in the order they appear in model_params.
scores = []

# Run a 5-fold grid search for every model/grid pair and keep its best result.
for model_name, spec in model_params.items():
    clf = GridSearchCV(spec['model'], spec['params'], cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    best_result = (model_name, clf.best_score_, clf.best_params_)
    scores.append(dict(zip(score_columns, best_result)))

df = pd.DataFrame(scores, columns=score_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


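##### Based on the table above, SVM with C=1 and kernel='rbf' gives the best cross-validated score (0.98) for iris flower classification. Note that kernel='linear' with C=1 reached the same mean score in the Approach 2 results, so the two settings are effectively tied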

## Exercise
#### Finding the best model and hyperparameters for sklearn digits dataset classification

In [22]:
# Load the digits dataset that ships with scikit-learn.
# The search cell below reads the features from `digits.data` and the labels from `digits.target`.
from sklearn import datasets
digits = datasets.load_digits()

In [23]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier



# Candidate models for the digits exercise:
#   name -> {'model': estimator instance, 'params': hyperparameter grid to search}
# An empty 'params' dict means the model is evaluated with its default settings only.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        # Seeded so the forest, and hence its cross-validated score, is repeatable.
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        # NOTE(review): `multi_class` is deprecated in newer scikit-learn releases;
        # confirm against the installed version before upgrading.
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        # Seeded for the same reason as the random forest.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini','entropy']
        }
    }
}

In [25]:
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# One record per candidate model: its name, best mean CV score and best parameters.
scores = []

for model_name, mp in model_params.items():
    # NOTE: every grid in model_params has fewer candidates than the default n_iter=10,
    # so scikit-learn evaluates all of them (and warns about it).
    # random_state fixes the order in which candidates are drawn, which keeps
    # best_params_ stable when two candidates tie on score.
    clf = RandomizedSearchCV(
        mp['model'], mp['params'], cv=5, return_train_score=False, random_state=42
    )
    clf.fit(digits.data, digits.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df



Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'kernel': 'linear', 'C': 1}"
1,random_forest,0.896535,{'n_estimators': 10}
2,logistic_regression,0.922114,{'C': 1}
3,naive_bayes_gaussian,0.806928,{}
4,naive_bayes_multinomial,0.87035,{}
5,decision_tree,0.80469,{'criterion': 'entropy'}


### The winner is SVM (kernel='linear', C=1, mean cross-validated score ≈ 0.948)

# Other useful attributes of a fitted GridSearchCV object (here called `cv_curr`):  
cv_curr.cv_results_  
cv_curr.best_score_  
cv_curr.best_estimator_  
cv_curr.best_params_  
