# Finding best model and hyper parameter tunning using GridSearchCV
#### For iris flower dataset in sklearn library, we are going to find out best model and best hyper parameters using GridSearchCV

In [2]:
import pandas as pd
from sklearn.datasets import load_iris

In [3]:
iris_dataset = load_iris()

In [4]:
dir(iris_dataset)

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [5]:
iris_dataset.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [6]:
df = pd.DataFrame(iris_dataset.data, columns=iris_dataset.feature_names)

In [10]:
df['flower'] = iris_dataset.target
df['flower'] = df['flower'].apply(lambda x: iris_dataset.target_names[x])

In [11]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## Approach 1: train_test_split and manually tune parameters by trial and error
### But it is not good way because everytime it will produce different score

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [24]:
x_train, x_test, y_train, y_test = train_test_split(iris_dataset.data, iris_dataset.target, train_size=0.3)

In [25]:
model = SVC(kernel='rbf',C=30,gamma='auto')
model.fit(x_train,y_train)
model.score(x_test, y_test)

0.9428571428571428

### Approach 2: Use K Fold Cross validation
#### Manually try suppling models with different parameters to cross_val_score function with 5 fold cross validation

In [27]:
from sklearn.model_selection import cross_val_score

In [47]:
svm_scores1 = cross_val_score(SVC(kernel='linear',C=10,gamma='auto'),iris_dataset.data, iris_dataset.target, cv=5)

In [48]:
svm_scores1

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [49]:
svm_scores1.mean()

0.9733333333333334

In [50]:
svm_scores2 = cross_val_score(SVC(kernel='rbf',C=10,gamma='auto'),iris_dataset.data, iris_dataset.target, cv=5)

In [51]:
svm_scores2

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [52]:
svm_scores2.mean()

0.9800000000000001

In [53]:
svm_scores3 = cross_val_score(SVC(kernel='rbf',C=20,gamma='auto'),iris_dataset.data, iris_dataset.target, cv=5)

In [54]:
svm_scores3

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [55]:
svm_scores3.mean()

0.9666666666666668

#### Above approach is tiresome and very manual. We can use for loop as an alternative

In [61]:
kernals = ['rbf', 'linear']
C = [1,10, 20, 30]
avg_scores = {}

for kval in kernals:
    for cval in C:
        scores = cross_val_score(SVC(kernel=kval,C=cval,gamma='auto'),iris_dataset.data, iris_dataset.target, cv=5)
        avg_scores[kval+'_'+str(cval)] = scores.mean()

In [62]:
avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'rbf_30': 0.96,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666,
 'linear_30': 0.96}

#### From above results we can say that rbf with C=1 or 10 or linear with C=1 will give best performance

### Approach 3: Use GridSearchCV
#### GridSearchCV does exactly same thing as for loop above but in a single line of code

In [63]:
from sklearn.model_selection import GridSearchCV

In [90]:
clf_scores = GridSearchCV(SVC(gamma='auto'), {'kernel': ['rbf','linear'],
"C":[1,10, 20, 30]}, cv=5, return_train_score=False)

In [91]:
clf_scores.fit(iris_dataset.data, iris_dataset.target)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 20, 30], 'kernel': ['rbf', 'linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [92]:
clf_scores.cv_results_

{'mean_fit_time': array([0.00059791, 0.00042405, 0.00047536, 0.00041223, 0.00051346,
        0.00045505, 0.00051026, 0.00046515]),
 'std_fit_time': array([1.02463130e-04, 1.66200837e-05, 1.23917125e-05, 2.43275757e-05,
        2.64743838e-05, 1.89086226e-05, 1.25067760e-05, 1.54137197e-05]),
 'mean_score_time': array([0.00032001, 0.00026927, 0.00026364, 0.00025377, 0.00028105,
        0.0002655 , 0.000282  , 0.00028157]),
 'std_score_time': array([7.30945482e-05, 1.83774798e-05, 1.81824455e-06, 2.71671668e-06,
        1.02574312e-05, 1.15199357e-05, 1.14692967e-05, 3.30953844e-05]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20, 30, 30],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear',
                    'rbf', 'linear'],
              mask=[False, False, False, False, False, False, False, False],
        fill_valu

In [93]:
df = pd.DataFrame(clf.cv_results_)

In [94]:
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000739,0.000162,0.000413,0.000106,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000432,2.2e-05,0.000264,1e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000475,1.8e-05,0.000263,3e-06,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000447,3e-05,0.000282,2.7e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000551,2.9e-05,0.000291,1.1e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000461,1.8e-05,0.000275,1.8e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,5


In [95]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [96]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_required_parameters',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score',
 'scorer_',
 'scoring',
 '

In [97]:
clf.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [98]:
clf.best_score_

0.98

#### Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [99]:
from sklearn.model_selection import RandomizedSearchCV

In [101]:
rs = RandomizedSearchCV(SVC(gamma='auto'), {
        'C': [1,10,20],
        'kernel': ['rbf','linear']
    }, 
    cv=5, 
    return_train_score=False, 
)
rs.fit(iris_dataset.data, iris_dataset.target)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]



Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


# How about different models with different hyperparameters?

In [125]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10, 20]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [126]:
scores = []

for model_name, params in model_params.items():
    clf =  GridSearchCV(params['model'], params['params'], cv=5, return_train_score=False)
    clf.fit(iris_dataset.data, iris_dataset.target)
    scores.append([model_name, clf.best_score_, clf.best_params_])

In [127]:
scores

[['svm', 0.98, {'C': 1, 'kernel': 'rbf'}],
 ['random_forest', 0.96, {'n_estimators': 10}],
 ['logistic_regression', 0.9666666666666667, {'C': 5}]]

In [128]:
model_score_df = pd.DataFrame(scores, columns=['Model', 'Score', 'Param'])

In [129]:
model_score_df

Unnamed: 0,Model,Score,Param
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}


#### Based on above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification