In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline 

COLOR = 'white'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR

pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.float_format = '{:,.2f}'.format

# WHICH MODEL TO CHOOSE?

In [3]:
from sklearn import datasets
iris = datasets.load_iris()

In [4]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.10,3.50,1.40,0.20
1,4.90,3.00,1.40,0.20
2,4.70,3.20,1.30,0.20
3,4.60,3.10,1.50,0.20
4,5.00,3.60,1.40,0.20
...,...,...,...,...
145,6.70,3.00,5.20,2.30
146,6.30,2.50,5.00,1.90
147,6.50,3.00,5.20,2.00
148,6.20,3.40,5.40,2.30


#### Approach 1: Use train_test_split and manually tune parameters by trial and error (not reliable because everytime we refresh, it can give a different value)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [6]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9555555555555556

#### Approach 2: Use K Fold Cross validation

In [7]:
from sklearn.model_selection import cross_val_score

cross_val_score(SVC(kernel='linear',C=10,gamma='auto'),iris.data, iris.target, cv=5)
# but again here we'll have to choose different attributes of SVC to see which one fits the best, instead, we can use a loop of course, but then again thatll be tiresome, so we just use grid search cv.

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [8]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(SVC(gamma='auto'), {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear']
}, cv=5, return_train_score=False)

clf.fit(iris.data, iris.target)
clf.cv_results_
# but theres no way in hell I am reading the stuff written below, I wonder if there was a way to turn all this info into a dataframe...

{'mean_fit_time': array([0.00100203, 0.00080328, 0.00099001, 0.00039887, 0.00079899,
        0.00079641]),
 'std_fit_time': array([1.27685559e-05, 4.01835440e-04, 1.40514129e-05, 4.88519238e-04,
        3.99495622e-04, 3.98217345e-04]),
 'mean_score_time': array([0.00059876, 0.00039344, 0.00019984, 0.00059853, 0.00039978,
        0.00019846]),
 'std_score_time': array([0.00048889, 0.00048198, 0.00039968, 0.00048869, 0.00048963,
        0.00039692]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20

In [9]:
tuning_df = pd.DataFrame(clf.cv_results_)
tuning_df
# here we go now we can view this, now in this whole table, the only useful things for us is mean test score and the attributes that we wanted to cross validate 

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0,0.0,0.0,0.0,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.97,1.0,0.97,0.97,1.0,0.98,0.02,1
1,0.0,0.0,0.0,0.0,1,linear,"{'C': 1, 'kernel': 'linear'}",0.97,1.0,0.97,0.97,1.0,0.98,0.02,1
2,0.0,0.0,0.0,0.0,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.97,1.0,0.97,0.97,1.0,0.98,0.02,1
3,0.0,0.0,0.0,0.0,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.97,1.0,0.97,0.04,4
4,0.0,0.0,0.0,0.0,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.97,1.0,0.9,0.97,1.0,0.97,0.04,5
5,0.0,0.0,0.0,0.0,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.93,1.0,0.97,0.04,6


In [10]:
tuning_df[['params', 'param_C', 'mean_test_score']], 

(                          params param_C  mean_test_score
 0      {'C': 1, 'kernel': 'rbf'}       1             0.98
 1   {'C': 1, 'kernel': 'linear'}       1             0.98
 2     {'C': 10, 'kernel': 'rbf'}      10             0.98
 3  {'C': 10, 'kernel': 'linear'}      10             0.97
 4     {'C': 20, 'kernel': 'rbf'}      20             0.97
 5  {'C': 20, 'kernel': 'linear'}      20             0.97,)

In [11]:
clf.best_score_

0.9800000000000001

In [12]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}