In [2]:
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import numpy as np
import seaborn as sns
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt

# Load Data

In [3]:
# Read the MATLAB data file; the result is a dict keyed by variable name.
mat = sio.loadmat(file_name='./data/ex6data3.mat')
# List the stored variables so the expected arrays can be checked by eye.
print(mat.keys())

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y', 'yval', 'Xval'])


In [11]:
# Training split: the two feature columns from 'X' plus the labels from 'y'.
training = pd.DataFrame(data=mat.get('X'), columns=['X1', 'X2']).assign(y=mat.get('y'))

# Cross-validation split, laid out the same way from 'Xval' and 'yval'.
cv = pd.DataFrame(data=mat.get('Xval'), columns=['X1', 'X2']).assign(y=mat.get('yval'))

In [5]:
# Dimensions of the training frame as (rows, columns).
print(training.shape)
# Preview the first five rows; as the cell's last expression it is displayed.
training.head(n=5)

(211, 3)


Unnamed: 0,X1,X2,y
0,-0.158986,0.423977,1
1,-0.347926,0.47076,1
2,-0.504608,0.353801,1
3,-0.596774,0.114035,1
4,-0.518433,-0.172515,1


In [6]:
# Dimensions of the cross-validation frame as (rows, columns).
print(cv.shape)
# Preview the first five rows; as the cell's last expression it is displayed.
cv.head(n=5)

(200, 3)


Unnamed: 0,X1,X2,y
0,-0.353062,-0.673902,0
1,-0.227126,0.44732,1
2,0.092898,-0.753524,0
3,0.148243,-0.718473,0
4,-0.001512,0.162928,0


# Manual grid search for $C$ and $\sigma$

http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [7]:
# Candidate hyper-parameter values, spaced roughly x3 apart from 0.01 to 100.
candidate = [
    0.01, 0.03,
    0.1, 0.3,
    1, 3,
    10, 30,
    100,
]

In [9]:
# The second element of each pair is named gamma to comply with the sklearn
# parameter name. Pairs are ordered with C varying slowest, gamma fastest.
combination = []
for C in candidate:
    combination.extend((C, gamma) for gamma in candidate)
print("combination length: ", len(combination))

combination length:  81


In [13]:
# Fit one SVC per (C, gamma) pair on the training frame and record its score
# on the cv frame; search[i] lines up with combination[i].
search = []

for C, gamma in combination:
    # fit() returns the estimator itself, so construction and fitting chain.
    svc = svm.SVC(C=C, gamma=gamma).fit(training[['X1', 'X2']], training['y'])
    search.append(svc.score(cv[['X1', 'X2']], cv['y']))

In [14]:
# Position of the highest cv score (np.argmax returns the first one on ties);
# computed once and reused for both lookups.
best_idx = int(np.argmax(search))
best_score = search[best_idx]
best_param = combination[best_idx]

print(best_score, best_param)

0.965 (0.3, 100)


In [20]:
# Refit with the winning pair found by the manual search instead of retyping
# the numbers, so this cell cannot drift out of sync with the search result.
# best_param is ordered (C, gamma) -- here C is 0.3 and gamma is 100, not
# C = 100 and gamma = 0.3 -- so unpack by position and pass both by keyword.
best_C, best_gamma = best_param
best_svc = svm.SVC(C=best_C, gamma=best_gamma)
best_svc.fit(training[['X1', 'X2']], training['y'])
print("best score : ", best_svc.score(cv[['X1', 'X2']], cv['y']), "\n")
ypred = best_svc.predict(cv[['X1', 'X2']])

# Per-class precision / recall / F1 on the cv frame.
print(metrics.classification_report(cv['y'], ypred))

best score :  0.965 

              precision    recall  f1-score   support

           0       0.97      0.96      0.97       113
           1       0.95      0.97      0.96        87

    accuracy                           0.96       200
   macro avg       0.96      0.97      0.96       200
weighted avg       0.97      0.96      0.97       200



# sklearn GridSearchCV (网格搜索)

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [22]:
# Search grid: the same candidate list is used for both C and gamma.
parameters = dict(C=candidate, gamma=candidate)
svc = svm.SVC()
# n_jobs=-1 asks scikit-learn to use all available processors.
clf = GridSearchCV(estimator=svc, param_grid=parameters, n_jobs=-1)
# Only the training frame is handed to the search; the fitted object is the
# cell's last expression, so its repr is displayed.
clf.fit(training[['X1', 'X2']], training['y'])

GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                         'gamma': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]})

In [23]:
# Parameter combination selected by the fitted grid search.
clf.best_params_

{'C': 30, 'gamma': 3}

In [24]:
# Best score found by the fitted grid search (per the scikit-learn docs, the
# mean cross-validated score of the best estimator).
clf.best_score_

0.9194905869324475

In [25]:
# Predict on the cv frame with the fitted grid search, then summarise
# per-class precision / recall / F1 against the true cv labels.
ypred = clf.predict(cv[['X1', 'X2']])
print(metrics.classification_report(y_true=cv['y'], y_pred=ypred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       113
           1       0.95      0.93      0.94        87

    accuracy                           0.95       200
   macro avg       0.95      0.95      0.95       200
weighted avg       0.95      0.95      0.95       200



> Curiously... the two searches do not give the same result. Why?  

So the built-in sklearn grid search is trying to find the best candidate using the **training set**.  
However, when we did the manual grid search, we trained on the training set but picked the best candidate using the **cross validation set**. That would be the reason for the difference.

### I was wrong. That is not the reason
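It turns out that **GridSearchCV** sets aside part of the data it is given as a validation split (k-fold cross-validation; 5 folds by default in recent scikit-learn versions) and uses it to find the best candidate.  
So the reason for the different result is simply that GridSearchCV here trains each candidate on only part of the **training data**, because it needs the rest as its CV set, and it never sees our separate cross validation set during the search.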