In [1]:
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

def model(datapath):
    df = pd.read_csv(datapath)
    #df_s = df.sort_values('label')
    y = df["label"].values
    labels = df["label"] 
    X = df.drop("label",axis=1).values/255
    #print(df.head())
    #print(df.shape)
    print(y)
    print(X)
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    parameters = {'C': [1000], 'kernel': ['poly'], 'gamma': [0.001]}
    svc = svm.SVC(parameters)
    #parameters = {"gamma":[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]}
    parameters = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
        {'C': [1, 10, 100, 1000], 'kernel': ['poly'], 'degree': [2, 3, 4], 'gamma': [0.001, 0.0001]},
        {'C': [1, 10, 100, 1000], 'kernel': ['sigmoid'], 'gamma': [0.001, 0.0001]}
        ]
    
    clf = GridSearchCV(svc, parameters, verbose=2, n_jobs=-1)
    #clf=svc
    clf.fit(X_train,y_train)
    print("Best estimator found by grid search:")
    print(clf.best_estimator_)
    return clf, X_test, y_test


def model_estimation(fitted_model, X_test, y_test):
    test_score = fitted_model.score(X_test,y_test)
    y_pred = fitted_model.predict(X_test)
    print(test_score)
    print(metrics.classification_report(y_test, y_pred, 
                                        target_names=["0","1","2","3","4","5","6","7","8","9"]))
    print(metrics.confusion_matrix(y_test, y_pred, labels=range(10)))
    plt.clf()


In [2]:
DATAPATH = "data/digits.csv"
clf,X_test, y_test = model(DATAPATH)

[1 0 1 ... 4 7 9]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Fitting 3 folds for each of 44 candidates, totalling 132 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 132 out of 132 | elapsed: 64.7min finished


Best estimator found by grid search:
SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma=0.001, kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [3]:
model_estimation(clf, X_test, y_test)

0.9575
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       206
          1       0.96      0.96      0.96       244
          2       0.95      0.95      0.95       220
          3       0.97      0.94      0.95       207
          4       0.94      0.96      0.95       191
          5       0.97      0.95      0.96       182
          6       0.94      0.98      0.96       192
          7       0.97      0.95      0.96       194
          8       0.98      0.93      0.95       194
          9       0.93      0.98      0.95       170

avg / total       0.96      0.96      0.96      2000

[[203   0   0   0   1   0   1   0   1   0]
 [  0 235   3   0   1   0   3   1   1   0]
 [  0   1 209   1   4   0   2   1   1   1]
 [  1   1   3 194   0   4   2   1   1   0]
 [  0   0   1   0 184   0   1   0   0   5]
 [  1   0   1   4   0 172   3   0   0   1]
 [  2   0   1   0   1   0 188   0   0   0]
 [  2   1   2   1   2   0   0 184   0   2]
 [  0   5 

<Figure size 432x288 with 0 Axes>

In [7]:
results = pd.DataFrame(clf.cv_results_)
print(clf.best_score_)
display(results)

0.9515




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_gamma,param_degree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,11.008101,0.11007,7.88877,0.045987,1,linear,,,"{'C': 1, 'kernel': 'linear'}",0.917228,0.907387,0.907623,0.91075,0.004586,13,0.999812,1.0,1.0,0.999937,8.844362e-05
1,11.082582,0.170012,7.916274,0.013027,10,linear,,,"{'C': 10, 'kernel': 'linear'}",0.91573,0.907387,0.907247,0.910125,0.003968,14,1.0,1.0,1.0,1.0,0.0
2,11.156388,0.264457,7.917129,0.093096,100,linear,,,"{'C': 100, 'kernel': 'linear'}",0.91573,0.907387,0.907247,0.910125,0.003968,14,1.0,1.0,1.0,1.0,0.0
3,11.658526,0.077262,7.896836,0.02925,1000,linear,,,"{'C': 1000, 'kernel': 'linear'}",0.91573,0.907387,0.907247,0.910125,0.003968,14,1.0,1.0,1.0,1.0,0.0
4,33.30829,0.11927,16.106756,0.056728,1,rbf,0.001,,"{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}",0.910861,0.900262,0.907247,0.906125,0.004401,18,0.913696,0.913745,0.913435,0.913625,0.0001361872
5,79.108022,0.993484,23.457244,0.172545,1,rbf,0.0001,,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.785393,0.776903,0.779196,0.7805,0.003588,25,0.787805,0.789987,0.785835,0.787875,0.001695838
6,14.646064,0.071234,9.866976,0.11195,10,rbf,0.001,,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}",0.934457,0.928384,0.929778,0.930875,0.002598,4,0.956098,0.959497,0.957841,0.957812,0.001388161
7,30.154826,0.017725,15.758705,0.07529,10,rbf,0.0001,,"{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}",0.908614,0.898388,0.904994,0.904,0.004235,19,0.911257,0.912057,0.910624,0.911313,0.000586364
8,11.654849,0.092454,8.286287,0.097575,100,rbf,0.001,,"{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}",0.935955,0.928759,0.932407,0.932375,0.002939,3,0.996248,0.996625,0.997189,0.996687,0.0003870109
9,14.069683,0.077032,9.743914,0.250492,100,rbf,0.0001,,"{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}",0.929963,0.926134,0.923395,0.9265,0.002694,6,0.95197,0.953872,0.952782,0.952875,0.000779287


In [8]:
from sklearn.model_selection import cross_val_score
svm_ = svm.SVC(C=1000,gamma=0.001,kernel="poly",degree=2)
df = pd.read_csv(DATAPATH)
y = df["label"].values
labels = df["label"] 
X = df.drop("label",axis=1).values/255
scores=cross_val_score(svm_,X,y,cv=5)

In [9]:
print(scores)

[0.96255617 0.95852074 0.9645     0.95397699 0.95743615]
