In [1]:
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


def model(X,y,CV_flag):
    if CV_flag == True:
        parameters = [
            {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
            {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
            {'C': [1, 10, 100, 1000], 'kernel': ['poly'], 'degree': [2, 3, 4], 'gamma': [0.001, 0.0001]},
            {'C': [1, 10, 100, 1000], 'kernel': ['sigmoid'], 'gamma': [0.001, 0.0001]}
            ]
        svc=svm.SVC()
        clf = GridSearchCV(svc, parameters, verbose=2, n_jobs=3)
        #clf=svc
        clf.fit(X,y)
        print("Best estimator found by grid search:")
        print(clf.best_estimator_)

    else:
        clf = svm.SVC(C=1000, kernel='poly', gamma=0.001,degree=2)
        #svc=svm.SVC()
        #parameters=[{'C': [1000], 'kernel': ['poly'], 
        #             'degree': [2], 'gamma': [0.1,0.01,0.001]}]
        #clf = GridSearchCV(svc, parameters, verbose=2, n_jobs=3)
        clf.fit(X,y)

    return clf


def model_estimation(fitted_model, X_test, y_test):
    test_score = fitted_model.score(X_test,y_test)
    y_pred = fitted_model.predict(X_test)
    print(test_score)
    print(metrics.classification_report(y_test, y_pred, 
                                        target_names=["0","1","2","3","4","5","6","7","8","9"]))
    print(metrics.confusion_matrix(y_test, y_pred, labels=range(10)))
    plt.clf()

def plot_decision_regions(x, y, model, resolution=0.1):

    ## 今回は被説明変数が3クラスのため散布図のマーカータイプと3種類の色を用意
    ## クラスの種類数に応じて拡張していくのが良いでしょう
    
    colors = ["red","blue","orange","greenyellow","green",
          "purple", "cyan","yellow","brown","black"]
    ## 2変数の入力データの最小値から最大値まで引数resolutionの幅でメッシュを描く
    x1_min, x1_max = x[:, 0].min()-1, x[:, 0].max()+1
    x2_min, x2_max = x[:, 1].min()-1, x[:, 1].max()+1
    x1_mesh, x2_mesh = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                                   np.arange(x2_min, x2_max, resolution))

    ## メッシュデータ全部を学習モデルで分類
    z = model.predict(np.array([x1_mesh.ravel(), x2_mesh.ravel()]).T)
    z = z.reshape(x1_mesh.shape)

    ## メッシュデータと分離クラスを使って決定境界を描いている
    plt.contourf(x1_mesh, x2_mesh, z, alpha=0.4, cmap=cmap)
    plt.xlim(x1_mesh.min(), x1_mesh.max())
    plt.ylim(x2_mesh.min(), x2_mesh.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=x[y == cl, 0],
                    y=x[y == cl, 1],
                    alpha=0.6,
                    c=cmap(idx),
                    edgecolors='black',
                    marker=markers[idx],
                    label=cl)

In [2]:
DATAPATH = "data/digits.csv"
CV_flag = False
df = pd.read_csv(DATAPATH)
y = df["label"].values
labels = df["label"] 
X = df.drop("label",axis=1).values/255
print(y)
print(X)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
clf = model(X_train,y_train,CV_flag)

[1 0 1 ... 4 7 9]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [3]:
model_estimation(clf, X_test, y_test)

0.9565
             precision    recall  f1-score   support

          0       0.96      0.98      0.97       199
          1       0.97      0.99      0.98       210
          2       0.93      0.97      0.95       182
          3       0.94      0.97      0.95       203
          4       0.95      0.97      0.96       185
          5       0.94      0.94      0.94       189
          6       0.97      0.95      0.96       220
          7       0.97      0.95      0.96       214
          8       0.97      0.92      0.94       205
          9       0.97      0.94      0.95       193

avg / total       0.96      0.96      0.96      2000

[[195   0   0   0   0   2   1   0   1   0]
 [  0 207   2   0   0   0   0   1   0   0]
 [  1   1 176   0   2   0   2   0   0   0]
 [  1   0   1 196   0   0   0   2   2   1]
 [  0   0   2   0 179   0   1   0   0   3]
 [  1   0   2   5   1 178   2   0   0   0]
 [  4   1   0   0   2   3 210   0   0   0]
 [  0   2   4   1   1   1   0 203   0   2]
 [  0   3 

<Figure size 432x288 with 0 Axes>

In [4]:
from sklearn.model_selection import cross_val_score
svm_ = svm.SVC(C=1000,gamma=0.001,kernel="poly",degree=2)
scores=cross_val_score(svm_,X,y,cv=5,verbose=2,n_jobs=3)

[Parallel(n_jobs=3)]: Done   3 out of   5 | elapsed:   31.9s remaining:   21.2s
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   52.8s finished


In [5]:
print(scores)

[0.96255617 0.95852074 0.9645     0.95397699 0.95743615]


In [6]:
tsne = TSNE(random_state=42,verbose=1,)
tsne_train = tsne.fit_transform(X_train)
tsne_test = tsne.fit_transform(X_test)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 8000 samples in 0.494s...
[t-SNE] Computed neighbors for 8000 samples in 105.752s...
[t-SNE] Computed conditional probabilities for sample 1000 / 8000
[t-SNE] Computed conditional probabilities for sample 2000 / 8000
[t-SNE] Computed conditional probabilities for sample 3000 / 8000
[t-SNE] Computed conditional probabilities for sample 4000 / 8000
[t-SNE] Computed conditional probabilities for sample 5000 / 8000
[t-SNE] Computed conditional probabilities for sample 6000 / 8000
[t-SNE] Computed conditional probabilities for sample 7000 / 8000
[t-SNE] Computed conditional probabilities for sample 8000 / 8000
[t-SNE] Mean sigma: 2.096096
[t-SNE] KL divergence after 250 iterations with early exaggeration: 85.799927
[t-SNE] Error after 1000 iterations: 1.722267
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2000 samples in 0.060s...
[t-SNE] Computed neighbors for 2000 samples in 5.930s...
[t-SNE] Computed conditional probab

In [9]:
# モデルの学習
#clf = model(tsne_train, y_train,True)
parameters = [
            {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
            ]
svc=svm.SVC()
clf = GridSearchCV(svc, parameters, verbose=2, n_jobs=3)
#clf=svc
clf.fit(tsne_train,y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=3)]: Done  24 out of  24 | elapsed:  1.7min finished


Best estimator found by grid search:
SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [10]:
model_estimation(clf, tsne_test, y_test)

0.0345
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       199
          1       0.00      0.00      0.00       210
          2       0.01      0.01      0.01       182
          3       0.01      0.02      0.02       203
          4       0.00      0.00      0.00       185
          5       0.12      0.24      0.16       189
          6       0.00      0.00      0.00       220
          7       0.00      0.00      0.00       214
          8       0.04      0.06      0.04       205
          9       0.04      0.03      0.03       193

avg / total       0.02      0.03      0.03      2000

[[  0  32 154   1   1   0   0   0  11   0]
 [  0   0   0  66  86  32   0   6  14   6]
 [  0   0   2   1   5  25   0   0 108  41]
 [  0  54  14   4   2   6   1  97   3  22]
 [ 18   0   0  82   0  69   0   0  16   0]
 [  0  74   1   8   1  46   0   0  59   0]
 [  6  24  70  86   3   0   0   0  30   1]
 [  9   0   0  12   0  86  32   0  75   0]
 [  0  16 

<Figure size 432x288 with 0 Axes>

In [None]:
plt.figure(figsize=(10,10))
#決定境界をプロット
plot_decision_regions(tsne_train, y_train, clf)