# MNIST DATASET

In [13]:
from sklearn import svm, metrics
from sklearn.neighbors import KNeighborsClassifier
import sklearn.model_selection as ms
import pandas as pd

### Importing Data

In [14]:
df = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Subsetting data to train faster

In [15]:
df = df.sample(frac=1)
df = df.head(5000)

In [16]:
x = df.drop("label", axis=1)
y = df["label"]
test = df_test.as_matrix()

In [17]:
x_train, x_val, y_train, y_val = ms.train_test_split(x, y, test_size=0.2, random_state=0)

In [18]:
classifier = KNeighborsClassifier()
classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [19]:
predicted = classifier.predict(x_val)
print (metrics.classification_report(predicted, y_val))

             precision    recall  f1-score   support

          0       0.98      0.93      0.96       104
          1       1.00      0.88      0.94       118
          2       0.94      0.96      0.95        93
          3       0.93      0.89      0.91       103
          4       0.91      0.95      0.93        98
          5       0.92      0.90      0.91        93
          6       0.98      0.93      0.95       101
          7       0.96      0.97      0.97       115
          8       0.79      1.00      0.89        77
          9       0.87      0.89      0.88        98

avg / total       0.93      0.93      0.93      1000



In [20]:
classifier = svm.SVC(gamma=0.001, kernel='linear')

In [21]:
from sklearn.model_selection import cross_val_predict

preds = cross_val_predict(classifier, X=x, y=y, cv=3)
print (metrics.classification_report(y, preds, digits=4))
print (metrics.confusion_matrix(y, preds))

             precision    recall  f1-score   support

          0     0.9512    0.9730    0.9620       481
          1     0.9405    0.9857    0.9626       561
          2     0.8748    0.9057    0.8900       509
          3     0.8688    0.8510    0.8598       490
          4     0.8855    0.9245    0.9046       477
          5     0.8383    0.8500    0.8441       500
          6     0.9284    0.9243    0.9263       449
          7     0.9436    0.9194    0.9314       546
          8     0.9200    0.8319    0.8737       470
          9     0.8968    0.8743    0.8854       517

avg / total     0.9051    0.9050    0.9047      5000

[[468   0   2   1   1   3   5   0   1   0]
 [  0 553   2   1   0   1   1   1   2   0]
 [  5   7 461   9   5   2   9   4   4   3]
 [  1   1  16 417   1  32   2   4  10   6]
 [  0   1   6   0 441   0   3   1   2  23]
 [  6   7   4  28   7 425   8   2   9   4]
 [  4   3  10   0   3  11 415   0   3   0]
 [  1   5  12   2   8   3   0 502   1  12]
 [  5  10   9  14

In [26]:
classifier.fit(x , y)
predict_test = classifier.predict(test)
out = zip(range(len(test)), predict_test)
with open("./solution.csv", 'w') as g:
    g.write("ImageId,Label\n")
    for id, cat in out:
        g.write(str(id + 1) + "," + str(cat) + "\n")