In [52]:
# Imports
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [60]:
# Load the labelled training set from the local data directory
df = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [61]:
# Inspect the (rows, columns) dimensions of the loaded frame
df.shape

(42000, 785)

In [62]:
# Work on a small subset so the model fitting below runs quickly.
# The cut is positional (the first N_SAMPLES rows), so it does not rely on the
# index labels happening to be 0..n-1 the way an inclusive label slice would.
N_SAMPLES = 2000
df = df.iloc[:N_SAMPLES]

In [63]:
# Separate the target from the features: `label` is what we predict,
# the pixel0..pixel783 columns are the model inputs
y = df.loc[:, 'label']
X = df.loc[:, 'pixel0':'pixel783']

In [64]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# Preview the first target labels
y.head()

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64

In [66]:
# Hold out a test set. No explicit test_size is given, so the library's
# default proportions apply; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [73]:
# Establish a KNN baseline: mean 5-fold cross-validated accuracy on the
# training split for several neighbourhood sizes. One value is printed per
# k, in the order listed.
for k in (1, 3, 5, 10, 15):
    knn_baseline = KNeighborsClassifier(n_neighbors=k)
    print(cross_val_score(knn_baseline, X_train, y_train, cv=5).mean())

0.8772772309031822
0.8852229005225787
0.8799481174125008
0.8539992838465299
0.8326519660639041


In [41]:
# List the hyperparameter names KNeighborsClassifier exposes for tuning
knn_params = KNeighborsClassifier().get_params()
knn_params.keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [67]:
# Wrap KNN in a one-step pipeline so it can be fed into GridSearchCV; the step
# name 'knn' is the prefix used by the parameter grid keys below
pipe = Pipeline(steps=[('knn', KNeighborsClassifier())])

# Candidate values to search over for n_neighbors and p
pipe_params = dict(
    knn__n_neighbors=[1, 3, 5],
    knn__p=[1, 2, 3],
)

# 3-fold cross-validated search over every combination in the grid
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3).fit(X_train, y_train)
gs_model = gs.best_estimator_

# Report the best cross-validation score, then the best estimator on both splits
print(f'KNN Best Accuracy Score: {gs.best_score_}')
print(f'KNN Training Score: {gs_model.score(X_train, y_train)}')
print(f'KNN Testing Score {gs_model.score(X_test, y_test)}')

KNN Best Accuracy Score: 0.8766666666666667
KNN Training Score: 1.0
KNN Testing Score 0.91


In [68]:
# Support vector classifier; only gamma is set explicitly here
svc = SVC(gamma='scale')

In [69]:
# Fit the SVC on the training split
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [70]:
# Score the fitted SVC on the same data it was trained on
svc.score(X_train, y_train)

0.9813333333333333

In [71]:
# Predict labels for the held-out test split
y_pred = svc.predict(X_test)

In [72]:
# Accuracy of the SVC predictions against the true test labels
accuracy_score(y_test, y_pred)

0.94