In [29]:
# packages to import
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import minimize
from sklearn import neighbors
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble

%matplotlib inline

In [2]:
# load the training CSV into a pandas DataFrame
df_train = pd.read_csv(filepath_or_buffer='data/train.csv')

In [3]:
# split (training) data into training and testing sets
def split_data(df, fraction=0.9):
    """Split a labelled pixel table into training and testing arrays.

    Column 0 of ``df`` is read as the label and columns 1..784 as the
    pixel values.  Pixels are divided by 255.0 and a trailing column of
    ones is appended so linear models get an offset term.  Rows are split
    in their existing order (no shuffling): the first
    ``int(fraction * n_rows)`` rows are the training set, the rest the
    testing set.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the label in the first column followed by pixel columns.
    fraction : float, default 0.9
        Share of rows assigned to the training set.

    Returns
    -------
    pixels_train, pixels_test, labels_train, labels_test : numpy.ndarray
        Feature matrices (pixel columns + 1 offset column) and label vectors.
    """
    # .values replaces DataFrame/Series.as_matrix(), which was removed in
    # pandas 1.0 and raises AttributeError on current versions
    pixels = df.iloc[:, 1:785].values
    labels = df.iloc[:, 0].values

    # normalize data
    pixels = pixels / 255.0

    # add column that is just ones for offset
    pixels = np.c_[pixels, np.ones(len(pixels))]

    # split into training and testing at a single, shared row index
    n = pixels.shape[0]
    split_idx = int(fraction * n)
    pixels_train, pixels_test = pixels[:split_idx], pixels[split_idx:]
    labels_train, labels_test = labels[:split_idx], labels[split_idx:]

    return pixels_train, pixels_test, labels_train, labels_test

In [9]:
# keep the first 80% of the rows for training and the remainder for evaluation
(X_train, X_test,
 y_train, y_test) = split_data(df=df_train, fraction=0.8)

## KNN
The sklearn implementation produces a similar error rate to my own code, but runs much faster.

In [24]:
# distance-weighted 5-nearest-neighbour classifier, fitted on the training split
neigh = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance').fit(X_train, y_train)

# predicted labels for the held-out split
a = neigh.predict(X_test)

# percentage of held-out samples that were misclassified
err = 100 * ((a != y_test).sum() / y_test.shape[0])
print('The percent error using KNN is ', err)

The percent error using KNN is  2.94047619048


## Logistic regression
Again, the sklearn version gives a similar error to my own code, but is much, much faster.

In [11]:
# fit a logistic-regression classifier with default settings on the training
# split, then predict labels for the held-out split
lm = linear_model.LogisticRegression().fit(X_train, y_train)
pred = lm.predict(X_test)

In [14]:
# fraction (not percent) of held-out samples that were misclassified
(pred != y_test).sum() / y_test.shape[0]

0.082380952380952374

## Support Vector Machine

In [18]:
# support vector classifier with default hyper-parameters
classifier = svm.SVC()
# fit stays the last expression so the fitted estimator is echoed as cell output
classifier.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
# predicted labels for the held-out split from the fitted SVC
pred = classifier.predict(X=X_test)

In [20]:
# fraction (not percent) of held-out samples the SVC misclassified
(pred != y_test).sum() / y_test.shape[0]

0.062857142857142861

In [28]:
# refit the KNN model on every labelled row (fraction=1.0 leaves the
# held-out arrays empty, so X_null / y_null are unused placeholders)
X_train_full, X_null, y_train_full, y_null = split_data(df_train, fraction=1.0)
neigh.fit(X_train_full, y_train_full)

# load the test set and apply the SAME preprocessing split_data applies to the
# training pixels: divide by 255.0, then append the constant offset column.
# Without the division the model compares raw test pixels against scaled
# training pixels and the neighbour distances are meaningless.
# (.values replaces DataFrame.as_matrix(), which was removed in pandas 1.0)
df_test = pd.read_csv('data/test.csv')
pixels = df_test.values / 255.0
pixels = np.c_[pixels, np.ones(len(pixels))]
predictions = neigh.predict(pixels)

# write to csv; ImageId is 1-based and sized from the predictions rather than
# a hard-coded row count
df_pred = pd.DataFrame()
df_pred['ImageId'] = range(1, len(predictions) + 1)
df_pred['Label'] = predictions
df_pred.to_csv('sklearn_knn.csv', index=False)

## Random forest
Based on the scripts shared on Kaggle, a random forest seems to do well on this problem.

In [31]:
# build model and fit; a fixed random_state makes the forest (and therefore
# the reported error) reproducible when the notebook is re-run
rf = ensemble.RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# make predictions on the held-out split
pred = rf.predict(X_test)

# compute error as a percentage of misclassified held-out samples
err = (np.sum(pred != y_test) / len(y_test)) * 100
print('The percent error using random forest is ', err)

The percent error using random forest is  5.72619047619


In [32]:
# refit the forest on every labelled row before predicting the submission
rf.fit(X_train_full, y_train_full)

# build the test features directly from df_test with the same preprocessing
# the training features received in split_data (divide by 255.0, append the
# offset column) so the model sees test data on the scale it was trained on
X_submit = np.c_[df_test.values / 255.0, np.ones(len(df_test))]
predictions = rf.predict(X_submit)

# write to csv; ImageId is 1-based and sized from the predictions rather than
# a hard-coded row count
df_pred = pd.DataFrame()
df_pred['ImageId'] = range(1, len(predictions) + 1)
df_pred['Label'] = predictions
df_pred.to_csv('sklearn_randomforest.csv', index=False)