# Model Analysis

#### Model Performance Accuracy (normalization + feature selection + uniform dataset)
    Linear Regression: 22.89%
    Logistic Regression: 89.93%
    SVM: 93.41%  
    NN: 96.25%
    KNN: Too many features for my device
    
#### Kaggle Results
    SVM: 91.22%
    KAGGLE: (normalization): 96.58%
            (normalization + feature selection): 95.18% 
            (original): 94.81%
            (feature selection): 93.85%
    
#### Results
    Normalization improves accuracy by ~2%
    Feature selection improves efficiency by ~10% but decreases accuracy by ~1%
    Hypothesis: The uniform training set improves accuracy by _%  
    
#### Hyperparameters


*By Uniform dataset I mean there are a uniform amount of labels

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

path = '/Users/antonmax2/Documents/dev/data/digit_recognizer/'
df_train = pd.read_csv(path+'train_data.csv')
df_val = pd.read_csv(path+'val_data.csv')
df_test = pd.read_csv(path+'test_data.csv')

In [2]:
# Converting dataframes -> numpy arrays to train models off of 
train = df_train.values
val = df_val.values
test = df_test.values

Xtr = train[:,1:]
Ytr = train[:,0]
Xval = val[:,1:]
Yval = val[:,0]
Xte = test[:,1:]
Yte = test[:,0]

In [3]:
# Normalizing data to use simple Keras models
from sklearn import preprocessing
Xtr = preprocessing.normalize(Xtr)
Xval = preprocessing.normalize(Xval)
Xte = preprocessing.normalize(Xte)

In [4]:
# Linear Regression with feature selection and normalization
from sklearn import linear_model
from sklearn.metrics import accuracy_score

regr = linear_model.LinearRegression()
regr.fit(Xtr, Ytr)
Yhat = np.floor(regr.predict(Xval))
print("Linear model accuracy: {0:.2f}%".format(accuracy_score(Yval, Yhat)*100))

  linalg.lstsq(X, y)


Linear model accuracy: 24.44%


In [5]:
# Logistic Regression with feature selection and normalization and uniform dataset
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(Xtr, Ytr)
Yhat = lr.predict(Xval)
print("Linear model accuracy: {0:.2f}%".format(accuracy_score(Yval, Yhat)*100))

Linear model accuracy: 89.93%


In [6]:
# SVM with feature selection and normalization and uniform dataset
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier

Yhat = OneVsOneClassifier(LinearSVC(random_state=0)).fit(Xtr, Ytr).predict(Xval)
print("SVM model accuracy: {0:.2f}%".format(accuracy_score(Yval, Yhat)*100))

SVM model accuracy: 93.41%


In [7]:
# NN with feature selection and normalization and uniform dataset
# Takes much longer than SVM
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier()
clf.fit(Xtr, Ytr)
Yhat = clf.predict(Xval)
print("NN model accuracy: {0:.2f}%".format(accuracy_score(Yval, Yhat)*100))

NN model accuracy: 96.17%


#### KNN takes too long on my device due to all the features present in this problem

In [15]:
# Normalization and feature selection and uniform dataset

path = '/Users/antonmax2/Documents/dev/data/digit_recognizer/'
test_model = pd.read_csv(path+'test.csv')

# Importing modified training data
path = '/Users/antonmax2/Documents/dev/data/digit_recognizer/'
train_model = pd.read_csv(path+'model_train.csv')

# Removing all features from test.csv not evaluated in our model
for i in range(784):
    col = "pixel" + str(i)
    if col not in train_model.columns:
        test_model = test_model.drop([col], axis=1)

In [60]:
# Kaggle SVM with normalization and feature selection and uniform dataset

Yhat = OneVsOneClassifier(LinearSVC(random_state=0)).fit(train_model.iloc[:,1:], train_model.iloc[:,0]).predict(test_model)

# Creating submission file
index = np.arange(1, Yhat.shape[0]+1)
submission = np.column_stack((index,Yhat))
np.savetxt('submission.txt', submission, delimiter=',', fmt='%i') # I add: "ImageId,Label" manually to first row

In [63]:
# Kaggle NN with normalization and feature selection and uniform dataset

Yhat = MLPClassifier().fit(train_model.iloc[:,1:], train_model.iloc[:,0]).predict(test_model)

index = np.arange(1, Yhat.shape[0]+1)
submission = np.column_stack((index,Yhat))
np.savetxt('submission.txt', submission, delimiter=',', fmt='%i')

In [70]:
# Kaggle NN with feature selection on original set and uniform dataset

path = '/Users/antonmax2/Documents/dev/data/digit_recognizer/'
train_model = pd.read_csv(path+'train.csv')

# Removing all features from test.csv not evaluated in our model
for i in range(784):
    col = "pixel" + str(i)
    if col not in df_train.columns:
        train_model = train_model.drop([col], axis=1)
        
Yhat = MLPClassifier().fit(train_model.iloc[:,1:], train_model.iloc[:,0]).predict(test_model)

index = np.arange(1, Yhat.shape[0]+1)
submission = np.column_stack((index,Yhat))
np.savetxt('submission.txt', submission, delimiter=',', fmt='%i')

In [73]:
# Kaggle NN on original data

path = '/Users/antonmax2/Documents/dev/data/digit_recognizer/'
train_model = pd.read_csv(path+'train.csv')
path = '/Users/antonmax2/Documents/dev/data/digit_recognizer/'
test_model = pd.read_csv(path+'test.csv')

Yhat = MLPClassifier().fit(train_model.iloc[:,1:], train_model.iloc[:,0]).predict(test_model)

index = np.arange(1, Yhat.shape[0]+1)
submission = np.column_stack((index,Yhat))
np.savetxt('submission.txt', submission, delimiter=',', fmt='%i')

In [103]:
# Kaggle NN with normalized data on original data

path = '/Users/antonmax2/Documents/dev/data/digit_recognizer/'
train_model = pd.read_csv(path+'train.csv')
path = '/Users/antonmax2/Documents/dev/data/digit_recognizer/'
test_model = pd.read_csv(path+'test.csv')

labels = train_model.label.values
train_model = preprocessing.normalize(train_model.iloc[:,1:])

Yhat = MLPClassifier().fit(train_model, labels).predict(test_model.values)

index = np.arange(1, Yhat.shape[0]+1)
submission = np.column_stack((index,Yhat))
np.savetxt('submission.txt', submission, delimiter=',', fmt='%i') # Note: Dataframes run quicker than numpy arrays on scikit

In [104]:
# Kaggle NN with normalized data on uniform data

ValueError: shapes (28000,784) and (708,100) not aligned: 784 (dim 1) != 708 (dim 0)

*Train model on raw train set, and on non uniform set, and on pixels included