In [None]:
import os
from datetime import datetime

import numpy as np

from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier

In [None]:
# Load the training images and their labels from local .npy files.
X_train = np.load('data/train-images.npy')
y_train = np.load('data/train-labels.npy')

In [None]:
# Hold out part of the training data for local evaluation. Fixing random_state
# makes the split -- and every score computed from it -- reproducible across
# kernel restarts.
# NOTE: this cell overwrites X_train/y_train, so re-run the load cell before
# re-running it; otherwise the already-reduced training set is split again.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, random_state=0
)

print("train size: {}".format(len(X_train)))
print("test size: {}".format(len(X_test)))

# baseline

In [None]:
# Baseline that ignores the inputs and draws labels at random according to the
# class frequencies seen in training. The strategy is spelled out because the
# library default has changed between scikit-learn releases, and the
# submission made from this baseline is described as "stratified".
# random_state makes the random draws repeatable.
clf = DummyClassifier(strategy='stratified', random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split.
y_pred = clf.predict(X_test)

In [None]:
# Fraction of held-out samples whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

In [None]:
def create_submission(predictions, sub_name, comment=None, team='DrJ'):
    """Write an array of image predictions to a properly-formatted
    submission file.

    The file is created at ``submissions/<timestamp>_<name>.submission``,
    where ``<timestamp>`` is the current UTC time and ``<name>`` is
    ``sub_name`` with every run of whitespace replaced by a single hyphen.
    The ``submissions`` directory is created if it does not exist yet.

    File layout, one item per line: a banner line, the optional ``comment``
    (every line of it prefixed with ``# ``), ``team``, the timestamp,
    ``sub_name`` as given, then one line per prediction.

    Parameters
    ----------
    predictions : iterable
        Predicted values, written in iteration order, one per line.
    sub_name : str
        Human-readable submission name.
    comment : str, optional
        Free-text note for the header. May span several lines.
    team : str, optional
        Team identifier written to the header.

    Returns
    -------
    bool
        Always ``True``.
    """
    out_dir = 'submissions'
    # On a fresh checkout the output directory may be missing, in which case
    # open() below would fail.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # NOTE: the timestamp contains ':' characters, which are not valid in
    # Windows file names. The format is kept as-is because it is also written
    # into the file header.
    now = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')
    submission_name = '-'.join(sub_name.split())
    path = '{}/{}_{}.submission'.format(out_dir, now, submission_name)

    with open(path, 'w') as f:
        f.write('#'*20 + ' Generated submission file\n')
        if comment is not None:
            # Prefix every line so a multi-line comment cannot spill
            # unprefixed text into the header; an empty comment still
            # produces a single "# " line.
            for comment_line in comment.splitlines() or ['']:
                f.write('# ' + comment_line + '\n')
        f.write('{}\n'.format(team))
        f.write('{}\n'.format(now))
        f.write('{}\n'.format(sub_name))
        for p in predictions:
            f.write('{}\n'.format(p))
    return True

In [None]:
# Dry run of the submission writer using the held-out predictions rather than
# predictions for the real test set.
create_submission(
    y_pred,
    'this is only a test',
    comment='this is only the test portion of the training set!',
)

----------

## use the real test set

In [None]:
# Load the real test images (no labels are loaded for them). This rebinds
# X_test, so from here on it no longer lines up with y_test from the earlier
# split.
X_test = np.load('data/test-images.npy')

print("test size: {}".format(len(X_test)))

In [None]:
# Re-create the baseline used for the real submission. The strategy is spelled
# out because the library default has changed between scikit-learn releases,
# and the submission is labelled "stratified". random_state makes the random
# draws repeatable.
clf = DummyClassifier(strategy='stratified', random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Predict a label for every real test image; the printed count should equal
# the number of test images.
predictions = clf.predict(X_test)

print("target size: {}".format(len(predictions)))

In [None]:
# Eyeball the predicted labels before writing them out.
predictions

In [None]:
# Write the baseline predictions for the real test set to a submission file.
create_submission(
    predictions,
    'Class Test Dummies',
    comment='stratified DummyClassifier',
)

# a real model

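> *Hey, I heard somewhere once that SVMs work well on the MNIST dataset.*
>
> `SGDClassifier` with its default hinge loss fits a linear SVM using stochastic gradient descent.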

In [None]:
# Reload the full training set: X_train/y_train were reduced by the earlier
# split, and X_test was rebound to the real test images.
X_train = np.load('data/train-images.npy')
y_train = np.load('data/train-labels.npy')

In [None]:
# Hold out part of the training data for local evaluation. Fixing random_state
# makes the split -- and the accuracy computed from it -- reproducible across
# kernel restarts.
# NOTE: this cell overwrites X_train/y_train, so re-run the load cell before
# re-running it; otherwise the already-reduced training set is split again.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, random_state=0
)

print("train size: {}".format(len(X_train)))
print("test size: {}".format(len(X_test)))

In [None]:
# Linear SVM trained by stochastic gradient descent: loss='hinge' is
# SGDClassifier's default and is written out to make the SVM objective
# explicit. random_state fixes the data shuffling so the resulting score is
# repeatable.
clf = SGDClassifier(loss='hinge', random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Accuracy of the SGD model on the held-out split.
accuracy_score(y_test, clf.predict(X_test))