In [138]:
%pylab inline
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import (precision_recall_fscore_support, classification_report)

from sklearn import clone
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [139]:
# load train and test data
# Both CSV files are parsed as integer matrices with identical settings; the
# first row of each file is skipped (presumably a header line).
# Later cells read column 0 of `mnist_train` as the label and the remaining
# columns as features, and pass `mnist_test` to the models as-is.
csv_format = dict(delimiter=',', dtype=int, skiprows=1)
mnist_train = np.loadtxt('../data/train.csv', **csv_format)
mnist_test = np.loadtxt('../data/test.csv', **csv_format)

In [141]:
# split the data into training and validation sets
# Column 0 of `mnist_train` holds the label, the remaining columns the features.
# The split proportions are left at the library default.
# A fixed `random_state` makes the shuffle repeatable, so the validation scores
# (and the per-class F1 weights derived from them in later cells) do not change
# every time the kernel is restarted and the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(mnist_train[:, 1:],
                                                    mnist_train[:, 0],
                                                    random_state=0)

In [226]:
# parameters
# --- sizes taken from the loaded data ---
m, n = X_test.shape        # rows and columns of the validation feature matrix
m_test = len(mnist_test)   # number of rows in `mnist_test`
l = 10                     # length of the per-class axis of the `proba` and `f1`
                           # arrays (presumably the ten digit classes)

# --- estimator settings ---
n_neighbors = 5      # passed to KNeighborsClassifier
n_estimators = 500   # passed to the forest / boosting ensembles
n_jobs = 3           # passed to the estimators that accept n_jobs
max_depth = None     # only referenced by the commented-out tree-based models

In [228]:
# model list
# Estimators taking part in the ensemble. Commented-out entries are disabled
# alternatives (note that SVC is not imported by this notebook).
models = [#DecisionTreeClassifier(max_depth=max_depth),
          RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs),
          ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=n_jobs),
          #AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth), n_estimators=n_estimators),
          GradientBoostingClassifier(n_estimators=n_estimators),
          #GaussianNB(),
          #SVC(kernel='rbf'),
          KNeighborsClassifier(n_neighbors=n_neighbors)]

c = len(models)

# Result buffers, filled by the following cells:
#   proba[j, i, k] - output of models[j].predict_proba for test row i, column k
#   f1[j, k]       - validation F-score of models[j] for class k
# `np.float64` is spelled out so the cell does not depend on the bare `float64`
# name injected by `%pylab`. The buffers are zero-initialised (instead of
# np.empty) so that a skipped or failed cell cannot leak uninitialised memory
# into the weighted vote; every entry is overwritten on a normal run.
proba = np.zeros((c, m_test, l), dtype=np.float64)
f1 = np.zeros((c, l), dtype=np.float64)

In [None]:
# run models
# Fit every model on the training split and store its per-class F-score on the
# validation split in row i of `f1`; the weighted-vote cell multiplies these
# scores with the predicted probabilities.
for i, clf in enumerate(models):
    # Progress indicator. The call form prints the same text on Python 2 and
    # is valid syntax on Python 3, unlike the `print i` statement.
    print(i)
    clf.fit(X_train, y_train)

    predict = clf.predict(X_test)
    # The result is indexed positionally: element 2 is the F-score array.
    prfs = precision_recall_fscore_support(y_test, predict)
    f1[i, :] = prfs[2]

In [None]:
# generate prediction
# Refit every model on all rows of `mnist_train` (column 0 = label, remaining
# columns = features) and store its class probabilities for `mnist_test` in
# slice i of `proba`.
for i, clf in enumerate(models):
    # Progress indicator. The call form prints the same text on Python 2 and
    # is valid syntax on Python 3, unlike the `print i` statement.
    print(i)
    clf.fit(mnist_train[:, 1:], mnist_train[:, 0])

    # NOTE(review): assumes predict_proba returns exactly `l` columns, i.e.
    # that all `l` classes occur in the training labels - confirm.
    proba[i, :, :] = clf.predict_proba(mnist_test)

In [None]:
# sum total
# Weighted soft vote: each model's class probabilities are scaled by that
# model's per-class validation F-score, the scaled scores are summed over the
# models, and the highest-scoring column index becomes the prediction.
#
# The accumulation runs over whole arrays, one model at a time, instead of one
# test row at a time; every element of `s` still receives its terms in the
# same model order as before. `np` / `np.float64` are used so the cell does
# not depend on the `numpy` and `float64` names injected by `%pylab`.
s = np.zeros((m_test, l), dtype=np.float64)

for j in range(c):
    # f1[j, :] has shape (l,) and broadcasts across the rows of proba[j].
    s += f1[j, :] * proba[j, :, :]

# argmax returns the first maximal column on ties, as the per-row call did.
predicted = np.argmax(s, axis=1).astype(int)

In [None]:
# write to file
# Emit a two-column CSV: a header line followed by one "<id>,<label>" line per
# entry of `predicted`, with ids counting up from 1.
with open("predicted_ensemble.txt", 'w') as out:
    out.write('ImageId,Label\n')
    for image_id, label in enumerate(predicted, 1):
        out.write(str(image_id) + ',' + str(int(label)) + '\n')

array([[ 0.91577928,  0.92262895,  0.84784785,  0.83544304,  0.85056391,
         0.82261411,  0.9079334 ,  0.88299817,  0.76313181,  0.82152714],
       [ 0.97799043,  0.9821883 ,  0.95585413,  0.96676163,  0.97084548,
         0.96890672,  0.97502402,  0.98018868,  0.94666667,  0.93883225],
       [ 0.97894737,  0.98559322,  0.96435453,  0.9677113 ,  0.97187197,
         0.96603397,  0.97971014,  0.98207547,  0.95081967,  0.93894542],
       [ 0.92285156,  0.91826522,  0.82761998,  0.81967213,  0.84170616,
         0.81230448,  0.88472622,  0.88592456,  0.77116935,  0.81382488],
       [ 0.97398844,  0.97884941,  0.95472062,  0.96761905,  0.95450237,
         0.96076459,  0.97368421,  0.9782197 ,  0.95790554,  0.94388224],
       [ 0.69767442,  0.72920696,  0.89867841,  0.6875    ,  0.80357143,
         0.43661972,  0.57319953,  0.89776358,  0.24696545,  0.37259259],
       [ 0.97634816,  0.95588235,  0.98308458,  0.96299722,  0.98219585,
         0.95383104,  0.98449612,  0.9637883 