In [138]:
%pylab inline
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import (precision_recall_fscore_support, classification_report)

from sklearn import clone
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [139]:
# load train and test data
# Both CSVs are parsed as integer arrays; skiprows=1 drops the first row of each
# file (presumably a header line). Later cells use column 0 of the training
# array as the label and the remaining columns as features, and pass the test
# array whole as a feature matrix.
_csv_options = dict(delimiter=',', dtype=int, skiprows=1)
mnist_train = np.loadtxt('../data/train.csv', **_csv_options)
mnist_test = np.loadtxt('../data/test.csv', **_csv_options)

In [141]:
# split the data into training and validation sets
# Column 0 of mnist_train is the target label; every other column is a feature.
# random_state is fixed so the split is the same after a kernel restart;
# without it each run validates on a different random subset.
X_train, X_test, y_train, y_test = train_test_split(
    mnist_train[:, 1:], mnist_train[:, 0], random_state=0)

In [209]:
# parameters
# Data dimensions:
#   m      - number of rows in the validation split
#   n      - number of feature columns
#   m_test - number of rows in the test set to predict
#   l      - number of target classes (second axis of the precision/proba buffers)
m, n = X_test.shape
m_test = len(mnist_test)
l = 10

# Hyper-parameters consumed when the estimators are constructed.
n_neighbors, n_estimators, n_jobs = 5, 500, 3
max_depth = None

In [210]:
# model list
# Every estimator listed here is later fitted twice (validation run, then full
# refit) and must provide predict_proba, since the prediction cell stores each
# model's class-probability matrix in `proba`.
models = [DecisionTreeClassifier(max_depth=max_depth),
          RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs),
          ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=n_jobs),
          AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth), n_estimators=n_estimators),
          GradientBoostingClassifier(n_estimators=n_estimators),
          GaussianNB(),
          # NOTE(review): disabled entry. SVC is not imported in the visible import
          # cell, and it only offers predict_proba when built with probability=True.
          #SVC(kernel='rbf'),
          KNeighborsClassifier(n_neighbors=n_neighbors)]

c = len(models)
# proba[j]     - class probabilities of model j for every test row, shape (m_test, l)
# precision[j] - per-class precision of model j on the validation split, length l
# np.float64 is spelled out because the bare name `float64` exists only as a
# side effect of the `%pylab` namespace import.
proba = np.empty((c, m_test, l), dtype=np.float64)
precision = np.empty((c, l), dtype=np.float64)

In [211]:
# run models
# Fit each model on the training split and record its per-class precision on
# the validation split. The precisions are used later as voting weights.
# Assumes the class labels are the integers 0..l-1, which is also what the
# argmax-based label decoding in the "sum total" cell relies on.
class_labels = np.arange(l)
for i, clf in enumerate(models):
    print(i)  # progress indicator; this form is valid on both Python 2 and 3
    clf.fit(X_train, y_train)

    predict = clf.predict(X_test)
    # Passing `labels` pins the result to exactly l entries in class order, so
    # the row assignment below cannot fail or shift columns when some class is
    # absent from both y_test and the predictions.
    prfs = precision_recall_fscore_support(y_test, predict, labels=class_labels)
    precision[i, :] = prfs[0]  # element 0 of the returned tuple is precision

0
1
2
3
4
5
6


In [212]:
# generate prediction
# Refit every model on the complete labelled set (training + validation rows)
# and store its class probabilities for the test rows in `proba`.
# The feature/label views are loop-invariant, so they are taken once up front.
features_all = mnist_train[:, 1:]
labels_all = mnist_train[:, 0]
for i, clf in enumerate(models):
    print(i)  # progress indicator; this form is valid on both Python 2 and 3
    clf.fit(features_all, labels_all)

    # predict_proba yields one row per test sample and one column per class.
    proba[i, :, :] = clf.predict_proba(mnist_test)

0
1
2
3
4
5
6


In [224]:
# sum total
# Precision-weighted soft vote over all models:
#     s[i, k] = sum_j precision[j, k] * proba[j, i, k]
# precision (c, l) is broadcast against proba (c, m_test, l) by inserting a
# length-1 axis for the test rows, then the model axis is summed away.
# Uses `np` throughout instead of the `numpy`/`float64` names that are only
# available through the `%pylab` namespace import.
s = (precision[:, np.newaxis, :] * proba).sum(axis=0)

# The predicted label of each test row is the class with the largest weighted
# score; the cast keeps the integer dtype the original preallocated array had.
predicted = s.argmax(axis=1).astype(int)

In [225]:
# write to file
# Output format: a header line followed by one "<id>,<label>" line per
# prediction, where ids count up from 1 in the order of `predicted`.
with open("predicted_ensemble.txt", 'w') as out:
    out.write('ImageId,Label\n')
    for image_id, label in enumerate(predicted, 1):
        out.write(','.join([str(image_id), str(int(label))]) + '\n')