In [1]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import os
import json
import numpy as np

import pytz
from datetime import datetime, timedelta
# Select Altair's 'notebook' renderer for chart display.
alt.renderers.enable('notebook')
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# US/Pacific timezone object; the training loop uses it to timestamp its progress prints.
pst = pytz.timezone('US/Pacific')

In [2]:
import sklearn.ensemble as sk_ens
import sklearn.neighbors as sk_neigh
import sklearn.metrics as sk_met
import sklearn.model_selection as sk_model

In [3]:
# Load the MNIST train/test CSV files (read with header=None), then replace
# every column label with its string form so columns are addressed by
# string names such as '0'.
train_csv = os.path.expanduser('~/dev/datasets/mnist/mnist_train.csv')
test_csv = os.path.expanduser('~/dev/datasets/mnist/mnist_test.csv')

train = pd.read_csv(train_csv, header=None)
test = pd.read_csv(test_csv, header=None)

train.columns = [str(label) for label in train.columns]
test.columns = [str(label) for label in test.columns]

In [4]:
# Column '0' is used as the target vector; all remaining columns form the
# feature matrix, in their original order.
y_train = train['0']
X_train = train.drop(columns=['0'])

y_test = test['0']
X_test = test.drop(columns=['0'])

In [5]:
# Preview the first five rows of the test feature matrix.
X_test.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,775,776,777,778,779,780,781,782,783,784
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Candidate classifiers to compare. The two ensemble models are seeded with
# random_state=0; the random forest and k-NN each request n_jobs=10.
rand_for = sk_ens.RandomForestClassifier(
    random_state=0,
    n_jobs=10,
)
knn = sk_neigh.KNeighborsClassifier(
    n_jobs=10,
)
adaboost = sk_ens.AdaBoostClassifier(
    random_state=0,
    n_estimators=100,
)

In [7]:
# Evaluation order for the training loop: random forest, AdaBoost, then k-NN.
models = [
    rand_for,
    adaboost,
    knn,
]

In [10]:
def scorer(model, X, y):
    """Score a fitted model with the weighted-average F1 metric.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        The estimator whose predictions are evaluated.
    X : features passed unchanged to ``model.predict``.
    y : true labels compared against the predictions.

    Returns
    -------
    The value returned by ``sk_met.f1_score(y, predictions, average='weighted')``.
    """
    return sk_met.f1_score(y, model.predict(X), average='weighted')

In [None]:
# Benchmark every classifier in `models`: fit it on the full training set,
# report a 5-fold cross-validation score on the training data (using
# `scorer`), then build and display a per-class classification report on the
# held-out test set. Each report is kept in `class_reports`, in `models` order.
# Pacific-time timestamps are printed around each model as progress markers.
print(datetime.now(tz=pst))
class_reports = []
for model in models:
    print('Starting: ', datetime.now(tz=pst))
    print(model)

    # This fitted instance is the one that predicts on the test set below.
    model.fit(X_train, y_train)

    # Cross-validate on the training data only; the test set is not used here.
    cv_score = sk_model.cross_val_score(
        model,
        X_train,
        y_train,
        scoring=scorer,
        cv=5,
        n_jobs=10,
    )
    print('CV Score: Mean {0:.4f}, {1}'.format(cv_score.mean(), cv_score.round(4)))

    # Held-out evaluation: the report dict is turned into a DataFrame for display.
    test_preds = model.predict(X_test)
    cr = pd.DataFrame(
        sk_met.classification_report(y_test, test_preds, output_dict=True)
    )
    display(cr)
    class_reports.append(cr)
    print('Ending: ', datetime.now(tz=pst))
    

2018-10-03 08:28:51.218240-07:00
Starting:  2018-10-03 08:28:51.218513-07:00
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=10,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
CV Score: Mean 0.9426, [0.9426 0.9433 0.9373 0.9389 0.9509]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,micro avg,macro avg,weighted avg
f1-score,0.967549,0.982456,0.941402,0.934442,0.947848,0.923777,0.959205,0.949853,0.924361,0.929323,0.9468,0.946022,0.946678
precision,0.947214,0.978166,0.933333,0.923598,0.942598,0.937644,0.961216,0.960239,0.939555,0.940162,0.9468,0.946372,0.946825
recall,0.988776,0.986784,0.949612,0.945545,0.953157,0.910314,0.957203,0.939689,0.909651,0.918731,0.9468,0.945946,0.9468
support,980.0,1135.0,1032.0,1010.0,982.0,892.0,958.0,1028.0,974.0,1009.0,10000.0,10000.0,10000.0


Ending:  2018-10-03 08:28:57.041093-07:00
Starting:  2018-10-03 08:28:57.041157-07:00
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=0)
