In [33]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

from helper import load_train_dataset, load_test_dataset, drop_zero_mean, submit, display, to_black_scale

%matplotlib inline

In [2]:
train_orig, labels = load_train_dataset()

In [3]:
train_orig.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
labels.head()

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64

## Atempting Without any transformation

In [6]:
pca = PCA(0.8)

In [11]:
result80 = pca.fit_transform(train_orig.values)

In [12]:
result80.shape

(42000, 43)

In [14]:
X, Y = train_orig.values, labels.values

In [17]:
pipe = Pipeline([('pca', PCA()), ('clf', RandomForestClassifier())])

In [19]:
param_grid = {
            'clf__n_estimators': [100],
            'pca__n_components': [0.7, 0.75, 0.8]
}
cv = GridSearchCV(pipe, param_grid=param_grid, verbose=3, cv=5)
cv.fit(X, Y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.943843 -  27.5s
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.948471 -  25.3s
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.942850 -  25.3s
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.946529 -  25.4s
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.949500 -  24.9s
[CV] clf__n_estimators=100, pca__n_components=0.75 ...................
[CV]  clf__n_estimators=100, pca__n_components=0.75, score=0.948602 -  24.9s
[CV] clf__n_estimators=100, pca__n_components=0.75 ......

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  7.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__n_estimators': [100], 'pca__n_components': [0.7, 0.75, 0.8]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [20]:
cv.best_params_

{'clf__n_estimators': 100, 'pca__n_components': 0.75}

In [21]:
cv.grid_scores_

[mean: 0.94624, std: 0.00257, params: {'clf__n_estimators': 100, 'pca__n_components': 0.7},
 mean: 0.94924, std: 0.00256, params: {'clf__n_estimators': 100, 'pca__n_components': 0.75},
 mean: 0.94838, std: 0.00296, params: {'clf__n_estimators': 100, 'pca__n_components': 0.8}]

In [23]:
test = load_test_dataset()
test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
submit(cv.best_estimator_, test, 'output3.csv').head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,4
4,5,3


In [27]:
pca = PCA(0.75)
result75 = pca.fit_transform(train_orig.values)
result75.shape

(42000, 33)

## Attempting by removing the 0 mean columns

In [26]:
train = drop_zero_mean(train_orig)
train.head()

Unnamed: 0,pixel12,pixel13,pixel14,pixel15,pixel32,pixel33,pixel34,pixel35,pixel36,pixel37,...,pixel770,pixel771,pixel772,pixel773,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
pca = PCA(0.75)
result75 = pca.fit_transform(train.values)
result75.shape

(42000, 33)

In [29]:
X = train.values

In [31]:
pipe = Pipeline([('pca', PCA(.75)), ('clf', RandomForestClassifier(100))])
pipe.fit(X, Y)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.75, whiten=False)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [32]:
submit(pipe, test.loc[:, train.columns], 'output4.csv').head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,4
4,5,2


## Let's Convert to black scale and see what happens

In [34]:
train = to_black_scale(train_orig)
train.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
X = train.values

In [36]:
param_grid = {
            'clf__n_estimators': [100],
            'pca__n_components': [0.7, 0.75, 0.8]
}
cv = GridSearchCV(pipe, param_grid=param_grid, verbose=3, cv=5)
cv.fit(X, Y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.951695 -  26.6s
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.947519 -  26.1s
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.947137 -  26.1s
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.948553 -  27.7s
[CV] clf__n_estimators=100, pca__n_components=0.7 ....................
[CV]  clf__n_estimators=100, pca__n_components=0.7, score=0.949262 -  26.3s
[CV] clf__n_estimators=100, pca__n_components=0.75 ...................
[CV]  clf__n_estimators=100, pca__n_components=0.75, score=0.949792 -  30.2s
[CV] clf__n_estimators=100, pca__n_components=0.75 ......

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  7.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=0.75, whiten=False)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__n_estimators': [100], 'pca__n_components': [0.7, 0.75, 0.8]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [37]:
cv.best_params_

{'clf__n_estimators': 100, 'pca__n_components': 0.75}

In [38]:
cv.grid_scores_

[mean: 0.94883, std: 0.00162, params: {'clf__n_estimators': 100, 'pca__n_components': 0.7},
 mean: 0.94945, std: 0.00080, params: {'clf__n_estimators': 100, 'pca__n_components': 0.75},
 mean: 0.94748, std: 0.00100, params: {'clf__n_estimators': 100, 'pca__n_components': 0.8}]

In [41]:
to_black_scale(test).values

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [42]:
submit(cv.best_estimator_, to_black_scale(test), 'output5.csv').head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
