In [3]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

from helper import load_train_dataset, load_test_dataset, drop_zero_mean, submit, display, to_black_scale

%matplotlib inline

In [4]:
train_orig, labels = load_train_dataset()

## PCA + SVM approach

In [5]:
X, Y = train_orig.values, labels.values

In [8]:
pca = PCA()
clf = SVC(kernel='rbf')
pipe = Pipeline([('pca', pca), ('clf', clf)])

In [9]:
param_grid = {
            'clf__C': [1.0],
            'clf__gamma': ['auto'],
            'pca__n_components': [0.75]
}
cv = GridSearchCV(pipe, param_grid=param_grid, verbose=3, cv=5)
cv.fit(X, Y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.111481 - 3.1min
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.111508 - 3.0min
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.111561 - 3.0min
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.111587 - 3.0min
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.111482 - 3.0min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 15.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [0.75], 'clf__gamma': ['auto'], 'clf__C': [1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [15]:
test = load_test_dataset()

In [12]:
submit(cv.best_estimator_, test, 'output6.csv').head()

Unnamed: 0,ImageId,Label
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1


## Let's see if making black and white gets me somewhere

In [6]:
X, Y = to_black_scale(train_orig).values, labels.values

In [8]:
pca = PCA()
clf = SVC(kernel='rbf')
pipe = Pipeline([('pca', pca), ('clf', clf)])

In [15]:
param_grid = {
            'clf__C': [1.0],
            'clf__gamma': ['auto'],
            'pca__n_components': [0.75]
}
cv = GridSearchCV(pipe, param_grid=param_grid, verbose=3, cv=5)
cv.fit(X, Y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.980250 -  33.5s
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.980364 -  29.8s
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.976664 -  26.1s
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.979636 -  28.1s
[CV] pca__n_components=0.75, clf__gamma=auto, clf__C=1.0 .............
[CV]  pca__n_components=0.75, clf__gamma=auto, clf__C=1.0, score=0.978919 -  28.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [0.75], 'clf__gamma': ['auto'], 'clf__C': [1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [18]:
cv.grid_scores_

[mean: 0.97917, std: 0.00135, params: {'pca__n_components': 0.75, 'clf__gamma': 'auto', 'clf__C': 1.0}]

In [17]:
submit(cv.best_estimator_, to_black_scale(test), 'output7.csv').head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [6]:
param_grid = {
            'clf__C': [1.0],
            'clf__gamma': ['auto'],
            'pca__n_components': [0.75],
            'pca__whiten': [True, False]
}
cv = GridSearchCV(pipe, param_grid=param_grid, verbose=3, cv=5)
cv.fit(X, Y)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.978346 -  44.9s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.978579 -  35.3s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.975830 -  34.9s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.977968 -  34.3s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.978323 -  34.4s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=aut

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  5.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [0.75], 'clf__C': [1.0], 'clf__gamma': ['auto'], 'pca__whiten': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [7]:
cv.grid_scores_

[mean: 0.97781, std: 0.00101, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 'auto', 'pca__whiten': True},
 mean: 0.97917, std: 0.00135, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 'auto', 'pca__whiten': False}]

In [8]:
cv.best_params_

{'clf__C': 1.0,
 'clf__gamma': 'auto',
 'pca__n_components': 0.75,
 'pca__whiten': False}

## despite low score lets still try whitten=True

In [9]:
pca = PCA(whiten=True, n_components=.75)
clf = SVC(kernel='rbf')
pipe = Pipeline([('pca', pca), ('clf', clf)])
pipe.fit(X, Y)

Pipeline(steps=[('pca', PCA(copy=True, n_components=0.75, whiten=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [12]:
submit(pipe, to_black_scale(test), 'output8.csv').head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


## Let's try some gamma variation

In [13]:
param_grid = {
            'clf__C': [1.0],
            'clf__gamma': ['auto', 0.01, 0.05, 0.001],
            'pca__n_components': [0.75],
            'pca__whiten': [True, False]
}
cv = GridSearchCV(pipe, param_grid=param_grid, verbose=3, cv=5)
cv.fit(X, Y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.978346 -  38.0s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.978579 -  37.1s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.975830 -  36.7s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.977968 -  41.5s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=auto, pca__whiten=True, score=0.978323 -  41.7s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=aut

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed: 21.1min


[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=True, score=0.924075 -  52.9s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=True, score=0.921181 -  51.5s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=True, score=0.923425 -  51.4s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=True 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=True, score=0.928537 -  51.5s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=False 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=False, score=0.930161 -  37.6s
[CV] pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=False 
[CV]  pca__n_components=0.75, clf__C=1.0, clf__gamma=0.001, pca__whiten=False, score=0.927645 -  37.4s

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 27.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=0.75, whiten=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [0.75], 'clf__C': [1.0], 'clf__gamma': ['auto', 0.01, 0.05, 0.001], 'pca__whiten': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [14]:
cv.grid_scores_

[mean: 0.97781, std: 0.00101, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 'auto', 'pca__whiten': True},
 mean: 0.97917, std: 0.00135, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 'auto', 'pca__whiten': False},
 mean: 0.97014, std: 0.00128, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 0.01, 'pca__whiten': True},
 mean: 0.97217, std: 0.00227, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 0.01, 'pca__whiten': False},
 mean: 0.97760, std: 0.00090, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 0.05, 'pca__whiten': True},
 mean: 0.98036, std: 0.00095, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 0.05, 'pca__whiten': False},
 mean: 0.92438, std: 0.00239, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 0.001, 'pca__whiten': True},
 mean: 0.92995, std: 0.00166, params: {'pca__n_components': 0.75, 'clf__C': 1.0, 'clf__gamma': 0.001, 'pca__whiten': False}]

In [15]:
cv.best_params_

{'clf__C': 1.0,
 'clf__gamma': 0.05,
 'pca__n_components': 0.75,
 'pca__whiten': False}

In [16]:
submit(cv.best_estimator_, to_black_scale(test), 'output9.csv').head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


## More paramtere tuning attempt

In [11]:
param_grid = {
            'clf__C': np.arange(1, 3, .05),
            'clf__gamma': np.arange(0, .05, .001),
            'pca__n_components': [0.75, .8, .85],
            'pca__whiten': [False]
}
cv = RandomizedSearchCV(pipe, param_distributions=param_grid, verbose=3, cv=5)
cv.fit(X, Y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045 
[CV]  clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045, score=0.981678 - 1.4min
[CV] clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045 
[CV]  clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045, score=0.980959 - 1.5min
[CV] clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045 
[CV]  clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045, score=0.978450 - 1.3min
[CV] clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045 
[CV]  clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045, score=0.979874 - 1.4min
[CV] clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045 
[CV]  clf__C=1.0, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.045, score=0.981301 - 1.3min
[CV] clf__C=1.75, pca__n_components=0.85, pc

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed: 29.5min


[CV]  clf__C=2.4, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.033, score=0.982387 -  52.0s
[CV] clf__C=2.4, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.033 
[CV]  clf__C=2.4, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.033, score=0.980474 -  51.6s
[CV] clf__C=2.4, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.033 
[CV]  clf__C=2.4, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.033, score=0.982017 -  51.4s
[CV] clf__C=2.4, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.033 
[CV]  clf__C=2.4, pca__n_components=0.8, pca__whiten=False, clf__gamma=0.033, score=0.981658 -  52.3s
[CV] clf__C=2.65, pca__n_components=0.85, pca__whiten=False, clf__gamma=0.047 
[CV]  clf__C=2.65, pca__n_components=0.85, pca__whiten=False, clf__gamma=0.047, score=0.980726 - 2.2min
[CV] clf__C=2.65, pca__n_components=0.85, pca__whiten=False, clf__gamma=0.047 
[CV]  clf__C=2.65, pca__n_components=0.85, pca__whiten=False, clf__gamma=0.047, score=0.980364 - 2.

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 59.4min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'pca__whiten': [False], 'clf__C': array([ 1.  ,  1.05,  1.1 ,  1.15,  1.2 ,  1.25,  1.3 ,  1.35,  1.4 ,
        1.45,  1.5 ,  1.55,  1.6 ,  1.65,  1.7 ,  1.75,  1.8 ,  1.85,
        1.9 ,  1.95,  2.  ,  2.05,  2.1 ,  2.15,  2.2 ,  2.25,  2.3 ,
        2.35,  2.4 ,  2.45,  2.5 , ...039,
        0.04 ,  0.041,  0.042,  0.043,  0.044,  0.045,  0.046,  0.047,
        0.048,  0.049])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=3)

In [12]:
cv.best_params_

{'clf__C': 2.4000000000000012,
 'clf__gamma': 0.033000000000000002,
 'pca__n_components': 0.8,
 'pca__whiten': False}

In [13]:
cv.grid_scores_

[mean: 0.98045, std: 0.00117, params: {'clf__C': 1.0, 'pca__n_components': 0.8, 'pca__whiten': False, 'clf__gamma': 0.044999999999999998},
 mean: 0.98043, std: 0.00062, params: {'clf__C': 1.7500000000000007, 'pca__n_components': 0.85, 'pca__whiten': False, 'clf__gamma': 0.043999999999999997},
 mean: 0.97590, std: 0.00111, params: {'clf__C': 1.0, 'pca__n_components': 0.8, 'pca__whiten': False, 'clf__gamma': 0.013000000000000001},
 mean: 0.93862, std: 0.00113, params: {'clf__C': 2.7000000000000015, 'pca__n_components': 0.75, 'pca__whiten': False, 'clf__gamma': 0.001},
 mean: 0.98179, std: 0.00092, params: {'clf__C': 2.9000000000000017, 'pca__n_components': 0.75, 'pca__whiten': False, 'clf__gamma': 0.0},
 mean: 0.98062, std: 0.00068, params: {'clf__C': 2.7500000000000018, 'pca__n_components': 0.85, 'pca__whiten': False, 'clf__gamma': 0.017000000000000001},
 mean: 0.98207, std: 0.00108, params: {'clf__C': 2.4000000000000012, 'pca__n_components': 0.8, 'pca__whiten': False, 'clf__gamma': 0.0

In [16]:
submit(cv.best_estimator_, to_black_scale(test), 'output10.csv').head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
