**Tasks**

**Nonlinear classifiers**

Try with nonlinear classifiers, can you do better than the baseline models from above?

- Try with a random Forest, does increasing the number of trees help?
- Try with SVMs - does the RBF kernel perform better than the linear one?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import tensorflow as tf

from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC

%matplotlib inline

In [2]:
# get the class label limit
class_limit = 6

# class names
class_names = ["car", "bike", "other", "van", "motorcycle", "truck"]

In [3]:
# Load the numpy .npy file
train_dataset_array = np.load("train_dataset_array.npy")
train_dataset_array_labels = np.load("train_dataset_array_labels.npy")
train_dataset_array_features = np.load("train_features.npy")

print('Train data loaded')

Train data loaded


In [4]:
# Load the numpy .npy file
test_dataset_array = np.load("test_dataset_array.npy")
test_dataset_array_labels = np.load("test_dataset_array_labels.npy")
test_dataset_array_features = np.load("test_features.npy")

print('Test data loaded')

Test data loaded


In [5]:
# Load the numpy .npy file
valid_dataset_array = np.load("valid_dataset_array.npy")
valid_dataset_array_labels = np.load("valid_dataset_array_labels.npy")
valid_dataset_array_features = np.load("valid_features.npy")

print('Validation data loaded')

Validation data loaded


In [6]:
# Create X/y arrays
X_tr = train_dataset_array_features
y_tr = train_dataset_array_labels

print('X:', X_tr.shape, X_tr.dtype)
print('y:', y_tr.shape, y_tr.dtype)

X: (280, 1024) float32
y: (280,) int64


In [7]:
# Create X/y arrays
X_te = test_dataset_array_features
y_te = test_dataset_array_labels

print('X:', X_te.shape, X_te.dtype)
print('y:', y_te.shape, y_te.dtype)

X: (50, 1024) float32
y: (50,) int64


In [8]:
# Create X/y arrays
X_val = valid_dataset_array_features
y_val = valid_dataset_array_labels

print('X_val:', X_val.shape, X_val.dtype)
print('y_val:', y_val.shape, y_val.dtype)

X_val: (139, 1024) float32
y_val: (139,) int64


In [9]:
pd.value_counts(y_tr, normalize=True)

1    0.235714
0    0.228571
4    0.182143
5    0.150000
2    0.114286
3    0.089286
dtype: float64

In [10]:
def label_img(number):
    # conversion 
    if number==0 : return 'car'
    #                             
    if number==1 : return 'bike'
    #                             
    if number==2 : return 'other'
    #                             
    if number==3 : return 'van'
    #                             
    if number==4 : return 'motorcycle'
    #                             
    if number==5 : return 'truck'

In [11]:
# We don't use PCA since random forrests are not that prone to overfitting
pipe = Pipeline([('pca', None),
                 ('forest', RandomForestClassifier(n_jobs=-1))])

In [12]:
# We try different numbers of estimators 
estimators = [1000, 1500, 2000]

grid = ParameterGrid({'forest__n_estimators': estimators,
                     })
print ('Possible combinations: {} '.format(len(grid)))

Possible combinations: 3 


In [13]:
val_forrest_scores = []

for params_dict in grid:
    pipe.set_params(**params_dict)
    pipe.fit(X_tr, y_tr)
    params_dict['accuracy'] = pipe.score(X_val, y_val)
    val_forrest_scores.append(params_dict)

In [14]:
# Top validation-scores
forest_df = pd.DataFrame(val_forrest_scores)
forest_df.sort_values(by='accuracy', ascending=False).head()

Unnamed: 0,forest__n_estimators,accuracy
0,1000,0.834532
1,1500,0.827338
2,2000,0.820144


In [15]:
best_forrest = forest_df['accuracy'].idxmax()
print ('Best accuracy in the train set: {} with {} estimators'
       .format(forest_df.loc[best_forrest, 'accuracy'], forest_df.loc[best_forrest, 'forest__n_estimators']))

Best accuracy in the train set: 0.8345323741007195 with 1000 estimators


In [16]:
# Tuned Forrest
forest = RandomForestClassifier(n_estimators = forest_df.loc[best_forrest,'forest__n_estimators'], n_jobs=-1)
pipe = Pipeline([('pca', None),
                 ('forest', forest)])

pipe.fit(X_tr, y_tr)
acc_forest_val = pipe.score(X_val, y_val)
print ('Random forest accuracy on validation set: {:.3f}'.format(acc_forest_val))
acc_forest_te = pipe.score(X_te, y_te)
print ('Random forest accuracy on test set: {:.3f}'.format(acc_forest_te))

Random forest accuracy on validation set: 0.820
Random forest accuracy on test set: 0.880


In [17]:
# We use PCA to speed up processing and prevent overfitting. We set it to 150 retaining 90+% of the variance
pca = PCA(n_components=150)
pipe = Pipeline([('pca', pca),
                 ('linearsvc', LinearSVC())])

In [18]:
grid_cv = GridSearchCV(pipe, {'linearsvc__C':[0.0001, 0.001,0.01,0.1,1,]}, 
                       cv=5,
                       n_jobs=-1
                      )

grid_cv.fit(X_tr, y_tr)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=150, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('linearsvc',
                                        LinearSVC(C=1.0, class_weight=None,
                                                  dual=True, fit_intercept=True,
                                                  intercept_scaling=1,
                                                  loss='squared_hinge',
                                                  max_iter=1000,
                                                  multi_class='ovr',
                                                  penalty='l2',
              

In [19]:
df_lin_svp = pd.DataFrame(grid_cv.cv_results_)
df_lin_svp.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_linearsvc__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.09235,0.006627,0.002146,0.000169,0.0001,{'linearsvc__C': 0.0001},0.864407,0.842105,0.781818,0.854545,0.777778,0.825,0.036805,5
1,0.071265,0.004613,0.001838,0.000312,0.001,{'linearsvc__C': 0.001},0.898305,0.877193,0.872727,0.909091,0.888889,0.889286,0.013336,1
2,0.098249,0.007711,0.001699,0.000146,0.01,{'linearsvc__C': 0.01},0.881356,0.894737,0.854545,0.890909,0.888889,0.882143,0.014349,3
3,0.144099,0.007527,0.001699,0.000299,0.1,{'linearsvc__C': 0.1},0.864407,0.877193,0.872727,0.909091,0.87037,0.878571,0.015662,4
4,0.183563,0.031701,0.001394,0.000367,1.0,{'linearsvc__C': 1},0.898305,0.877193,0.872727,0.909091,0.87037,0.885714,0.015299,2


In [20]:
best = df_lin_svp['mean_test_score'].idxmax()

print('Best mean test accuracy was {:.3f} with a C value of {}'
      .format(df_lin_svp.loc[best, 'mean_test_score'], 
              df_lin_svp.loc[best, 'param_linearsvc__C']))

Best mean test accuracy was 0.889 with a C value of 0.001


In [21]:
# Seeing the metrics
print("Accuracy on training set: {:.3f}".format(grid_cv.score(X_tr, y_tr)))
print("Accuracy on test set: {:.3f}".format(grid_cv.score(X_te, y_te)))
print("Accuracy on validation set: {:.3f}".format(grid_cv.score(X_val, y_val)))

Accuracy on training set: 0.975
Accuracy on test set: 0.920
Accuracy on validation set: 0.863


In [22]:
svc_rbf = SVC(kernel='rbf', random_state=0)

pipe = Pipeline([('pca', pca),
                 ('svc_rbf', svc_rbf)])

In [23]:
grid_cv_rbf = GridSearchCV(pipe, {'svc_rbf__C':[0.01, 0.1, 1, 10], 
                                  'svc_rbf__gamma':[0.0001, 0.001, 0.01, 0.1, 1]}, 
                           cv=5,
                           n_jobs=-1)

grid_cv_rbf.fit(X_tr, y_tr)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=150, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc_rbf',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False, random_state=0,
                                            shrinking=True, tol=0.001

In [24]:
df_rbf_svp = pd.DataFrame(grid_cv_rbf.cv_results_)
df_rbf_svp.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc_rbf__C,param_svc_rbf__gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.081561,0.004201,0.005939,0.001944,0.01,0.0001,"{'svc_rbf__C': 0.01, 'svc_rbf__gamma': 0.0001}",0.237288,0.22807,0.236364,0.236364,0.240741,0.235714,0.004178,8
1,0.083021,0.003099,0.004508,0.000307,0.01,0.001,"{'svc_rbf__C': 0.01, 'svc_rbf__gamma': 0.001}",0.237288,0.22807,0.236364,0.236364,0.240741,0.235714,0.004178,8
2,0.08129,0.003831,0.004555,0.000458,0.01,0.01,"{'svc_rbf__C': 0.01, 'svc_rbf__gamma': 0.01}",0.237288,0.22807,0.236364,0.236364,0.240741,0.235714,0.004178,8
3,0.082771,0.005957,0.005428,0.001609,0.01,0.1,"{'svc_rbf__C': 0.01, 'svc_rbf__gamma': 0.1}",0.237288,0.22807,0.236364,0.236364,0.240741,0.235714,0.004178,8
4,0.083881,0.012555,0.005118,0.000788,0.01,1.0,"{'svc_rbf__C': 0.01, 'svc_rbf__gamma': 1}",0.237288,0.22807,0.236364,0.236364,0.240741,0.235714,0.004178,8


In [25]:
best = df_rbf_svp['mean_test_score'].idxmax()

print('Best accuracy {:.3f} with a C value of {} and a gamma of {}'
      .format(df_rbf_svp.loc[best, 'mean_test_score'], 
              df_rbf_svp.loc[best, 'param_svc_rbf__C'], 
              df_rbf_svp.loc[best, 'param_svc_rbf__gamma'] ))

Best accuracy 0.900 with a C value of 10 and a gamma of 0.001


In [26]:
# Seeing the metrics
print("Accuracy on training set: {:.3f}".format(grid_cv_rbf.score(X_tr, y_tr)))
print("Accuracy on test set: {:.3f}".format(grid_cv_rbf.score(X_te, y_te)))
print("Accuracy on validation set: {:.3f}".format(grid_cv_rbf.score(X_val, y_val)))

Accuracy on training set: 1.000
Accuracy on test set: 0.920
Accuracy on validation set: 0.899


In [27]:
# Getting a baseline-accuracy based on the most frequent category
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_tr, y_tr)
accuracy_tr = dummy.score(X_tr, y_tr)
accuracy_te = dummy.score(X_te, y_te)

print('Baseline (most frequent) accuracy on training set {:.3f}'.format(accuracy_tr))
print('Baseline (most frequent) accuracy on test set {:.3f}'.format(accuracy_te))

Baseline (most frequent) accuracy on training set 0.236
Baseline (most frequent) accuracy on test set 0.240
