In [1]:
import os
import re
import random

import tensorflow as tf
import tensorflow.python.platform
from tensorflow.python.platform import gfile
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn import model_selection
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC, LinearSVC
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import scipy.linalg
import sklearn.preprocessing
import sklearn.linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

(1) Compare the performance of SVMs for different types of kernels, tuning the kernel parameters using cross-validation. You may train the SVMs for multi-class classification in any fashion you wish (one-vs-one, one-vs-rest, multiclass). You may also use scikit-learn's built-in functions to perform cross-validation over all parameters.

In [2]:
X_train = pickle.load(open('features_train_all','rb'))
y_train = pickle.load(open('labels_train_all','rb'))

In [3]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [4]:
X_train_sub = X_train[:500]
y_train_sub = y_train[:500]

#### (a) rbf kernel in one-vs-rest fashion

In [5]:
rbf_kernel = SVC(kernel='rbf', max_iter=1000,decision_function_shape='ovr')
parameters = {'C':[10**i for i in range(-4, 5)], 'gamma': [0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]}
clf_rbf = GridSearchCV(rbf_kernel, parameters)

In [6]:
clf_rbf.fit(X_train_sub, y_train_sub)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'gamma': [0, 0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [7]:
clf_rbf.best_params_

{'C': 1000, 'gamma': 0.0001}

In [8]:
rbf_kernel_opt = SVC(C=1000, gamma=0.0001, kernel='rbf', max_iter=1000,decision_function_shape='ovr')
rbf_kernel_opt.fit(X_train1, y_train1)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
y_rbf = rbf_kernel_opt.predict(X_test1)

In [10]:
accuracy_rbf_opt = accuracy_score(y_test1, y_rbf)
misclassification_error_rbf = 1 - accuracy_rbf_opt
print("The multi-class misclassification error obtained using rbf kernel SVM in one-vs-rest is ", + misclassification_error_rbf)

The multi-class misclassification error obtained using rbf kernel SVM in one-vs-rest is  0.327546296296


#### (b) sigmoid kernel in one-vs-rest fashion

In [11]:
sigmoid_kernel = SVC(kernel='sigmoid', max_iter=1000,decision_function_shape='ovr')
parameters = {'C':[10**i for i in range(-4, 5)], 'coef0': [0, 1e-1, 1e-2, 1e-3, 1e-4]}
clf_sigmoid = GridSearchCV(sigmoid_kernel, parameters)

In [12]:
clf_sigmoid.fit(X_train_sub, y_train_sub)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'coef0': [0, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [13]:
clf_sigmoid.best_params_

{'C': 100, 'coef0': 0}

In [14]:
sigmoid_kernel_opt = SVC(C=100, coef0=0, kernel='sigmoid', max_iter=1000,decision_function_shape='ovr')
sigmoid_kernel_opt.fit(X_train1, y_train1)

SVC(C=100, cache_size=200, class_weight=None, coef0=0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
y_sigmoid = sigmoid_kernel_opt.predict(X_test1)

In [16]:
accuracy_sigmoid_opt = accuracy_score(y_test1, y_sigmoid)
misclassification_error_sigmoid = 1 - accuracy_sigmoid_opt
print("The multi-class misclassification error obtained using sigmoid kernel SVM in one-vs-rest is ", + misclassification_error_sigmoid)

The multi-class misclassification error obtained using sigmoid kernel SVM in one-vs-rest is  0.324074074074


#### (c) polynomial kernel in one-vs-rest fashion

In [17]:
poly_kernel = SVC(kernel='poly', max_iter=1000,decision_function_shape='ovr')
parameters = {'C':[10**i for i in range(-4, 5)],'degree': [1, 2, 3, 4, 5], 'coef0': [0, 1e-1, 1e-2, 1e-3, 1e-4]}
clf_poly = GridSearchCV(poly_kernel, parameters)

In [18]:
clf_poly.fit(X_train_sub, y_train_sub)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'degree': [1, 2, 3, 4, 5], 'coef0': [0, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [19]:
clf_poly.best_params_

{'C': 100, 'coef0': 0, 'degree': 1}

In [20]:
poly_kernel_opt = SVC(C=100, coef0=0, degree=1, kernel='poly', max_iter=1000,decision_function_shape='ovr')
poly_kernel_opt.fit(X_train1, y_train1)

SVC(C=100, cache_size=200, class_weight=None, coef0=0,
  decision_function_shape='ovr', degree=1, gamma='auto', kernel='poly',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
y_poly = poly_kernel_opt.predict(X_test1)

In [22]:
accuracy_poly_opt = accuracy_score(y_test1, y_poly)
misclassification_error_poly = 1 - accuracy_poly_opt
print("The multi-class misclassification error obtained using polynomial kernel SVM in one-vs-rest is ", + misclassification_error_poly)

The multi-class misclassification error obtained using polynomial kernel SVM in one-vs-rest is  0.324074074074


Answer: I used three different kernel SVM, polynomial, sigmoid and rbf. The misclassification errors of polynomial kernel and sigmoid kernel are the same, 0.324. The misclassification error of rbf kernel is higher than other two kernel methods, which is 0.327. Thus, polynomial and sigmoid perform best.

(2) Experiment with several ensembles of classifiers, using any technique you wish. You may consider bagging (ensembles of classifiers learnt from random subsamples of examples), ensemble of classiers learnt from random subsets of features, ensembles of classifiers with different kernels, etc.

#### (a) Bagging Classifier with polynomial kernel

In [24]:
bagging = BaggingClassifier(base_estimator=SVC(C=100, coef0=0, degree=1, kernel='poly', max_iter=1000,decision_function_shape='ovr'))
parameters = {'n_estimators': (1, 2, 3, 4, 5, 6, 7, 8, 9,10)}
clf_bagging = GridSearchCV(bagging, parameters)

In [25]:
clf_bagging.fit(X_train_sub, y_train_sub)

GridSearchCV(cv=None, error_score='raise',
       estimator=BaggingClassifier(base_estimator=SVC(C=100, cache_size=200, class_weight=None, coef0=0,
  decision_function_shape='ovr', degree=1, gamma='auto', kernel='poly',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [26]:
clf_bagging.best_params_

{'n_estimators': 9}

In [27]:
bagging_opt = BaggingClassifier(n_estimators=9, base_estimator=SVC(C=100, coef0=0, degree=1, kernel='poly', max_iter=1000,decision_function_shape='ovr'))
bagging_opt.fit(X_train1, y_train1)
y_bagging = bagging_opt.predict(X_test1)

In [28]:
accuracy_bagging = accuracy_score(y_test1, y_bagging)
misclassification_error_bagging = 1 - accuracy_bagging
print("The multi-class misclassification error obtained using bagging with polynomial kernel in one-vs-rest is ", + misclassification_error_bagging)

The multi-class misclassification error obtained using bagging with polynomial kernel in one-vs-rest is  0.336805555556


#### (b) Bagging Classifier with signoid kernel

In [29]:
bagging_s = BaggingClassifier(base_estimator=SVC(C=100, coef0=0, degree=1, kernel='sigmoid', max_iter=1000,decision_function_shape='ovr'))
parameters = {'n_estimators': (1, 2, 3, 4, 5, 6, 7, 8, 9,10)}
clf_bagging_s = GridSearchCV(bagging_s, parameters)

In [30]:
clf_bagging_s.fit(X_train_sub, y_train_sub)

GridSearchCV(cv=None, error_score='raise',
       estimator=BaggingClassifier(base_estimator=SVC(C=100, cache_size=200, class_weight=None, coef0=0,
  decision_function_shape='ovr', degree=1, gamma='auto', kernel='sigmoid',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [31]:
clf_bagging_s.best_params_

{'n_estimators': 4}

In [32]:
bagging_opt_s = BaggingClassifier(n_estimators=8, base_estimator=SVC(C=100, coef0=0, degree=1, kernel='sigmoid', max_iter=1000,decision_function_shape='ovr'))
bagging_opt_s.fit(X_train1, y_train1)
y_bagging_s = bagging_opt_s.predict(X_test1)

In [33]:
accuracy_bagging_s = accuracy_score(y_test1, y_bagging_s)
misclassification_error_bagging_s = 1 - accuracy_bagging_s
print("The multi-class misclassification error obtained using bagging with sigmoid kernel in one-vs-rest is ", + misclassification_error_bagging_s)

The multi-class misclassification error obtained using bagging with sigmoid kernel in one-vs-rest is  0.33912037037
