# SVM Model Tuning

In this notebook, we will take the folds generated by the OU class in the previous notebook  and try to find the best set of parameters for our SVM to perform binary classification. 

In [10]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV

import pickle

import OU

import copy

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
save_dir = './data/'

In [12]:
info = np.load(save_dir + "/info.npy", allow_pickle=True)

First we need to recompile the fold dictionary into one large dataframe so that sci-kit learn's GridSearchCV class can search for the optimal parameters efficiently. 

In [15]:
splits = []
multi_cv_df = pd.DataFrame()
multi_cv_labels = pd.Series()

for i in range(len(info)):
    train = info[i]['train']['df_scale'].copy()
    train_labels = copy.copy(info[i]['train']['labels'])
    
    test = info[i]['test']['df_scale'].copy()
    test_labels = copy.copy(info[i]['test']['labels'])

    train_len = train.shape[0]
    test_len = test.shape[0]
    
    # Append rows to dataframe
    multi_cv_df = multi_cv_df.append(train, ignore_index=True)
    multi_cv_labels = multi_cv_labels.append(train_labels, ignore_index=True)
    
    # Append labels to a dataframe
    multi_cv_df = multi_cv_df.append(test, ignore_index=True)
    multi_cv_labels = multi_cv_labels.append(test_labels, ignore_index=True)
    
    # Append the indices of the folds to a list
    splits.append((multi_cv_df.iloc[-train_len-test_len:-test_len].index, multi_cv_df.iloc[-test_len:].index))
    
    # Quality Assurance
    assert(np.array_equal(multi_cv_df.loc[splits[i][0]].values, train.values))
    assert(np.array_equal(multi_cv_labels.loc[splits[i][0]].values, train_labels.values))
    assert(np.array_equal(multi_cv_df.loc[splits[i][1]], test.values))
    assert(np.array_equal(multi_cv_labels.loc[splits[i][1]], test_labels))
    
splits = np.array(splits)

np.save(save_dir + 'splits.npy', splits)

  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
# Save off data
multi_cv_df.to_csv(save_dir + 'df.csv')
multi_cv_labels.to_csv(save_dir + 'labels.csv')

# Gridsearch

We want to find the optimal hyperparameters for our SVM by exploring all combinations of possible hyperparameter


In [17]:
params = [{ 'kernel': ['rbf'],
            'C': [0.1,1,10,100], 
            'gamma': [1, 0.1, 0.001, 0.0001], 
            'cache_size': [2000], 
            'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4}, 
                             {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}]
          }, 
          { 'kernel': ['poly'], 
            'C': [0.1, 1,10,100,], 
            'gamma': [1, 0.1, 0.001, 0.0001],
            'degree': [3, 5],
            'cache_size': [2000],
            'class_weight': [{0: 0.5, 1: 0.5}, 
                             {0: 0.6, 1: 0.4}, {0: 0.7, 1: 0.3}]
          }]

In [18]:
# Use all cores (n_jobs-1)
gridcv = GridSearchCV(svm.SVC(), params, verbose=1, cv=list(splits), n_jobs=-1, 
                    scoring=['precision'], refit=False)

gridcv.fit(multi_cv_df, multi_cv_labels)

Fitting 399 folds for each of 160 candidates, totalling 63840 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [None]:
# save the object to a file
with open(save_dir+'gridsearch_results.pkl', 'wb') as f:
    pickle.dump(gridcv, f)