# SVM

### Base imports

In [1]:
import os

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', 100)

import matplotlib
%matplotlib inline
from matplotlib import pylab as plt
import seaborn as sns

### ML imports

In [2]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold

In [3]:
from sklearn.svm import SVC as estimator_class

### Local imports

In [4]:
from disease_ml.data import load_data
from disease_ml.utils import make_submission

### Constants

In [5]:
# Label for this run; in this notebook it is only referenced by the
# (commented-out) submission cell.
EXPERIMENT_TITLE = 'SVM'

# True  -> tune hyper-parameters with GridSearchCV over `estimator_param_grid`.
# False -> cross-validate the fixed `estimator_params` and fit once on all data.
DO_GRID_SEARCH = True

# Scorer name passed as `scoring=` to GridSearchCV / cross_val_score.
SCORING = 'neg_log_loss'
# 3-fold stratified splitter; shuffled with a fixed seed so folds are reproducible.
CV = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# Passed as `n_jobs=` to GridSearchCV / cross_val_score.
N_JOBS = 3

# Output directory handed to `make_submission` (relative path).
SUBMISSIONS_DIR = 'submissions/'

### Loading data

In [6]:
# Options forwarded as keyword arguments to `load_data`. Their exact effect is
# defined in `disease_ml.data`, which is not shown in this notebook.
data_params = dict(
    imputer_strategy='most_frequent',
    scale=True,
    hashing_trick=True,
    binary=False,
    ngram_range=(1, 3)
)

# `load_data` returns the training features, training labels and test features.
x_train, y_train, x_test = load_data(**data_params)

In [7]:
# Sanity check: print the dimensions of every array returned by `load_data`.
for shape_template, array in (('x_train.shape={}', x_train),
                              ('y_train.shape={}', y_train),
                              ('x_test.shape={}', x_test)):
    print(shape_template.format(array.shape))

x_train.shape=(4099, 16137)
y_train.shape=(4099,)
x_test.shape=(1366, 16137)


### Training

In [8]:
# Instantiate with no arguments so the cell output shows the estimator's
# default hyper-parameters.
estimator_class()

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
# Fixed hyper-parameters, unpacked as `estimator_class(**estimator_params)`
# when DO_GRID_SEARCH is False. Values here must be plain scalars, not lists of
# candidates: a list such as `[0]` is not a valid `random_state` seed.
estimator_params = {
    'random_state': 0,
    'probability': True,  # needed for predict_proba and the 'neg_log_loss' scorer

    'C': 1000,
    'kernel': 'rbf'
}

# Search space for GridSearchCV: every value is a list of candidate settings.
estimator_param_grid = {
    'random_state': [0],
    'probability': [True],

    'C': [1000, 10000],
    'kernel': ['rbf']
}

In [10]:
# Train the model. Whichever branch runs, `score`, `estimator` and
# `estimator_params` are bound afterwards for the cells below.
if not DO_GRID_SEARCH:
    # Evaluate the fixed hyper-parameters with cross-validation.
    cv_scores = cross_val_score(
        estimator_class(**estimator_params), x_train, y_train,
        scoring=SCORING, cv=CV, n_jobs=N_JOBS, verbose=2)
    score = np.mean(cv_scores)

    print('Score: {:.5} | Std: {:.5}'.format(score, np.std(cv_scores)))
    print(cv_scores)

    # cross_val_score only returns the fold scores, so fit a fresh model on
    # the complete training set.
    estimator = estimator_class(**estimator_params)
    estimator.fit(x_train, y_train)
else:
    # refit=True retrains the best candidate on the whole training set and
    # exposes it as `best_estimator_`.
    gs = GridSearchCV(
        estimator_class(), estimator_param_grid,
        scoring=SCORING, cv=CV, refit=True, n_jobs=N_JOBS, verbose=2)
    gs.fit(x_train, y_train)

    score = gs.best_score_
    estimator = gs.best_estimator_
    estimator_params = gs.best_params_

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] C=1000, kernel=rbf, probability=True, random_state=0 ............
[CV] C=1000, kernel=rbf, probability=True, random_state=0 ............
[CV] C=1000, kernel=rbf, probability=True, random_state=0 ............


TypeError: catching classes that do not inherit from BaseException is not allowed

### Printing results of training

In [None]:
# Report every grid-search candidate, best first. `gs` only exists when the
# grid-search branch of the training cell ran, hence the guard.
if DO_GRID_SEARCH:
    print('Grid search results:')
    # argsort is ascending; reverse it so the highest mean test score comes first.
    cv_results_indexes_sorted = np.argsort(gs.cv_results_['mean_test_score'])[::-1]
    mean_test_scores = gs.cv_results_['mean_test_score'][cv_results_indexes_sorted]
    std_test_scores = gs.cv_results_['std_test_score'][cv_results_indexes_sorted]
    params = np.array(gs.cv_results_['params'])[cv_results_indexes_sorted]

    # Use a distinct loop variable so the `params` array is not rebound to the
    # last candidate's dict while it is being iterated.
    for test_score, std, candidate_params in zip(mean_test_scores, std_test_scores, params):
        print('- score={:.5}, std={:.5} | params={}'.format(test_score, std, candidate_params))

### Submitting results

In [None]:
# NOTE: the submission step is currently disabled; every line below is
# commented out. As written it would gather the run metadata into `params`,
# take column 1 of `estimator.predict_proba(x_test)` (presumably the
# positive-class probability; confirm) and hand everything to `make_submission`.
# params = {
#     'experiment': {
#         'title': EXPERIMENT_TITLE,
#         'do_grid_search': DO_GRID_SEARCH,
#         'scoring': SCORING,
#         'k_folds': str(CV),
#         'score': score,
#         'n_jobs': N_JOBS
#     },
    
#     'data': data_params,
#     'estimator': estimator.get_params()
# }

# predictions = estimator.predict_proba(x_test)[:, 1]
# make_submission(predictions, SUBMISSIONS_DIR, EXPERIMENT_TITLE,
#                 estimator, params,
#                 score, add_to_blending=True)