# Model: Support Vector Machine

In [1]:
# Import relevant packages
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import pickle


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, recall_score, roc_auc_score, precision_score, balanced_accuracy_score
from sklearn.metrics import plot_confusion_matrix, auc, roc_curve, plot_roc_curve, plot_precision_recall_curve, precision_recall_curve, confusion_matrix

from sklearn.svm import SVC

from utils import process_data, split_train, grid_search

In [2]:
# Build the processed dataset. Per the log printed below, this reads the batched
# data, splits it into train and test, and scales the features.
data = process_data(type_='normal')

Beginning data processing ...
Reading in batched data ...


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:21<00:00,  1.13s/it]


Splitting data into train and test ...
Scaling data ...
Completed normal data processing.


In [3]:
# Pull the scaled feature sets and their labels out of the processed-data dict,
# one (features, labels) pair per split.
X_train, y_train = data['X_train_scaled'], data['y_train']
X_test, y_test = data['X_test_scaled'], data['y_test']

## SVM Model

In [4]:
# def prediction_outputs(y, y_predictions, y_predictions_prob):
#     # Confusion Matrix
#     conf_mat = confusion_matrix(y, y_predictions)
    
#     # ROC Curve
#     basic_falpos, basic_trupos, _ = roc_curve(y, [0 for _ in range(len(y))])
#     pred_falpos, pred_trupos, pred_thresholds = roc_curve(y, y_predictions_prob)
    
#     # Precision Recall
#     precision, recall, pr_thresholds = precision_recall_curve(y, y_predictions_prob)
    
#     outputs = {
#         'conf_mat': conf_mat,
#         'basic_falpos': basic_falpos,
#         'basic_trupos': basic_trupos,
#         'pred_falpos': pred_falpos,
#         'pred_trupos': pred_trupos,
#         'pred_thresholds': pred_thresholds,
#         'precision': precision,
#         'recall': recall,
#         'pr_thresholds': pr_thresholds
#     }
    
#     return outputs

In [5]:
# # Support Vector Machines
# def svm_model(x_train, y_train, x_test, y_test, standardize = True, scoring = 'balanced_accuracy', test = True, folds = 5):
#     if test:
#         hyperparameters = [
#             {'kernel': ['linear']},
#             #{'kernel': ['poly'], 'degree': [2]},
#             #{'kernel': ['rbf'], 'gamma': [0.01]}
#         ]    
#     else:
#         #C = [0.001, 0.01, 0.1, 1, 10, 100]
#         C = [0.1, 1, 10]

#         hyperparameters = [
#             {'kernel': ['linear'], 'C': C},
#             {'kernel': ['poly'], 'degree': [2, 3, 4], 'C': C},
#             {'kernel': ['rbf'], 'gamma': [0.001, 0.01, 0.1], 'C': C}
#         ]
    
#     if standardize:
#         x_train, x_test, _ = standardize_data(x_train, x_test)

#     svm_model = SVC(probability = True)
#     stratKFold = StratifiedKFold(n_splits = folds)
    
#     grid = GridSearchCV(svm_model, hyperparameters, cv = stratKFold, scoring = scoring, verbose = 10, n_jobs = -1)
#     model = grid.fit(x_train, y_train)
    
#     train_acc = accuracy_score(y_train, model.predict(x_train))
#     test_acc = accuracy_score(y_test, model.predict(x_test))

#     train_acc_bal = balanced_accuracy_score(y_train, model.predict(x_train))
#     test_acc_bal = balanced_accuracy_score(y_test, model.predict(x_test))    
    
#     y_predictions = model.predict(x_train)
#     y_predictions_prob = model.predict_proba(x_train)[:, 1]    
#     train_pred_out = prediction_outputs(y_train, y_predictions, y_predictions_prob)

#     y_predictions = model.predict(x_test)
#     y_predictions_prob = model.predict_proba(x_test)[:, 1]    
#     test_pred_out = prediction_outputs(y_test, y_predictions, y_predictions_prob)
    
#     outputs = {
#         'best_params': model.best_params_,
#         'model': model,
#         'train_acc': train_acc,
#         'test_acc': test_acc,
#         'train_acc_bal': train_acc_bal,
#         'test_acc_bal': test_acc_bal,        
#         'train_conf_mat': train_pred_out['conf_mat'],
#         'test_conf_mat': test_pred_out['conf_mat'],
#         'train_basic_roc': (train_pred_out['basic_falpos'], train_pred_out['basic_trupos']),
#         'train_log_roc': (train_pred_out['pred_falpos'], train_pred_out['pred_trupos'], train_pred_out['pred_thresholds']),
#         'train_precision': (train_pred_out['precision'], train_pred_out['recall'], train_pred_out['pr_thresholds']),
#         'test_basic_roc': (test_pred_out['basic_falpos'], test_pred_out['basic_trupos']),
#         'test_log_roc': (test_pred_out['pred_falpos'], test_pred_out['pred_trupos'], test_pred_out['pred_thresholds']),
#         'test_precision': (test_pred_out['precision'], test_pred_out['recall'], test_pred_out['pr_thresholds']),
#     }
    
#     return outputs

In [6]:
# test_mode = False
# cv_folds = 5
# scoring_metric = 'roc_auc'

In [7]:
# Shrink the training set before the slow SVM grid search.
# NOTE(review): keep_percent=0.1 presumably retains 10% of the rows — confirm
# against utils.split_train.
train_subset = split_train(X_train, y_train, keep_percent=0.1)
X_input, y_input = train_subset

Splitting train ...


In [None]:
# Hyperparameter search for the SVM, scored by ROC AUC on predicted probabilities,
# followed by persisting both the search result and the selected model.

# Fixed seed: with probability=True, SVC shuffles the data for its internal
# probability fit, so without a seed the ROC AUC scores vary between runs.
RANDOM_STATE = 42

# needs_proba=True makes the scorer feed predict_proba output to roc_auc_score.
# NOTE(review): newer scikit-learn releases replace needs_proba with
# response_method='predict_proba' — update when the environment is upgraded.
criterion = make_scorer(roc_auc_score, needs_proba=True)

# probability=True is required so the estimator exposes predict_proba for the scorer.
clf_svm = SVC(probability=True, random_state=RANDOM_STATE)

# Candidate grids, one dict per kernel so kernel-specific parameters
# (degree for poly, gamma for rbf) are only combined with their own kernel.
C = [0.1, 1, 10]
param_grid = [
    {'kernel': ['linear'], 'C': C},
    {'kernel': ['poly'], 'degree': [2, 3, 4], 'C': C},
    {'kernel': ['rbf'], 'gamma': [0.001, 0.01, 0.1], 'C': C}
]

# grid_search is a project helper from utils; the log below shows it running
# 5 folds over 21 candidates (presumably a GridSearchCV wrapper — confirm in utils).
grid_result_svm = grid_search(clf_svm, criterion, param_grid, k=5, X=X_input, y=y_input)

# Choose best parameters from hyperparameter tuning
clf_svm = grid_result_svm.best_estimator_

# Save the full search result and the best SVM model. The context managers close
# (and therefore flush) each file even if pickling raises.
with open('models/grid_svm.sav', 'wb') as grid_file:
    pickle.dump(grid_result_svm, grid_file)
with open('models/model_svm.sav', 'wb') as model_file:
    pickle.dump(clf_svm, model_file)

Fitting 5 folds for each of 21 candidates, totalling 105 fits
[CV 1/5; 1/21] START C=0.1, kernel=linear.......................................
[CV 1/5; 1/21] END ........C=0.1, kernel=linear;, score=0.993 total time= 1.1min
[CV 2/5; 1/21] START C=0.1, kernel=linear.......................................
[CV 2/5; 1/21] END ........C=0.1, kernel=linear;, score=0.981 total time= 1.1min
[CV 3/5; 1/21] START C=0.1, kernel=linear.......................................
[CV 3/5; 1/21] END ........C=0.1, kernel=linear;, score=0.987 total time= 1.1min
[CV 4/5; 1/21] START C=0.1, kernel=linear.......................................
[CV 4/5; 1/21] END ........C=0.1, kernel=linear;, score=0.991 total time= 1.1min
[CV 5/5; 1/21] START C=0.1, kernel=linear.......................................
[CV 5/5; 1/21] END ........C=0.1, kernel=linear;, score=0.973 total time=  58.3s
[CV 1/5; 2/21] START C=1, kernel=linear.........................................
[CV 1/5; 2/21] END ..........C=1, kernel=linear

[CV 1/5; 11/21] END C=10, degree=3, kernel=poly;, score=0.965 total time= 3.6min
[CV 2/5; 11/21] START C=10, degree=3, kernel=poly...............................
[CV 2/5; 11/21] END C=10, degree=3, kernel=poly;, score=0.846 total time= 9.3min
[CV 3/5; 11/21] START C=10, degree=3, kernel=poly...............................
[CV 3/5; 11/21] END C=10, degree=3, kernel=poly;, score=0.907 total time= 3.1min
[CV 4/5; 11/21] START C=10, degree=3, kernel=poly...............................
[CV 4/5; 11/21] END C=10, degree=3, kernel=poly;, score=0.920 total time= 3.3min
[CV 5/5; 11/21] START C=10, degree=3, kernel=poly...............................
[CV 5/5; 11/21] END C=10, degree=3, kernel=poly;, score=0.826 total time= 2.3min
[CV 1/5; 12/21] START C=10, degree=4, kernel=poly...............................
[CV 1/5; 12/21] END C=10, degree=4, kernel=poly;, score=0.877 total time= 5.1min
[CV 2/5; 12/21] START C=10, degree=4, kernel=poly...............................
[CV 2/5; 12/21] END C=10, de

In [None]:
# svm_output = svm_model(X_input, y_input, X_test, y_test, standardize = False, scoring = scoring_metric,
#                        test = test_mode, folds = cv_folds)

In [None]:
# svm_output

In [None]:
# pickle.dump(svm_output, open(f'svm_output.sav', 'wb'))

In [None]:
# pickle.load(open(f'svm_output.sav', 'rb'))

In [None]:
# Mean of the training labels — the positive-class share, assuming 0/1 labels
# (TODO confirm the label encoding).
train_label_total = sum(y_train)
train_label_total / len(y_train)

In [None]:
# Same label mean, restricted to the first 100000 training labels.
train_head = y_train[:100000]
sum(train_head) / len(train_head)

In [None]:
# Label mean over the first 100000 test labels, for comparison with the train split.
test_head = y_test[:100000]
sum(test_head) / len(test_head)

## TODO: Set `test_mode` to `False` once this runs successfully