# Model: Support Vector Machine

In [1]:
# Import relevant packages
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer, accuracy_score, recall_score, roc_auc_score, precision_score
from sklearn.metrics import balanced_accuracy_score
# NOTE(review): plot_confusion_matrix, plot_roc_curve and plot_precision_recall_curve
# were removed in scikit-learn 1.2, so the next line only imports on older releases.
from sklearn.metrics import plot_confusion_matrix, auc, roc_curve, plot_roc_curve, plot_precision_recall_curve

from sklearn.svm import SVC

from utils import process_data

In [2]:
# Read the batched data, split it into train/test and scale it (see the log output below).
data = process_data(type_='normal')

Beginning data processing ...
Reading in batched data ...


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [01:16<00:00,  4.01s/it]


Splitting data into train and test ...
Scaling data ...
Completed normal data processing.


In [3]:
# Unpack the scaled feature sets and their labels from the processed-data dict.
X_train, y_train = data['X_train_scaled'], data['y_train']
X_test, y_test = data['X_test_scaled'], data['y_test']

## Useful Functions

In [4]:
def plot_roc(clf, X, y, title):
    """Draw the ROC curve of ``clf`` on ``(X, y)`` and show its AUC in the legend.

    Parameters
    ----------
    clf : estimator exposing ``predict_proba``; column 1 of its output is
        used as the score.
    X : features passed to ``clf.predict_proba``.
    y : true labels passed to ``roc_curve``.
    title : text used as the figure title.
    """
    scores = clf.predict_proba(X)[:, 1]
    false_pos, true_pos, _ = roc_curve(y, scores)
    area = auc(false_pos, true_pos)

    # Figure number 1 is reused on every call, exactly as before.
    ax = plt.figure(1).gca()
    ax.plot([0, 1], [0, 1], '--')  # dashed reference diagonal
    ax.plot(false_pos, true_pos, label='area = {:.3f}'.format(area))
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.set_title(title)
    ax.legend(loc='best')
    plt.show()

In [5]:
def plot_impurities(clf):
    """Bar-plot the ten largest entries of ``clf.feature_importances_``.

    ``clf`` must expose a ``feature_importances_`` attribute. The bars are
    labelled with each value's position in that array.
    """
    # Keep only the 10 largest importance values.
    top_ten = pd.Series(clf.feature_importances_).nlargest(10)

    fig, ax = plt.subplots()
    top_ten.plot.bar(ax=ax)
    ax.set(
        title="Feature importances using MDI (mean decrease in impurity)",
        ylabel="Mean decrease in impurity",
    )
    fig.tight_layout()

In [6]:
def plot_permutations(clf, X=None, y=None):
    """Bar-plot the ten largest mean permutation importances of ``clf``.

    Parameters
    ----------
    clf : fitted estimator handed to ``permutation_importance``.
    X, y : optional evaluation features and labels. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, which matches the
        behaviour of the original one-argument call.

    Notes
    -----
    Importances are computed with 10 repeats and ``random_state=42``. The
    bars are labelled with each feature's position in the importance array.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    result = permutation_importance(
        clf, X, y, n_repeats=10, random_state=42, n_jobs=2
    )
    top_ten = pd.Series(result.importances_mean).nlargest(10)

    fig, ax = plt.subplots()
    top_ten.plot.bar(ax=ax)
    ax.set_title("Feature importances using permutation importances")
    ax.set_ylabel("Mean decrease in accuracy")
    fig.tight_layout()

## SVM Model

In [7]:
# Support Vector Machines
def svm_model(x_train, y_train, x_test, y_test, standardize=True, scoring='balanced_accuracy',
              test=True, folds=5, random_state=None):
    """Grid-search an ``SVC`` with stratified k-fold CV and collect train/test metrics.

    Parameters
    ----------
    x_train, y_train : features and labels the grid search is fitted on.
    x_test, y_test : features and labels used only for evaluation.
    standardize : bool, default True
        When true, both feature sets are passed through ``standardize_data``
        before fitting.
    scoring : str, default 'balanced_accuracy'
        Forwarded to ``GridSearchCV(scoring=...)``.
    test : bool, default True
        True searches a single linear-kernel candidate (quick smoke run).
        False searches linear, polynomial (degree 2-4) and RBF kernels with
        ``C`` in ``[0.1, 1, 10]``.
    folds : int, default 5
        Number of ``StratifiedKFold`` splits.
    random_state : int or None, default None
        Forwarded to ``SVC``. ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    dict
        ``best_params`` and ``model`` (the fitted ``GridSearchCV``), plain and
        balanced accuracy for both splits, and the confusion matrices and
        ROC / precision-recall tuples taken from ``prediction_outputs``.

    NOTE(review): ``standardize_data`` and ``prediction_outputs`` are neither
    defined nor imported in the notebook cells shown here. They must be in
    scope before this function is called, otherwise it raises ``NameError``
    (for ``prediction_outputs`` only after the full grid search has run).
    """
    if test:
        hyperparameters = [{'kernel': ['linear']}]
    else:
        C = [0.1, 1, 10]
        hyperparameters = [
            {'kernel': ['linear'], 'C': C},
            {'kernel': ['poly'], 'degree': [2, 3, 4], 'C': C},
            {'kernel': ['rbf'], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1], 'C': C},
        ]

    if standardize:
        x_train, x_test, _ = standardize_data(x_train, x_test)

    # Named `estimator` so the local no longer shadows this function's own name.
    estimator = SVC(probability=True, random_state=random_state)
    splitter = StratifiedKFold(n_splits=folds)

    grid = GridSearchCV(estimator, hyperparameters, cv=splitter, scoring=scoring, verbose=10, n_jobs=-1)
    model = grid.fit(x_train, y_train)

    # Predict once per split and reuse the results for every metric below.
    train_pred = model.predict(x_train)
    train_prob = model.predict_proba(x_train)[:, 1]
    test_pred = model.predict(x_test)
    test_prob = model.predict_proba(x_test)[:, 1]

    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)

    train_acc_bal = balanced_accuracy_score(y_train, train_pred)
    test_acc_bal = balanced_accuracy_score(y_test, test_pred)

    train_pred_out = prediction_outputs(y_train, train_pred, train_prob)
    test_pred_out = prediction_outputs(y_test, test_pred, test_prob)

    outputs = {
        'best_params': model.best_params_,
        'model': model,
        'train_acc': train_acc,
        'test_acc': test_acc,
        'train_acc_bal': train_acc_bal,
        'test_acc_bal': test_acc_bal,
        'train_conf_mat': train_pred_out['conf_mat'],
        'test_conf_mat': test_pred_out['conf_mat'],
        'train_basic_roc': (train_pred_out['basic_falpos'], train_pred_out['basic_trupos']),
        'train_log_roc': (train_pred_out['pred_falpos'], train_pred_out['pred_trupos'], train_pred_out['pred_thresholds']),
        'train_precision': (train_pred_out['precision'], train_pred_out['recall'], train_pred_out['pr_thresholds']),
        'test_basic_roc': (test_pred_out['basic_falpos'], test_pred_out['basic_trupos']),
        'test_log_roc': (test_pred_out['pred_falpos'], test_pred_out['pred_trupos'], test_pred_out['pred_thresholds']),
        'test_precision': (test_pred_out['precision'], test_pred_out['recall'], test_pred_out['pr_thresholds']),
    }

    return outputs

In [8]:
# Settings passed to svm_model in the next cell.
scoring_metric = 'roc_auc'  # forwarded as the grid-search scoring metric
cv_folds = 5                # number of stratified CV folds
test_mode = False           # False -> full hyperparameter grid; True -> single linear-kernel candidate

In [None]:
# Only the first 10,000 rows of each split are used (presumably to keep the grid search
# runtime manageable). standardize=False because the features come from the *_scaled
# entries of `data`.
svm_output = svm_model(
    X_train[:10000],
    y_train[:10000],
    X_test[:10000],
    y_test[:10000],
    standardize=False,
    scoring=scoring_metric,
    test=test_mode,
    folds=cv_folds,
)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


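## NOTE: run once with `test_mode = True` (a single linear-kernel candidate) to confirm the pipeline works, then set `test_mode = False` for the full 27-candidate grid search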