# Big Data for Health (CSE6250) 

Goal: Using Support Vector Machine model to predict sepsis onset using MIMIC III Data

Author: Caleb Sabatini

## Model Imports

In [5]:
from Python.model_data import model_data
from Python.utils import classification_metrics, plot_roc, plot_prc

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import make_scorer, roc_auc_score
from scipy import stats
from imblearn.over_sampling import RandomOverSampler

np.random.seed(0)

## Support Vector Machine - Without Oversampling

In [9]:
df_train, df_test = model_data(model_type="SVM", test_size=.1)

x_train, y_train = df_train.drop(columns=['label', 'icustay_id']), df_train['label']
x_test, y_test = df_test.drop(columns=['label', 'icustay_id']), df_test['label']

x_train = x_train.fillna(x_train.median())
x_test = x_test.fillna(x_test.median())

sc = MinMaxScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

model = SVC(kernel='rbf', probability=True, random_state=0)
auc = make_scorer(roc_auc_score)
param_grid = {"C": np.arange(2, 10, 2), "gamma": np.arange(0.1, 1, 0.2)}
#grid_search = GridSearchCV(model, param_grid, n_jobs=5, cv=5, scoring=auc)
#grid_search.fit(x_train, y_train) 
#best_params = grid_search.best_params_
best_params = {'C':10, 'gamma':0.1}

model = SVC(kernel='rbf', probability=True, random_state=0, C=best_params['C'], gamma=best_params['gamma'])
model.fit(x_train, y_train)
y_pred = model.predict_proba(x_test)[:,1]

classification_metrics("SVM", y_test, y_pred)
plot_roc(figname="output/SVM_roc.png", SVM="output/SVM_roc.csv")
plot_prc(figname="output/SVM_prc.png", SVM="output/SVM_prc.csv")

Train/Test data size: 4885/543
______________________________________________
Classifier: SVM
Accuracy:  0.8785
AUC:  0.7801
AUPRC:  0.2989
Precision:  0.0000
Recall:  0.0000
F1-score:  0.0000
______________________________________________



<Figure size 432x288 with 0 Axes>

## Support Vector Machine - With Oversampling

In [10]:
ros = RandomOverSampler(random_state=0)
x_train, y_train = ros.fit_resample(x_train, y_train)

model = SVC(kernel='rbf', probability=True, random_state=0)
auc = make_scorer(roc_auc_score)
param_grid = {"C": np.arange(2, 10, 2), "gamma": np.arange(0.1, 1, 0.2)}
#grid_search = GridSearchCV(model, param_grid, n_jobs=5, cv=5, scoring=auc)
#grid_search.fit(x_train, y_train) 
#best_params = grid_search.best_params_
best_params = {'C':10, 'gamma':0.1}

model = SVC(kernel='rbf', probability=True, random_state=0, C=best_params['C'], gamma=best_params['gamma'])
model.fit(x_train, y_train)
y_pred = model.predict_proba(x_test)[:,1]

classification_metrics("SVM", y_test, y_pred)
plot_roc(figname="output/SVM_roc.png", SVM="output/SVM_roc.csv")
plot_prc(figname="output/SVM_prc.png", SVM="output/SVM_prc.csv")

______________________________________________
Classifier: SVM
Accuracy:  0.7864
AUC:  0.7717
AUPRC:  0.3448
Precision:  0.2846
Recall:  0.5556
F1-score:  0.3763
______________________________________________



<Figure size 432x288 with 0 Axes>