# Importing Libraries and Dataset

In [11]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn import metrics as skmetrics
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit

In [3]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [12]:
X_smote = pd.read_csv('data/X_smote.csv')
y_smote = pd.read_csv('data/y_smote.csv')
X_val = pd.read_csv('data/X_val.csv')
y_val = pd.read_csv('data/y_val.csv')
X_test = pd.read_csv('data/X_test.csv')
y_test = pd.read_csv('data/y_test.csv')

In [13]:
stdsc = StandardScaler()

X_smote = stdsc.fit_transform(X_smote)
X_val = stdsc.transform(X_val)
X_test = stdsc.transform(X_test)

# Evaluation Metrics Function

In [14]:
def print_statistics(y_actual, y_pred, y_prob, dataset_type):
  print(f"====================={dataset_type}====================")
  print(f"accuracy: {round(skmetrics.accuracy_score(y_actual, y_pred),5)}")
  print(f"precision: {round(skmetrics.precision_score(y_actual, y_pred),5)}")
  print(f"recall: {round(skmetrics.recall_score(y_actual, y_pred),5)}")
  print(f"f1 score: {round(skmetrics.f1_score(y_actual, y_pred),5)}")
  prec, recall, thresholds = skmetrics.precision_recall_curve(y_actual, y_prob)
  print(f"pr auc score: {round(skmetrics.auc(recall, prec),5)}")

# SVM Model

In [21]:
svc = SVC(kernel = 'rbf')
model_train = svc.fit(X_smote,y_smote.values.ravel())

In [22]:
# Evaluating Train and Test
train_pred_proba = model_train.predict(X_smote)
train_pred = np.round(train_pred_proba)
val_pred_proba = model_train.predict(X_val)
val_pred = np.round(val_pred_proba)
test_pred_proba = model_train.predict(X_test)
test_pred = np.round(test_pred_proba)

In [23]:
print_statistics(y_smote, train_pred, train_pred_proba, 'train')
print_statistics(y_val, val_pred, val_pred_proba, 'val')
print_statistics(y_test, test_pred, test_pred_proba, 'test')

accuracy: 0.96611
precision: 0.98425
recall: 0.94738
f1 score: 0.96547
pr auc score: 0.97897
accuracy: 0.93613
precision: 0.93169
recall: 0.82391
f1 score: 0.87449
pr auc score: 0.90157
accuracy: 0.92746
precision: 0.93435
recall: 0.78663
f1 score: 0.85415
pr auc score: 0.8893


# Random Search for hyperparameter tuning

In [21]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

In [22]:
scoring = ['accuracy', 'recall']

split_index = [-1]*len(X_smote) + [0]*len(X_val)
X = np.concatenate((X_smote, X_val), axis=0)
y = np.concatenate((y_smote, y_val), axis=0)
pds = PredefinedSplit(test_fold = split_index)

random_search = RandomizedSearchCV(estimator=SVC(), 
                           param_distributions=param_grid, 
                           n_iter=10,
                           scoring=scoring, 
                           refit='recall', 
                           n_jobs=-1, 
                           cv=pds, 
                           verbose=0)

random_result = random_search.fit(X, y.ravel())
random_result

In [24]:
# Print the best accuracy score for the training dataset
print(f'The best accuracy score for the training dataset is {random_result.best_score_:.4f}')
# Print the hyperparameters for the best score
print(f'The best hyperparameters are {random_result.best_params_}')
# Print the best accuracy score for the validation dataset
print(f'The accuracy score for the validation dataset is {random_search.score(X_val, y_val):.4f}')
# Print the best accuracy score for the testing dataset
print(f'The accuracy score for the testing dataset is {random_search.score(X_test, y_test):.4f}')

The best accuracy score for the training dataset is 0.8496
The best hyperparameters are {'kernel': 'poly', 'gamma': 0.01, 'C': 0.1}
The accuracy score for the validation dataset is 0.8882
The accuracy score for the testing dataset is 0.8059


In [25]:
# SVC with the best hyperparamters
svc = SVC(kernel = 'poly', gamma=0.01, C=0.1)
model = svc.fit(X_smote,y_smote.values.ravel())

In [26]:
# Evaluating Train and Test
train_pred_proba = model.predict(X_smote)
train_pred = np.round(train_pred_proba)
val_pred_proba = model.predict(X_val)
val_pred = np.round(val_pred_proba)
test_pred_proba = model.predict(X_test)
test_pred = np.round(test_pred_proba)

In [27]:
print_statistics(y_smote, train_pred, train_pred_proba, 'train')
print_statistics(y_val, val_pred, val_pred_proba, 'val')
print_statistics(y_test, test_pred, test_pred_proba, 'test')

accuracy: 0.97289
precision: 0.98619
recall: 0.95922
f1 score: 0.97251
pr auc score: 0.9829
accuracy: 0.94099
precision: 0.92577
recall: 0.84961
f1 score: 0.88606
pr auc score: 0.908
accuracy: 0.93197
precision: 0.93694
recall: 0.80206
f1 score: 0.86427
pr auc score: 0.89622
