# Importing Libraries and Dataset

In [24]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn import metrics as skmetrics
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit

In [25]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [26]:
X_smote = pd.read_csv('data/X_smote.csv')
y_smote = pd.read_csv('data/y_smote.csv')
X_val = pd.read_csv('data/X_val.csv')
y_val = pd.read_csv('data/y_val.csv')
X_test = pd.read_csv('data/X_test.csv')
y_test = pd.read_csv('data/y_test.csv')

In [27]:
stdsc = StandardScaler()

X_smote = stdsc.fit_transform(X_smote)
X_val = stdsc.transform(X_val)
X_test = stdsc.transform(X_test)

# Evaluation Metrics Function

In [28]:
def print_statistics(y_actual, y_pred, y_prob, dataset_type):
  print(f"====================={dataset_type}====================")
  print(f"accuracy: {round(skmetrics.accuracy_score(y_actual, y_pred),5)}")
  print(f"precision (macro): {round(skmetrics.precision_score(y_actual, y_pred, average='macro'),5)}")
  print(f"recall (macro): {round(skmetrics.recall_score(y_actual, y_pred, average='macro'),5)}")
  print(f"f1 score (macro): {round(skmetrics.f1_score(y_actual, y_pred, average='macro'),5)}")
  print(f"f1 score of class 1: {round(skmetrics.f1_score(y_actual, y_pred, pos_label=1),5)}")
  print(f"f1 score of class 0: {round(skmetrics.f1_score(y_actual, y_pred, pos_label=0),5)}")
  prec, recall, thresholds = skmetrics.precision_recall_curve(y_actual, y_prob)
  print(f"pr auc score of class 1: {round(skmetrics.auc(recall, prec),5)}")
  prec_0, recall_0, thresholds = skmetrics.precision_recall_curve(y_actual, y_prob, pos_label=0)
  print(f"pr auc score of class 0: {round(skmetrics.auc(recall_0, prec_0),5)}")

# SVM Model

In [29]:
svc = SVC(kernel = 'rbf')
model_train = svc.fit(X_smote,y_smote.values.ravel())

In [30]:
# Evaluating Train and Test
train_pred_proba = model_train.predict(X_smote)
train_pred = np.round(train_pred_proba)
val_pred_proba = model_train.predict(X_val)
val_pred = np.round(val_pred_proba)
test_pred_proba = model_train.predict(X_test)
test_pred = np.round(test_pred_proba)

In [31]:
print_statistics(y_smote, train_pred, train_pred_proba, 'train')
print_statistics(y_val, val_pred, val_pred_proba, 'val')
print_statistics(y_test, test_pred, test_pred_proba, 'test')

accuracy: 0.94721
precision (macro): 0.95002
recall (macro): 0.92584
f1 score (macro): 0.9367
f1 score of class 1: 0.9109
f1 score of class 0: 0.9625
pr auc score of class 1: 0.93322
pr auc score of class 0: 0.36937
accuracy: 0.94105
precision (macro): 0.93921
recall (macro): 0.90908
f1 score (macro): 0.92268
f1 score of class 1: 0.88498
f1 score of class 0: 0.96037
pr auc score of class 1: 0.90925
pr auc score of class 0: 0.40002
accuracy: 0.94036
precision (macro): 0.93427
recall (macro): 0.91224
f1 score (macro): 0.92245
f1 score of class 1: 0.88518
f1 score of class 0: 0.95972
pr auc score of class 1: 0.90671
pr auc score of class 0: 0.40748


# Random Search for hyperparameter tuning

In [32]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

In [33]:
scoring = ['f1_macro']

split_index = [-1]*len(X_smote) + [0]*len(X_val)
X = np.concatenate((X_smote, X_val), axis=0)
y = np.concatenate((y_smote, y_val), axis=0)
pds = PredefinedSplit(test_fold = split_index)

random_search = RandomizedSearchCV(estimator=SVC(), 
                           param_distributions=param_grid, 
                           n_iter=50,
                           scoring=scoring, 
                           refit='f1_macro', 
                           n_jobs=-1, 
                           cv=pds, 
                           verbose=0)

random_result = random_search.fit(X, y.ravel())

In [34]:
# Print the best accuracy score for the training dataset
print(f'The best accuracy score for the training dataset is {random_result.best_score_:.4f}')
# Print the hyperparameters for the best score
print(f'The best hyperparameters are {random_result.best_params_}')
# Print the best accuracy score for the validation dataset
print(f'The accuracy score for the validation dataset is {random_search.score(X_val, y_val):.4f}')
# Print the best accuracy score for the testing dataset

The best accuracy score for the training dataset is 0.9227
The best hyperparameters are {'kernel': 'rbf', 'gamma': 0.001, 'C': 1}
The accuracy score for the validation dataset is 0.9227


In [35]:
# SVC with the best hyperparamters
svc = SVC(kernel = 'rbf', gamma=0.001, C=1)
model = svc.fit(X_smote,y_smote.values.ravel())

In [36]:
# Evaluating Train and Test
train_pred_proba = model.predict(X_smote)
train_pred = np.round(train_pred_proba)
val_pred_proba = model.predict(X_val)
val_pred = np.round(val_pred_proba)
test_pred_proba = model.predict(X_test)
test_pred = np.round(test_pred_proba)

In [23]:
print_statistics(y_smote, train_pred, train_pred_proba, 'train')
print_statistics(y_val, val_pred, val_pred_proba, 'val')
print_statistics(y_test, test_pred, test_pred_proba, 'test')

accuracy: 0.95968
precision (macro): 0.96046
recall (macro): 0.95968
f1 score (macro): 0.95966
f1 score of class 1: 0.95883
f1 score of class 0: 0.9605
pr auc score of class 1: 0.97449
pr auc score of class 0: 0.26521
accuracy: 0.94868
precision (macro): 0.94829
recall (macro): 0.91997
f1 score (macro): 0.93286
f1 score of class 1: 0.90027
f1 score of class 0: 0.96545
pr auc score of class 1: 0.92176
pr auc score of class 0: 0.39356
accuracy: 0.94556
precision (macro): 0.94733
recall (macro): 0.91298
f1 score (macro): 0.9283
f1 score of class 1: 0.89312
f1 score of class 0: 0.96348
pr auc score of class 1: 0.91774
pr auc score of class 0: 0.39176
