# Importing Libraries and Dataset

In [1]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn import metrics as skmetrics
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit

In [2]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [3]:
X_smote = pd.read_csv('data/X_smote.csv')
y_smote = pd.read_csv('data/y_smote.csv')
X_val = pd.read_csv('data/X_val.csv')
y_val = pd.read_csv('data/y_val.csv')
X_test = pd.read_csv('data/X_test.csv')
y_test = pd.read_csv('data/y_test.csv')

In [4]:
stdsc = StandardScaler()

X_smote = stdsc.fit_transform(X_smote)
X_val = stdsc.transform(X_val)
X_test = stdsc.transform(X_test)

# Evaluation Metrics Function

In [5]:
def print_statistics(y_actual, y_pred, y_prob, dataset_type):
  print(f"====================={dataset_type}====================")
  print(f"accuracy: {round(skmetrics.accuracy_score(y_actual, y_pred),5)}")
  print(f"precision (macro): {round(skmetrics.precision_score(y_actual, y_pred, average='macro'),5)}")
  print(f"recall (macro): {round(skmetrics.recall_score(y_actual, y_pred, average='macro'),5)}")
  print(f"f1 score (macro): {round(skmetrics.f1_score(y_actual, y_pred, average='macro'),5)}")
  print(f"f1 score of class 1: {round(skmetrics.f1_score(y_actual, y_pred, pos_label=1),5)}")
  print(f"f1 score of class 0: {round(skmetrics.f1_score(y_actual, y_pred, pos_label=0),5)}")
  prec, recall, thresholds = skmetrics.precision_recall_curve(y_actual, y_prob)
  print(f"pr auc score of class 1: {round(skmetrics.auc(recall, prec),5)}")
  prec_0, recall_0, thresholds = skmetrics.precision_recall_curve(y_actual, y_prob, pos_label=0)
  print(f"pr auc score of class 0: {round(skmetrics.auc(recall_0, prec_0),5)}")

# SVM Model

In [8]:
svc = SVC(kernel = 'rbf')
model_train = svc.fit(X_smote,y_smote.values.ravel())

In [9]:
# Evaluating Train and Test
train_pred_proba = model_train.predict(X_smote)
train_pred = np.round(train_pred_proba)
val_pred_proba = model_train.predict(X_val)
val_pred = np.round(val_pred_proba)
test_pred_proba = model_train.predict(X_test)
test_pred = np.round(test_pred_proba)

In [10]:
print_statistics(y_smote, train_pred, train_pred_proba, 'train')
print_statistics(y_val, val_pred, val_pred_proba, 'val')
print_statistics(y_test, test_pred, test_pred_proba, 'test')

accuracy: 0.9609
precision (macro): 0.96162
recall (macro): 0.9609
f1 score (macro): 0.96088
f1 score of class 1: 0.96011
f1 score of class 0: 0.96166
pr auc score of class 1: 0.97523
pr auc score of class 0: 0.26487
accuracy: 0.94868
precision (macro): 0.94829
recall (macro): 0.91997
f1 score (macro): 0.93286
f1 score of class 1: 0.90027
f1 score of class 0: 0.96545
pr auc score of class 1: 0.92176
pr auc score of class 0: 0.39356
accuracy: 0.94417
precision (macro): 0.94633
recall (macro): 0.91041
f1 score (macro): 0.92634
f1 score of class 1: 0.8901
f1 score of class 0: 0.96258
pr auc score of class 1: 0.91572
pr auc score of class 0: 0.39191


# Random Search for hyperparameter tuning

In [11]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

In [18]:
scoring = ['f1_macro']

split_index = [-1]*len(X_smote) + [0]*len(X_val)
X = np.concatenate((X_smote, X_val), axis=0)
y = np.concatenate((y_smote, y_val), axis=0)
pds = PredefinedSplit(test_fold = split_index)

random_search = RandomizedSearchCV(estimator=SVC(), 
                           param_distributions=param_grid, 
                           n_iter=20,
                           scoring=scoring, 
                           refit='f1_macro', 
                           n_jobs=-1, 
                           cv=pds, 
                           verbose=0)

random_result = random_search.fit(X, y.ravel())

In [20]:
# Print the best accuracy score for the training dataset
print(f'The best accuracy score for the training dataset is {random_result.best_score_:.4f}')
# Print the hyperparameters for the best score
print(f'The best hyperparameters are {random_result.best_params_}')
# Print the best accuracy score for the validation dataset
print(f'The accuracy score for the validation dataset is {random_search.score(X_val, y_val):.4f}')
# Print the best accuracy score for the testing dataset
print(f'The accuracy score for the testing dataset is {random_search.score(X_test, y_test):.4f}')

The best accuracy score for the training dataset is 0.9487
The best hyperparameters are {'kernel': 'sigmoid', 'gamma': 0.001, 'C': 0.1}
The accuracy score for the validation dataset is 0.9487
The accuracy score for the testing dataset is 0.9456


In [24]:
# Print the best accuracy score for the training dataset
print(f'The best accuracy score for the training dataset is {random_result.best_score_:.4f}')
# Print the hyperparameters for the best score
print(f'The best hyperparameters are {random_result.best_params_}')
# Print the best accuracy score for the validation dataset
print(f'The accuracy score for the validation dataset is {random_search.score(X_val, y_val):.4f}')
# Print the best accuracy score for the testing dataset
print(f'The accuracy score for the testing dataset is {random_search.score(X_test, y_test):.4f}')

The best accuracy score for the training dataset is 0.8496
The best hyperparameters are {'kernel': 'poly', 'gamma': 0.01, 'C': 0.1}
The accuracy score for the validation dataset is 0.8882
The accuracy score for the testing dataset is 0.8059


In [21]:
# SVC with the best hyperparamters
svc = SVC(kernel = 'sigmoid', gamma=0.001, C=1)
model = svc.fit(X_smote,y_smote.values.ravel())

In [22]:
# Evaluating Train and Test
train_pred_proba = model.predict(X_smote)
train_pred = np.round(train_pred_proba)
val_pred_proba = model.predict(X_val)
val_pred = np.round(val_pred_proba)
test_pred_proba = model.predict(X_test)
test_pred = np.round(test_pred_proba)

In [23]:
print_statistics(y_smote, train_pred, train_pred_proba, 'train')
print_statistics(y_val, val_pred, val_pred_proba, 'val')
print_statistics(y_test, test_pred, test_pred_proba, 'test')

accuracy: 0.95968
precision (macro): 0.96046
recall (macro): 0.95968
f1 score (macro): 0.95966
f1 score of class 1: 0.95883
f1 score of class 0: 0.9605
pr auc score of class 1: 0.97449
pr auc score of class 0: 0.26521
accuracy: 0.94868
precision (macro): 0.94829
recall (macro): 0.91997
f1 score (macro): 0.93286
f1 score of class 1: 0.90027
f1 score of class 0: 0.96545
pr auc score of class 1: 0.92176
pr auc score of class 0: 0.39356
accuracy: 0.94556
precision (macro): 0.94733
recall (macro): 0.91298
f1 score (macro): 0.9283
f1 score of class 1: 0.89312
f1 score of class 0: 0.96348
pr auc score of class 1: 0.91774
pr auc score of class 0: 0.39176


In [17]:
print_statistics(y_smote, train_pred, train_pred_proba, 'train')
print_statistics(y_val, val_pred, val_pred_proba, 'val')
print_statistics(y_test, test_pred, test_pred_proba, 'test')

accuracy: 0.50048
precision (macro): 0.60538
recall (macro): 0.50048
f1 score (macro): 0.33497
f1 score of class 1: 0.66673
f1 score of class 0: 0.0032
pr auc score of class 1: 0.74996
pr auc score of class 0: 0.74948
accuracy: 0.27219
precision (macro): 0.63534
recall (macro): 0.50143
f1 score (macro): 0.21586
f1 score of class 1: 0.42603
f1 score of class 0: 0.00568
pr auc score of class 1: 0.63534
pr auc score of class 0: 0.86428
accuracy: 0.26942
precision (macro): 0.30131
recall (macro): 0.49791
f1 score (macro): 0.21287
f1 score of class 1: 0.42384
f1 score of class 0: 0.00189
pr auc score of class 1: 0.63277
pr auc score of class 0: 0.86523


In [27]:
print_statistics(y_smote, train_pred, train_pred_proba, 'train')
print_statistics(y_val, val_pred, val_pred_proba, 'val')
print_statistics(y_test, test_pred, test_pred_proba, 'test')

accuracy: 0.97289
precision: 0.98619
recall: 0.95922
f1 score: 0.97251
pr auc score: 0.9829
accuracy: 0.94099
precision: 0.92577
recall: 0.84961
f1 score: 0.88606
pr auc score: 0.908
accuracy: 0.93197
precision: 0.93694
recall: 0.80206
f1 score: 0.86427
pr auc score: 0.89622
