In [31]:
import sys
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

import utils

default_stdout = sys.stdout
__filename__ = "SVM.model"

### Load Data

In [32]:
data = pd.read_csv(f'{utils._data_pth_}/processed/train_joined.csv', index_col=0)

### class balance adjustment

In [64]:
y, X = data['isFraud'].copy(), data.drop(columns=['isFraud']).copy()
# curtail label 0 to make classes less imbalanced
# label1_cnt = len(y[y == 1])
# labe10_indices = y[y == 0].index
# y = y.drop(labe10_indices[:1*label1_cnt])
# X = X.drop(labe10_indices[:1*label1_cnt])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33)

### Configureation

In [65]:
regression_type = "SVM"
_regression_loss = "log" if regression_type == "LR" else "hinge" # logistic: log, svm: hinge
monte_carlo_num = 2 # for RBF kernel
gradient_penalty = "l1"
learning_rate = "optimal"
rbf_gamma = "scale"

### Train

In [66]:
# RBF feature engineering
rbf = RBFSampler(gamma=rbf_gamma, random_state=1, n_components=X_train.shape[1]*monte_carlo_num)
# stochastic gradient descent to speed up training
reg = SGD(loss=_regression_loss, penalty=gradient_penalty, verbose=1, learning_rate=learning_rate)
model = make_pipeline(reg)
sys.stdout = open("./svm_train.out", 'w')
model.fit(X_train, Y_train)
sys.stdout.close()
sys.stdout = default_stdout

### Test

In [67]:
import metrics
# sc_train = model.score(X_train, Y_train)
# sc_test = model.score(X_test, Y_test)
y_pred_test = model.predict(X_train)
# probs=model.predict_proba(X_test)
# print(sc_train)
# print(sc_test)
print(metrics.conf_matrix(Y_train,y_pred_test))
# metrics.roc_pr_curve(Y_test,probs[:,1])

Test data
[[3484, 379552], [2241, 10384]]
Misclassification error =  12625
SENS(recall)  =  0.251225843668878
SPEC   =  0.9941303271668155
PPV(Precision)   =  0.6085589519650655
NPV   =  0.9733699889212589
F1-SCORE =  0.3556372173735518
None
