In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

import utils

# Presumably the file name this notebook's model is stored under; it is not
# referenced by the cells visible here — TODO confirm against later cells.
__filename__ = "SVM.model"
# Keep a handle on the current stdout so a cell that temporarily redirects
# output can put it back afterwards.
default_stdout = sys.stdout

### Load Data

In [None]:
# Read the joined training table; the first CSV column becomes the index.
data = pd.read_csv(
    f'{utils._data_pth_}/processed/train_joined.csv',
    index_col=0,
)

# Separate the `isFraud` target from the remaining feature columns.
X = data.drop(columns=['isFraud'])
y = data['isFraud']

# Hold out 33% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=utils._random_seed_,
)

In [None]:
# Number of training rows with a positive vs. a negative `card1` value.
print(int((X_train.card1 > 0).sum()), int((X_train.card1 < 0).sum()))
# Number of held-out rows labelled 1 vs. labelled 0.
print(int((Y_test == 1).sum()), int((Y_test == 0).sum()))

### Configuration

In [None]:
# Model family: "LR" selects the logistic loss; any other value selects hinge (linear SVM).
regression_type = "SVM"
# SGDClassifier loss name for the chosen family. The logistic loss is spelled
# "log_loss" since scikit-learn 1.1; the former "log" alias was removed in 1.3.
_regression_loss = "log_loss" if regression_type == "LR" else "hinge"
# Multiplier applied to the input feature count to size the RBF sampler (n_components).
monte_carlo_num = 2
# Passed as SGDClassifier's `penalty` (regularization term).
gradient_penalty = "l1"
# Passed as SGDClassifier's `learning_rate` schedule.
learning_rate = "optimal"
# Passed as RBFSampler's `gamma`.
rbf_gamma = "scale"

### Train

In [None]:
# RBF feature engineering
# NOTE(review): `rbf` is constructed here but is not passed to make_pipeline
# below, so the classifier is fit on the raw features — confirm this is intended.
rbf = RBFSampler(gamma=rbf_gamma, random_state=1, n_components=X_train.shape[1]*monte_carlo_num)
# stochastic gradient descent to speed up training; seeded with the same
# project seed as the train/test split so repeated runs give the same model
reg = SGD(
    loss=_regression_loss,
    penalty=gradient_penalty,
    verbose=1,
    learning_rate=learning_rate,
    random_state=utils._random_seed_,
)
model = make_pipeline(reg)
# Send the verbose training output to a log file instead of the notebook.
# The `with` block closes the file and the `finally` restores stdout even when
# fit() raises, so a failed run cannot leave the kernel's stdout redirected.
with open("./svm_train.out", 'w') as train_log:
    sys.stdout = train_log
    try:
        model.fit(X_train, Y_train)
    finally:
        sys.stdout = default_stdout

### Test

In [None]:
import metrics

# Predict on both splits: the training matrix shows how well the model fits the
# data it saw, the held-out matrix is the actual test result for this section.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print(metrics.conf_matrix(Y_train, y_pred_train))
print(metrics.conf_matrix(Y_test, y_pred_test))

# ROC / PR curves need class probabilities. They stay disabled because
# SGDClassifier only provides predict_proba for probabilistic losses
# (e.g. "log_loss"), not for "hinge":
# probs = model.predict_proba(X_test)
# metrics.roc_pr_curve(Y_test, probs[:, 1])