In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from classifiers import *

In [2]:
# Load the train and test sets; both files are pipe-delimited ('|').
train_data = pd.read_csv('data/train.csv', sep = '|')
test_data = pd.read_csv('data/test.csv', sep = '|')
# Report row/column counts for each frame as a quick sanity check.
print(f'Train set has {train_data.shape[0]} entries and {train_data.shape[1]} features')
print(f'Test set has {test_data.shape[0]} entries and {test_data.shape[1]} features')


Train set has 1879 entries and 10 features
Test set has 498121 entries and 9 features


In [3]:
# Target labels: the 'fraud' column of the training frame.
y = train_data['fraud']
# Number of training rows; used to split the stacked frame back into train/test
# instead of hard-coding the row count.
n_train = len(train_data)
# Stack train and test so encoding and scaling see the same columns/statistics.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The original (non-unique) row index is kept on purpose; all later slicing is positional.
X_all = pd.concat([train_data, test_data], sort=False).drop(columns=['fraud']).astype(float)
# encode trustLevel (one-hot); the remaining columns are left unscaled
X_encode = pd.get_dummies(X_all, columns=['trustLevel'], prefix='trustLevel')
X_train_encode = X_encode.iloc[:n_train, :]
X_test_encode = X_encode.iloc[n_train:, :]
# Standard-scale every column (trustLevel included, as a numeric feature).
# NOTE(review): the scaler is fitted on train + test rows together.
X_norm = StandardScaler().fit_transform(X_all)
X_train_norm = X_norm[:n_train, :]
X_test_norm = X_norm[n_train:, :]

In [4]:
# One-hot encode trustLevel on its own ...
X_sim_encode = pd.get_dummies(X_all['trustLevel'], prefix='trustLevel')
# ... and standard-scale every other column, keeping column names and row index
# so the two pieces can be joined side by side.
X_rest = X_all.drop(columns="trustLevel")
X_rest_scaled = pd.DataFrame(StandardScaler().fit_transform(X_rest),
                             columns=X_rest.columns, index=X_rest.index)
X_norm_encode = pd.concat([X_sim_encode, X_rest_scaled], sort=False, axis=1)
# Split back into train/test by position; the boundary is the number of training
# rows rather than a hard-coded constant.
n_train_rows = len(train_data)
X_train_norm_enc = X_norm_encode.iloc[:n_train_rows, :]
X_test_norm_enc = X_norm_encode.iloc[n_train_rows:, :]

In [5]:
# Sanity check: print the shape of each training feature matrix
# (one-hot encoded, standard-scaled, and one-hot + scaled).
for _train_matrix in (X_train_encode, X_train_norm, X_train_norm_enc):
    print(_train_matrix.shape)

(1879, 14)
(1879, 9)
(1879, 14)


In [6]:
# Baseline classifier sweep on the standard-scaled features (trustLevel kept as a numeric column).
# NOTE(review): evaluate_classification is not defined in this notebook; presumably it comes
# from the `classifiers` star import — confirm.
evaluate_classification(X_train_norm, y)

Naive Bayes: test core = -113.0 
K Nerest Neighbors: test core = -250.5 
Linear SVM: test core = -260.0 
RBF SVM: test core = -260.0 
Decision Tree: test core = -81.5 
Neural Net: test core = -176.5 
Random Forest: test core = -260.0 
AdaBoost: test core = -31.0 
XGBoost: test core = -30.0 


In [7]:
# Same classifier sweep on the one-hot-encoded trustLevel features (other columns unscaled).
# NOTE(review): evaluate_classification is not defined in this notebook; presumably it comes
# from the `classifiers` star import — confirm.
evaluate_classification(X_train_encode, y)

Naive Bayes: test core = -158.0 
K Nerest Neighbors: test core = -271.5 
Linear SVM: test core = -260.0 
RBF SVM: test core = -260.0 
Decision Tree: test core = -80.5 
Neural Net: test core = -249.0 
Random Forest: test core = -260.0 
AdaBoost: test core = -32.5 
XGBoost: test core = -38.0 


In [8]:
# Same classifier sweep on the combined features: one-hot trustLevel plus standard-scaled remaining columns.
# NOTE(review): evaluate_classification is not defined in this notebook; presumably it comes
# from the `classifiers` star import — confirm.
evaluate_classification(X_train_norm_enc, y)

Naive Bayes: test core = -185.0 
K Nerest Neighbors: test core = -239.0 
Linear SVM: test core = -260.0 
RBF SVM: test core = -260.0 
Decision Tree: test core = -81.0 
Neural Net: test core = -163.0 
Random Forest: test core = -260.0 
AdaBoost: test core = -32.5 
XGBoost: test core = -38.0 


<h2> Define XGBoost with a cost-sensitive objective </h2>

In [9]:
#!/usr/bin/python
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def logistic_obj(y_hat, dtrain, alpha=5, beta=25):
    """Cost-weighted logistic objective for xgboost (gradient and hessian).

    The underlying loss per sample is
    ``-alpha * y * log(p) - beta * (1 - y) * log(1 - p)`` with
    ``p = sigmoid(y_hat)``: ``alpha`` weights errors on positive labels
    (false negatives) and ``beta`` weights errors on negative labels
    (false positives).

    Parameters
    ----------
    y_hat : array-like
        Raw (pre-sigmoid) scores.
    dtrain : object exposing ``get_label()``
        Supplies the labels (expected to be 0/1).
    alpha, beta : float
        Cost weights for false negatives and false positives.

    Returns
    -------
    (grad, hess) : tuple of arrays
        First and second derivative of the loss w.r.t. ``y_hat``.
    """
    labels = dtrain.get_label()
    prob = 1. / (1. + np.exp(-y_hat))
    # Per-sample weight: alpha where the label is 1, beta where it is 0.
    weight = beta + alpha*labels - beta*labels
    # Equivalent to alpha*(p-1)*y + beta*p*(1-y).
    gradient = prob * weight - alpha*labels
    hessian = prob * (1 - prob) * weight
    return gradient, hessian

def err_rate(pred, dtrain, alpha=5, beta=25):
    """Mean cost-weighted log loss, usable as an xgboost custom eval metric.

    Computes ``mean(-alpha*y*log(p) - beta*(1-y)*log(1-p))`` with
    ``p = sigmoid(pred)``. The defaults match ``logistic_obj``
    (alpha weights false negatives, beta weights false positives).

    The log-sigmoid terms are evaluated with ``np.logaddexp`` rather than
    ``log(sigmoid(.))``: once the sigmoid saturates to exactly 0 or 1 the naive
    form produces ``0 * log(0) = nan`` and poisons the whole metric.

    Parameters
    ----------
    pred : array-like
        Raw (pre-sigmoid) scores.
    dtrain : object exposing ``get_label()``
        Supplies the labels (expected to be 0/1).
    alpha, beta : float
        Cost weights for the positive-label and negative-label terms.

    Returns
    -------
    tuple
        ``('error', value)`` where ``value`` is the mean weighted loss.
    """
    y = dtrain.get_label()
    # Work in float64 so large margins do not lose the small loss terms.
    margin = np.asarray(pred, dtype=np.float64)
    # -log(sigmoid(z)) = logaddexp(0, -z);  -log(1 - sigmoid(z)) = logaddexp(0, z)
    loss_fn = y * np.logaddexp(0.0, -margin)
    loss_fp = (1.0 - y) * np.logaddexp(0.0, margin)
    return 'error', np.sum(alpha*loss_fn + beta*loss_fp) / len(y)



In [10]:
def cross_validate(X_train, X_test, y_train, y_test, depth, num_round):
    """Fit a cost-sensitive xgboost model on one split and score the held-out part.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Held-out features and labels; the labels are only attached to the
        DMatrix and are not used to compute the returned values.
    depth : int
        Passed to xgboost as ``max_depth``.
    num_round : int
        Number of boosting rounds.

    Returns
    -------
    array
        Sigmoid of the booster's predictions for ``X_test``.
    """
    booster_params = {'max_depth': depth, 'eta': 1, 'silent': 1, 'seed': 0}
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    holdout_matrix = xgb.DMatrix(X_test, label=y_test)
    # Train with the custom weighted logistic objective. No evaluation set is
    # passed, so the feval metric is supplied but has nothing to be reported on.
    booster = xgb.train(booster_params, train_matrix, num_round,
                        obj=logistic_obj, feval=err_rate)
    # NOTE(review): with a custom objective, predict() presumably returns raw
    # margins, hence the explicit sigmoid — confirm against the xgboost docs.
    raw_scores = booster.predict(holdout_matrix)
    return 1. / (1. + np.exp(-raw_scores))

<h2> Test with a single train-test split </h2>

In [11]:
## normalized dataset
# Hold out 30% of the standard-scaled training rows, fit with depth 4 / 90 rounds,
# and score the thresholded (> 0.5) predictions with profit_scorer.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_norm, y,
    test_size=0.3,
    random_state=42,
)
pred = cross_validate(X_train, X_test, y_train, y_test, depth=4, num_round=90)
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

cost = -40


  if getattr(data, 'base', None) is not None and \


In [12]:
## encode dataset
# Hold out 30% of the one-hot-encoded training rows, fit with depth 2 / 100 rounds,
# and score the thresholded (> 0.5) predictions with profit_scorer.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_encode, y,
    test_size=0.3,
    random_state=42,
)
pred = cross_validate(X_train, X_test, y_train, y_test, depth=2, num_round=100)
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

cost = -5


In [13]:
## encode and normalized dataset
# Hold out 30% of the one-hot + scaled training rows, fit with depth 2 / 100 rounds,
# and score the thresholded (> 0.5) predictions with profit_scorer.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_norm_enc, y,
    test_size=0.3,
    random_state=42,
)
pred = cross_validate(X_train, X_test, y_train, y_test, depth=2, num_round=100)
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

cost = -5


<h2> Test with cross-validation splits </h2>

In [15]:
## normalized dataset
# 10-fold stratified CV on the standard-scaled features (depth 2, 130 rounds).
# random_state is omitted: without shuffle=True it has no effect on the folds,
# and scikit-learn >= 0.24 raises ValueError for that combination.
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits)
cost = 0
for train_index, test_index in cv.split(X_train_norm, y):
    X_train, X_test = X_train_norm[train_index], X_train_norm[test_index]
    # cv.split yields positions, so index the label Series positionally.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    pred = cross_validate(X_train, X_test, y_train, y_test, 2, 130)
    cost += profit_scorer(y_test, pred>0.5)

# Average the per-fold score over the number of folds.
print(f'cost = {cost/n_splits}')

  if getattr(data, 'base', None) is not None and \


cost = -37.0


In [16]:
## encode dataset
# 10-fold stratified CV on the one-hot-encoded features (depth 2, 210 rounds).
# random_state is omitted: without shuffle=True it has no effect on the folds,
# and scikit-learn >= 0.24 raises ValueError for that combination.
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits)
cost = 0
for train_index, test_index in cv.split(X_train_encode, y):
    X_train, X_test = X_train_encode.iloc[train_index], X_train_encode.iloc[test_index]
    # cv.split yields positions, so index the label Series positionally.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    pred = cross_validate(X_train, X_test, y_train, y_test, 2, 210)
    cost += profit_scorer(y_test, pred>0.5)

# Average the per-fold score over the number of folds.
print(f'cost = {cost/n_splits}')

  if getattr(data, 'base', None) is not None and \


cost = -27.0


In [14]:
## encode and normalized dataset
# 10-fold stratified CV on the one-hot + scaled features (depth 2, 210 rounds).
# The folds are now sliced from X_train_norm_enc; previously the loop sliced
# X_train_encode, so this cell silently re-evaluated the encoded-only dataset.
# random_state is omitted: without shuffle=True it has no effect on the folds,
# and scikit-learn >= 0.24 raises ValueError for that combination.
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits)
cost = 0
for train_index, test_index in cv.split(X_train_norm_enc, y):
    X_train, X_test = X_train_norm_enc.iloc[train_index], X_train_norm_enc.iloc[test_index]
    # cv.split yields positions, so index the label Series positionally.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    pred = cross_validate(X_train, X_test, y_train, y_test, 2, 210)
    cost += profit_scorer(y_test, pred>0.5)

# Average the per-fold score over the number of folds.
print(f'cost = {cost/n_splits}')

  if getattr(data, 'base', None) is not None and \


cost = -27.0


<h2> Tune parameters: depth / num_round </h2>

In [92]:
## normalized dataset: sweep num_round (max depth fixed at 2)
# This cell uses the standard-scaled features and varies the number of boosting
# rounds from 100 to 190 in steps of 10; the header and the printed label used
# to say "encode dataset" / "depth", which did not match what is computed.
# random_state is omitted: without shuffle=True it has no effect on the folds,
# and scikit-learn >= 0.24 raises ValueError for that combination.
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits)
for num_round in range(100, 200, 10):
    cost = 0
    for train_index, test_index in cv.split(X_train_norm, y):
        X_train, X_test = X_train_norm[train_index], X_train_norm[test_index]
        # cv.split yields positions, so index the label Series positionally.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        pred = cross_validate(X_train, X_test, y_train, y_test, 2, num_round)
        cost += profit_scorer(y_test, pred>0.5)
    print(f'num_round {num_round} cost = {cost/n_splits}')

  if getattr(data, 'base', None) is not None and \


depth 10 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 11 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 12 cost = -43.5


  if getattr(data, 'base', None) is not None and \


depth 13 cost = -37.0


  if getattr(data, 'base', None) is not None and \


depth 14 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 15 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 16 cost = -40.0


  if getattr(data, 'base', None) is not None and \


depth 17 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 18 cost = -40.0


  if getattr(data, 'base', None) is not None and \


depth 19 cost = -49.0


In [91]:
## encode dataset: sweep max depth (num_round fixed at 210)
# 10-fold stratified CV on the one-hot-encoded features for depths 1..9.
# random_state is omitted: without shuffle=True it has no effect on the folds,
# and scikit-learn >= 0.24 raises ValueError for that combination.
# The splitter is stateless here, so it is created once outside the loop.
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits)
for depth in range(1, 10):
    cost = 0
    for train_index, test_index in cv.split(X_train_encode, y):
        X_train, X_test = X_train_encode.iloc[train_index], X_train_encode.iloc[test_index]
        # cv.split yields positions, so index the label Series positionally.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        pred = cross_validate(X_train, X_test, y_train, y_test, depth, 210)
        cost += profit_scorer(y_test, pred>0.5)
    print(f'depth {depth} cost = {cost/n_splits}')

  if getattr(data, 'base', None) is not None and \


depth 1 cost = -45.0


  if getattr(data, 'base', None) is not None and \


depth 2 cost = -27.0


  if getattr(data, 'base', None) is not None and \


depth 3 cost = -48.5


  if getattr(data, 'base', None) is not None and \


depth 4 cost = -48.0


  if getattr(data, 'base', None) is not None and \


depth 5 cost = -55.0


  if getattr(data, 'base', None) is not None and \


depth 6 cost = -41.0


  if getattr(data, 'base', None) is not None and \


depth 7 cost = -38.0


  if getattr(data, 'base', None) is not None and \


depth 8 cost = -53.5


  if getattr(data, 'base', None) is not None and \


depth 9 cost = -50.0
