In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from classifiers import *
from sklearn.metrics import confusion_matrix

In [2]:
train_data = pd.read_csv('data/train.csv', sep = '|')
test_data = pd.read_csv('data/test.csv', sep = '|')
print(f'Train set has {train_data.shape[0]} entries and {train_data.shape[1]} features')
print(f'Test set has {test_data.shape[0]} entries and {test_data.shape[1]} features')


Train set has 1879 entries and 10 features
Test set has 498121 entries and 9 features


In [3]:
y = train_data['fraud']
X_all = train_data.append(test_data, sort=False).drop(columns=['fraud']).astype(float)
# encode trustLevel
X_encode = pd.get_dummies(X_all, columns=['trustLevel'], prefix='trustLevel')
X_train_encode = X_encode.iloc[:1879,:]
X_test_encode = X_encode.iloc[1879:, :]
X_norm = StandardScaler().fit_transform(X_all)
X_train_norm = X_norm[:1879,:]
X_test_norm = X_norm[1879:,:]

In [4]:
X_sim_encode = pd.get_dummies(X_all['trustLevel'], prefix='trustLevel')
X_rest = X_all.drop(columns="trustLevel")
X_norm_encode = pd.concat([X_sim_encode, 
                           pd.DataFrame(StandardScaler().fit_transform(X_rest), 
                                        columns=X_rest.columns, index=X_rest.index)], sort=False, axis=1)
# print(X_norm_encode.shape)
X_train_norm_enc = X_norm_encode.iloc[:1879,:]
X_test_norm_enc = X_norm_encode.iloc[1879:,:]

In [7]:
# generate data with feature selection
X_select = X_all.drop(columns=['grandTotal', 'quantityModifications'])
X_select_encode = pd.get_dummies(X_select['trustLevel'], prefix='trustLevel')
X_rest = X_select.drop(columns="trustLevel")
X_select_norm_encode = pd.concat([X_select_encode, 
                           pd.DataFrame(StandardScaler().fit_transform(X_rest), 
                                        columns=X_rest.columns, index=X_rest.index)], sort=False, axis=1)
# print(X_norm_encode.shape)
X_train_select = X_select_norm_encode.iloc[:1879,:]
X_test_select = X_select_norm_encode.iloc[1879:,:]

In [8]:
print(X_train_encode.shape)
# print(X_test_encode.shape)
print(X_train_norm.shape)
# print(X_test_norm)
# print(X_test_norm.shape)
print(X_train_norm_enc.shape)
print(X_train_select.shape)

(1879, 14)
(1879, 9)
(1879, 14)
(1879, 12)


In [9]:
evaluate_classification(X_train_norm, y)

Naive Bayes: test core = -113.0 
K Nerest Neighbors: test core = -250.5 
Linear SVM: test core = -260.0 
RBF SVM: test core = -260.0 
Decision Tree: test core = -80.5 
Neural Net: test core = -160.5 
Random Forest: test core = -260.0 
AdaBoost: test core = -31.0 
XGBoost: test core = -30.0 


In [7]:
evaluate_classification(X_train_encode, y)

Naive Bayes: test core = -158.0 
K Nerest Neighbors: test core = -271.5 
Linear SVM: test core = -260.0 
RBF SVM: test core = -260.0 
Decision Tree: test core = -82.0 
Neural Net: test core = -249.0 
Random Forest: test core = -260.0 
AdaBoost: test core = -32.5 
XGBoost: test core = -38.0 


In [11]:
evaluate_classification(X_train_norm_enc, y)

Naive Bayes: test core = -185.0 
K Nerest Neighbors: test core = -239.0 
Linear SVM: test core = -260.0 
RBF SVM: test core = -260.0 
Decision Tree: test core = -80.5 
Neural Net: test core = -160.0 
Random Forest: test core = -257.0 
AdaBoost: test core = -32.5 
XGBoost: test core = -38.0 


In [10]:
evaluate_classification(X_train_select, y)

Naive Bayes: test core = -185.0 
K Nerest Neighbors: test core = -148.5 
Linear SVM: test core = -260.0 
RBF SVM: test core = -182.0 
Decision Tree: test core = -73.5 
Neural Net: test core = -154.0 
Random Forest: test core = -260.0 
AdaBoost: test core = -19.5 
XGBoost: test core = -36.5 


<h2> Define xgboost with cost sensitive </h2>

In [12]:
#!/usr/bin/python
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def logistic_obj(y_hat, dtrain, alpha=5, beta=25): # alpha for FN beta for FP
    y = dtrain.get_label()
    pred = 1. / (1. + np.exp(-y_hat))
    grad = pred * (beta + alpha*y - beta*y) - alpha*y  # alpha*(p-1)*y + beta*p*(1-y)
    hess = pred * (1 - pred) * (beta + alpha*y - beta*y)
    return grad, hess

def err_rate(pred, dtrain):
    y = dtrain.get_label()
    pred = 1. / (1. + np.exp(-pred))
    loss_fn = y*np.log(pred)
    loss_fp = (1.0 - y)*np.log(1.0 - pred)
    return 'error', np.sum(-(5*loss_fn+25*loss_fp))/len(y)



In [13]:
def cross_validate(X_train, X_test, y_train, y_test, depth, num_round):
    # load data
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_val = xgb.DMatrix(X_test, label=y_test)
    # build model
    param = {'max_depth': depth, 'eta': 1, 'silent': 1, 'seed': 0}
#     watchlist = [(d_val, 'eval'), (d_train, 'train')]
#     model_trn = xgb.train(param, d_train, num_round, watchlist, obj=logistic_obj, feval=err_rate)
    model_trn = xgb.train(param, d_train, num_round, obj=logistic_obj, feval=err_rate)
    # prediction
    pred = model_trn.predict(d_val) 
    pred = 1. / (1. + np.exp(-pred))
    return pred

<h2> test with one train-test split</h2>

In [14]:
## normalized dataset
X_train, X_test, y_train, y_test = train_test_split(X_train_norm, y, test_size=0.3, random_state=42)
pred = cross_validate(X_train, X_test, y_train, y_test, 4, 90)
print(confusion_matrix(y_test, pred>0.5))
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

[[538   2]
 [  5  19]]
cost = -40


  if getattr(data, 'base', None) is not None and \


In [15]:
## normalized dataset with weight 1,5
X_train, X_test, y_train, y_test = train_test_split(X_train_norm, y, test_size=0.3, random_state=42)
pred = cross_validate(X_train, X_test, y_train, y_test, 4, 90)
print(confusion_matrix(y_test, pred>0.5))
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

[[538   2]
 [  5  19]]
cost = -40


In [16]:
## normalized dataset with weight 25,125
X_train, X_test, y_train, y_test = train_test_split(X_train_norm, y, test_size=0.3, random_state=42)
pred = cross_validate(X_train, X_test, y_train, y_test, 7, 30)
print(confusion_matrix(y_test, pred>0.5))
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

[[535   5]
 [  6  18]]
cost = -85


In [17]:
## encode dataset
X_train, X_test, y_train, y_test = train_test_split(X_train_encode, y, test_size=0.3, random_state=42)
pred = cross_validate(X_train, X_test, y_train, y_test, 2, 100)
print(confusion_matrix(y_test, pred>0.5))
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

[[539   1]
 [  4  20]]
cost = -5


In [19]:
## encode and normalized dataset
X_train, X_test, y_train, y_test = train_test_split(X_train_norm_enc, y, test_size=0.3, random_state=42)
pred = cross_validate(X_train, X_test, y_train, y_test, 2, 100)
print(confusion_matrix(y_test, pred>0.5))
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

[[539   1]
 [  4  20]]
cost = -5


In [24]:
## feture select dataset
X_train, X_test, y_train, y_test = train_test_split(X_train_select, y, test_size=0.3, random_state=42)
pred = cross_validate(X_train, X_test, y_train, y_test, 6, 100)
print(confusion_matrix(y_test, pred>0.5))
print(f'cost = {profit_scorer(y_test, pred>0.5)}')

[[537   3]
 [  2  22]]
cost = 45


<h2> test with cv split</h2>

In [25]:
## normalized dataset
cv = StratifiedKFold(n_splits=10, random_state=42)
cost=0
for train_index, test_index in cv.split(X_train_norm, y):
    X_train, X_test = X_train_norm[train_index], X_train_norm[test_index]
    y_train, y_test = y[train_index], y[test_index]
    pred = cross_validate(X_train, X_test, y_train, y_test, 2, 130)
    print(f' confusion matrix: cost({profit_scorer(y_test, pred>0.5)})\n {confusion_matrix(y_test, pred>0.5)}')
    cost += profit_scorer(y_test, pred>0.5)

print(f'average cost = {cost/10}')

  if getattr(data, 'base', None) is not None and \


 confusion matrix: cost(-35)
 [[178   0]
 [  3   8]]
 confusion matrix: cost(-5)
 [[178   0]
 [  2   9]]
 confusion matrix: cost(-40)
 [[177   1]
 [  3   8]]
 confusion matrix: cost(-35)
 [[178   0]
 [  3   8]]
 confusion matrix: cost(20)
 [[178   0]
 [  1   9]]
 confusion matrix: cost(-45)
 [[176   1]
 [  3   7]]
 confusion matrix: cost(-40)
 [[177   0]
 [  3   7]]
 confusion matrix: cost(-45)
 [[176   1]
 [  3   7]]
 confusion matrix: cost(-75)
 [[176   1]
 [  4   6]]
 confusion matrix: cost(-70)
 [[177   0]
 [  4   6]]
average cost = -37.0


In [25]:
## encode dataset
cv = StratifiedKFold(n_splits=10, random_state=42)
cost=0
for train_index, test_index in cv.split(X_train_encode, y):
    X_train, X_test = X_train_encode.iloc[train_index], X_train_encode.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    pred = cross_validate(X_train, X_test, y_train, y_test, 2, 210)
    print(f' confusion matrix: cost({profit_scorer(y_test, pred>0.5)})\n {confusion_matrix(y_test, pred>0.5)}')
    cost += profit_scorer(y_test, pred>0.5)

print(f'average cost = {cost/10}')

  if getattr(data, 'base', None) is not None and \


 confusion matrix: cost(-40)
 [[177   1]
 [  3   8]]
 confusion matrix: cost(25)
 [[178   0]
 [  1  10]]
 confusion matrix: cost(-45)
 [[176   2]
 [  3   8]]
 confusion matrix: cost(-5)
 [[178   0]
 [  2   9]]
 confusion matrix: cost(15)
 [[177   1]
 [  1   9]]
 confusion matrix: cost(15)
 [[176   1]
 [  1   9]]
 confusion matrix: cost(-40)
 [[177   0]
 [  3   7]]
 confusion matrix: cost(-20)
 [[175   2]
 [  2   8]]
 confusion matrix: cost(-75)
 [[176   1]
 [  4   6]]
 confusion matrix: cost(-100)
 [[177   0]
 [  5   5]]
average cost = -27.0


In [26]:
## encode and normalized dataset
cv = StratifiedKFold(n_splits=10, random_state=42)
cost=0
for train_index, test_index in cv.split(X_train_norm_enc, y):
    X_train, X_test = X_train_encode.iloc[train_index], X_train_encode.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    pred = cross_validate(X_train, X_test, y_train, y_test, 2, 210)
    print(f' confusion matrix: cost({profit_scorer(y_test, pred>0.5)})\n {confusion_matrix(y_test, pred>0.5)}')
    cost += profit_scorer(y_test, pred>0.5)

print(f'average cost = {cost/10}')

  if getattr(data, 'base', None) is not None and \


 confusion matrix: cost(-40)
 [[177   1]
 [  3   8]]
 confusion matrix: cost(25)
 [[178   0]
 [  1  10]]
 confusion matrix: cost(-45)
 [[176   2]
 [  3   8]]
 confusion matrix: cost(-5)
 [[178   0]
 [  2   9]]
 confusion matrix: cost(15)
 [[177   1]
 [  1   9]]
 confusion matrix: cost(15)
 [[176   1]
 [  1   9]]
 confusion matrix: cost(-40)
 [[177   0]
 [  3   7]]
 confusion matrix: cost(-20)
 [[175   2]
 [  2   8]]
 confusion matrix: cost(-75)
 [[176   1]
 [  4   6]]
 confusion matrix: cost(-100)
 [[177   0]
 [  5   5]]
average cost = -27.0


In [49]:
## feature selection dataset
cv = StratifiedKFold(n_splits=10, random_state=42)
cost=0
for train_index, test_index in cv.split(X_train_select, y):
    X_train, X_test = X_train_encode.iloc[train_index], X_train_encode.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    pred = cross_validate(X_train, X_test, y_train, y_test, 7, 60)
    print(f' confusion matrix: cost({profit_scorer(y_test, pred>0.5)})\n {confusion_matrix(y_test, pred>0.5)}')
    cost += profit_scorer(y_test, pred>0.5)

print(f'average cost = {cost/10}')

  if getattr(data, 'base', None) is not None and \


 confusion matrix: cost(-40)
 [[177   1]
 [  3   8]]
 confusion matrix: cost(-5)
 [[178   0]
 [  2   9]]
 confusion matrix: cost(-50)
 [[175   3]
 [  3   8]]
 confusion matrix: cost(-65)
 [[178   0]
 [  4   7]]
 confusion matrix: cost(40)
 [[176   2]
 [  0  10]]
 confusion matrix: cost(5)
 [[174   3]
 [  1   9]]
 confusion matrix: cost(-40)
 [[177   0]
 [  3   7]]
 confusion matrix: cost(-15)
 [[176   1]
 [  2   8]]
 confusion matrix: cost(-105)
 [[176   1]
 [  5   5]]
 confusion matrix: cost(-105)
 [[176   1]
 [  5   5]]
average cost = -38.0


<h2> test parameters - depth/num_round</h2>

In [44]:
## feature select dataset
def parameter_tuning(train_data, number_depth, number_rounds):
    for i in range(number_depth):
        cv = StratifiedKFold(n_splits=10, random_state=42)
        cost=0
        for train_index, test_index in cv.split(train_data, y):
            X_train, X_test = X_train_norm[train_index], X_train_norm[test_index]
            y_train, y_test = y[train_index], y[test_index]
            pred = cross_validate(X_train, X_test, y_train, y_test, i, 200)
            cost += profit_scorer(y_test, pred>0.5)
        print(f'depth {i} cost = {cost/10}')
    for i in range(number_rounds):
        cv = StratifiedKFold(n_splits=10, random_state=42)
        cost=0
        for train_index, test_index in cv.split(X_train_norm, y):
            X_train, X_test = X_train_norm[train_index], X_train_norm[test_index]
            y_train, y_test = y[train_index], y[test_index]
            pred = cross_validate(X_train, X_test, y_train, y_test, 2, i*10)
            cost += profit_scorer(y_test, pred>0.5)
        print(f'depth {i} cost = {cost/10}')

In [46]:
parameter_tuning(X_train_norm, 10,20)

  if getattr(data, 'base', None) is not None and \


depth 0 cost = -260.0


  if getattr(data, 'base', None) is not None and \


depth 1 cost = -41.0


  if getattr(data, 'base', None) is not None and \


depth 2 cost = -49.0


  if getattr(data, 'base', None) is not None and \


depth 3 cost = -47.0


  if getattr(data, 'base', None) is not None and \


depth 4 cost = -58.0


  if getattr(data, 'base', None) is not None and \


depth 5 cost = -58.5


  if getattr(data, 'base', None) is not None and \


depth 6 cost = -48.5


  if getattr(data, 'base', None) is not None and \


depth 7 cost = -49.0


  if getattr(data, 'base', None) is not None and \


depth 8 cost = -45.0


  if getattr(data, 'base', None) is not None and \


depth 9 cost = -50.0
depth 0 cost = -835.5
depth 1 cost = -129.5


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


depth 2 cost = -78.5


  if getattr(data, 'base', None) is not None and \


depth 3 cost = -58.0


  if getattr(data, 'base', None) is not None and \


depth 4 cost = -59.5


  if getattr(data, 'base', None) is not None and \


depth 5 cost = -52.5


  if getattr(data, 'base', None) is not None and \


depth 6 cost = -41.0


  if getattr(data, 'base', None) is not None and \


depth 7 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 8 cost = -43.5


  if getattr(data, 'base', None) is not None and \


depth 9 cost = -35.5


  if getattr(data, 'base', None) is not None and \


depth 10 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 11 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 12 cost = -43.5


  if getattr(data, 'base', None) is not None and \


depth 13 cost = -37.0


  if getattr(data, 'base', None) is not None and \


depth 14 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 15 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 16 cost = -40.0


  if getattr(data, 'base', None) is not None and \


depth 17 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 18 cost = -40.0


  if getattr(data, 'base', None) is not None and \


depth 19 cost = -49.0


In [47]:
parameter_tuning(X_train_encode, 10,20)

  if getattr(data, 'base', None) is not None and \


depth 0 cost = -260.0


  if getattr(data, 'base', None) is not None and \


depth 1 cost = -41.0


  if getattr(data, 'base', None) is not None and \


depth 2 cost = -49.0


  if getattr(data, 'base', None) is not None and \


depth 3 cost = -47.0


  if getattr(data, 'base', None) is not None and \


depth 4 cost = -58.0


  if getattr(data, 'base', None) is not None and \


depth 5 cost = -58.5


  if getattr(data, 'base', None) is not None and \


depth 6 cost = -48.5


  if getattr(data, 'base', None) is not None and \


depth 7 cost = -49.0


  if getattr(data, 'base', None) is not None and \


depth 8 cost = -45.0


  if getattr(data, 'base', None) is not None and \


depth 9 cost = -50.0
depth 0 cost = -835.5
depth 1 cost = -129.5


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


depth 2 cost = -78.5


  if getattr(data, 'base', None) is not None and \


depth 3 cost = -58.0


  if getattr(data, 'base', None) is not None and \


depth 4 cost = -59.5


  if getattr(data, 'base', None) is not None and \


depth 5 cost = -52.5


  if getattr(data, 'base', None) is not None and \


depth 6 cost = -41.0


  if getattr(data, 'base', None) is not None and \


depth 7 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 8 cost = -43.5


  if getattr(data, 'base', None) is not None and \


depth 9 cost = -35.5


  if getattr(data, 'base', None) is not None and \


depth 10 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 11 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 12 cost = -43.5


  if getattr(data, 'base', None) is not None and \


depth 13 cost = -37.0


  if getattr(data, 'base', None) is not None and \


depth 14 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 15 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 16 cost = -40.0


  if getattr(data, 'base', None) is not None and \


depth 17 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 18 cost = -40.0


  if getattr(data, 'base', None) is not None and \


depth 19 cost = -49.0


In [45]:
parameter_tuning(X_train_select, 10,20)

  if getattr(data, 'base', None) is not None and \


depth 0 cost = -260.0


  if getattr(data, 'base', None) is not None and \


depth 1 cost = -41.0


  if getattr(data, 'base', None) is not None and \


depth 2 cost = -49.0


  if getattr(data, 'base', None) is not None and \


depth 3 cost = -47.0


  if getattr(data, 'base', None) is not None and \


depth 4 cost = -58.0


  if getattr(data, 'base', None) is not None and \


depth 5 cost = -58.5


  if getattr(data, 'base', None) is not None and \


depth 6 cost = -48.5


  if getattr(data, 'base', None) is not None and \


depth 7 cost = -49.0


  if getattr(data, 'base', None) is not None and \


depth 8 cost = -45.0


  if getattr(data, 'base', None) is not None and \


depth 9 cost = -50.0
depth 0 cost = -835.5


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


depth 1 cost = -129.5
depth 2 cost = -78.5


  if getattr(data, 'base', None) is not None and \


depth 3 cost = -58.0


  if getattr(data, 'base', None) is not None and \


depth 4 cost = -59.5


  if getattr(data, 'base', None) is not None and \


depth 5 cost = -52.5


  if getattr(data, 'base', None) is not None and \


depth 6 cost = -41.0


  if getattr(data, 'base', None) is not None and \


depth 7 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 8 cost = -43.5


  if getattr(data, 'base', None) is not None and \


depth 9 cost = -35.5


  if getattr(data, 'base', None) is not None and \


depth 10 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 11 cost = -40.5


  if getattr(data, 'base', None) is not None and \


depth 12 cost = -43.5


  if getattr(data, 'base', None) is not None and \


depth 13 cost = -37.0


  if getattr(data, 'base', None) is not None and \


depth 14 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 15 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 16 cost = -40.0


  if getattr(data, 'base', None) is not None and \


depth 17 cost = -43.0


  if getattr(data, 'base', None) is not None and \


depth 18 cost = -40.0


  if getattr(data, 'base', None) is not None and \


depth 19 cost = -49.0
