In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import classification as clf
import imbalanced as imb
from sklearn.model_selection import KFold 
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, plot_roc_curve, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import statistics
data = clf.load_data("data_for_student_case.csv")

In [2]:
print(data)
pdata = clf.pre_process(data)
print(pdata)
clf.describe_data(pdata)

pdata.head()

          txid          bookingdate issuercountrycode txvariantcode       bin  \
0            1  2015-11-09 14:26:51                MX      mccredit  530056.0   
1            2  2015-11-09 14:27:38                MX      mccredit  547046.0   
2            3  2015-11-23 16:34:16                MX      mccredit  528843.0   
3            4  2015-11-23 16:34:51                MX      mccredit  547146.0   
4            5  2015-11-09 14:26:08                MX   visaclassic  477291.0   
...        ...                  ...               ...           ...       ...   
290377  482729  2015-11-03 18:57:01                SE     visadebit  453903.0   
290378  482731  2015-11-03 08:12:14                SE       mcdebit  554501.0   
290379  482732  2015-11-03 08:12:14                SE     visadebit  453903.0   
290380  482733  2015-11-03 18:56:44                SE       mcdebit  554501.0   
290381  482734  2015-11-03 08:12:14                SE       mcdebit  554501.0   

          amount currencyco

Unnamed: 0,issuercountrycode,txvariantcode,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cvcresponsecode,accountcode,mail_id,ip_id,card_id
0,76.0,2.0,64800.0,2.0,98.0,0.0,1.0,0.0,1.0,187702.0,7386.0,73358.0
1,76.0,2.0,44900.0,2.0,98.0,0.0,1.0,0.0,1.0,834.0,194109.0,44739.0
2,76.0,2.0,149900.0,2.0,98.0,0.0,1.0,0.0,1.0,119287.0,189655.0,122802.0
3,76.0,2.0,109900.0,2.0,98.0,0.0,1.0,0.0,1.0,173561.0,8254.0,70712.0
4,76.0,6.0,89900.0,2.0,98.0,0.0,1.0,0.0,1.0,70647.0,196247.0,208481.0


In [3]:
pdata = imb.pre_process(data)
imb.describe_data(pdata)

# Get feature vector and truth vector from dataset
X, y = imb.get_X_y(pdata)
# X = X.apply(LabelEncoder().fit_transform)

splits = 10
cv = KFold(n_splits=splits, shuffle=True)
scores_whitebox = np.zeros(10)
scores_blackbox = np.zeros(10)
iteration = 0
for train_index, test_index in cv.split(X):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # SMOTE Re-sampling
    sm = SMOTE(random_state=12)
    X_train, y_train = sm.fit_sample(X_train, y_train)

    # Classifiers
    # Decision Tree classifer (White-Box)
    clf = DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    CM = confusion_matrix(y_test, y_pred)
    TN = CM[0][0]
    FP = CM[0][1]
    false_positve_rate = FP / (FP + TN)
    scores_whitebox[iteration] = false_positve_rate

    # Neural network model (Black-box)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(5, 2), random_state=1)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    CM = confusion_matrix(y_test, y_pred)
    TN = CM[0][0]
    FP = CM[0][1]
    false_positve_rate = FP / (FP + TN)
    scores_blackbox[iteration] = false_positve_rate
    iteration += 1 
print("Mean false positve rate (white): " + str(np.mean(scores_whitebox)))
print("Mean false positve rate (black): " + str(np.mean(scores_blackbox)))

# clf.classify_knn(pdata,25)

Total: 237036
Non-Fraud: 236691 = 99.8545%
Fraud: 345 = 0.1455%

Mean false positve rate (white)0.0027039513914254043
Mean false positve rate (black)0.0
