In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gd
import seaborn as sns
%matplotlib inline

np.random.seed(7)
df=pd.read_csv("data.csv")

fr0 = df[df["Class"] == 0].sample( n=492 ).copy()
fr1 = df[df["Class"] == 1].copy()

drop_arr = [ "Time", "V13", "V15", "V24", "V25" ]
#drop_arr = [ "Time" ]

df=pd.concat([fr0, fr1]).drop(drop_arr, 1)

# prepare data for binary classification. I don't want them to be structured, so let's shuffle them

inp = df.drop(["Amount", "Class"], 1).values
fra = df["Class"].values

ind = np.random.permutation(fra.size)

inp = inp[ind]
fra = fra[ind]

# prepare data for amount prediction

ami = df[df["Class"] == 1].drop(["Amount", "Class"], 1).values
amo = df[df["Class"] == 1]["Amount"].values

In [22]:
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, accuracy_score, f1_score

kf = KFold( n_splits = 4, shuffle = True )

<b>Support Vector Classification</b>

In [26]:
from sklearn import svm

for C_par in [ 0.1, 1, 10, 80 ]:
    for ker_par in [ "linear", "poly", "rbf" ]:
        
        rec_res, acc_res, f1_res = [], [], []
        
        for train_index, test_index in kf.split( inp ):
            xtrain, xtest = inp[train_index], inp[test_index]
            ytrain, ytest = fra[train_index], fra[test_index]
            
            svc_cl = svm.SVC( kernel = ker_par, C = C_par )
            
            svc_cl.fit( xtrain, ytrain )
            
            ypred = svc_cl.predict( xtest )
            
            rec_res.append( recall_score( ypred, ytest ) )
            acc_res.append( accuracy_score( ypred, ytest ) )
            f1_res.append( f1_score( ypred, ytest ) )
            
        print( "C: %.1f\tKernel: %s\tRecall: %.2f\tAccuracy %.2f\tF1 Score %.2f" %
             ( C_par, ker_par, np.mean( rec_res ), np.mean( acc_res ), np.mean( f1_res ) )
             )

C: 0.1	Kernel: linear	Recall: 0.97	Accuracy 0.93	F1 Score 0.93
C: 0.1	Kernel: poly	Recall: 0.99	Accuracy 0.92	F1 Score 0.92
C: 0.1	Kernel: rbf	Recall: 0.91	Accuracy 0.91	F1 Score 0.91
C: 1.0	Kernel: linear	Recall: 0.96	Accuracy 0.93	F1 Score 0.93
C: 1.0	Kernel: poly	Recall: 0.98	Accuracy 0.94	F1 Score 0.93
C: 1.0	Kernel: rbf	Recall: 0.94	Accuracy 0.93	F1 Score 0.92
C: 10.0	Kernel: linear	Recall: 0.96	Accuracy 0.93	F1 Score 0.93
C: 10.0	Kernel: poly	Recall: 0.94	Accuracy 0.92	F1 Score 0.92
C: 10.0	Kernel: rbf	Recall: 0.91	Accuracy 0.92	F1 Score 0.92
C: 80.0	Kernel: linear	Recall: 0.95	Accuracy 0.93	F1 Score 0.93
C: 80.0	Kernel: poly	Recall: 0.92	Accuracy 0.91	F1 Score 0.90
C: 80.0	Kernel: rbf	Recall: 0.90	Accuracy 0.91	F1 Score 0.91


<b>Decision Tree</b>

In [27]:
from sklearn.tree import DecisionTreeClassifier

for depth in [ 3, 4, 5, 6 ]:
    rec_res, acc_res, f1_res = [], [], []
 
    for train_index, test_index in kf.split( inp ):
        xtrain, xtest = inp[train_index], inp[test_index]
        ytrain, ytest = fra[train_index], fra[test_index]
        
        dtree = DecisionTreeClassifier( max_depth = depth )
          
        dtree.fit( xtrain, ytrain )
            
        ypred = dtree.predict( xtest )
            
        rec_res.append( recall_score( ypred, ytest ) )
        acc_res.append( accuracy_score( ypred, ytest ) )
        f1_res.append( f1_score( ypred, ytest ) )
        
    print("Depth: %d\tRecall: %.2f\tAccuracy %.2f\tF1 Score %.2f" %
        ( depth, np.mean( rec_res ), np.mean( acc_res ), np.mean( f1_res ) )
        )

Depth: 3	Recall: 0.94	Accuracy 0.91	F1 Score 0.91
Depth: 4	Recall: 0.93	Accuracy 0.92	F1 Score 0.92
Depth: 5	Recall: 0.95	Accuracy 0.92	F1 Score 0.92
Depth: 6	Recall: 0.95	Accuracy 0.93	F1 Score 0.92


In [28]:
from sklearn.linear_model import LogisticRegression

for C_par in [ 0.01, 0.1, 0.2, 0.3, 1, 10, 80 ]:
    rec_res, acc_res, f1_res = [], [], []
 
    for train_index, test_index in kf.split( inp ):
        xtrain, xtest = inp[train_index], inp[test_index]
        ytrain, ytest = fra[train_index], fra[test_index]
        
        log_re = LogisticRegression( C = C_par, n_jobs = 2 )
          
        log_re.fit( xtrain, ytrain )
            
        ypred = log_re.predict( xtest )
            
        rec_res.append( recall_score( ypred, ytest ) )
        acc_res.append( accuracy_score( ypred, ytest ) )
        f1_res.append( f1_score( ypred, ytest ) )
        
    print("C: %.3f\tRecall: %.2f\tAccuracy %.2f\tF1 Score %.2f" %
        ( C_par, np.mean( rec_res ), np.mean( acc_res ), np.mean( f1_res ) )
        )

C: 0.010	Recall: 0.95	Accuracy 0.93	F1 Score 0.93
C: 0.100	Recall: 0.97	Accuracy 0.94	F1 Score 0.94
C: 0.200	Recall: 0.96	Accuracy 0.93	F1 Score 0.93
C: 0.300	Recall: 0.97	Accuracy 0.94	F1 Score 0.93
C: 1.000	Recall: 0.97	Accuracy 0.94	F1 Score 0.94
C: 10.000	Recall: 0.97	Accuracy 0.94	F1 Score 0.93
C: 80.000	Recall: 0.95	Accuracy 0.93	F1 Score 0.93
