In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gd
import seaborn as sns
%matplotlib inline

np.random.seed(7)

import xgboost
from sklearn import svm

class FraudDetector:
    """ Class than can be used to find fraud transactions and their amounts """
    
    def __init__(self):
        self.trained = False
        
    def fit( self, x, y_fraud, y_amount ):
        """ Training function - takes X values, Y_fraud (sequence of 0 or 1) and Y_amount (positive floats) """
        
        self.fraud_model = svm.SVC( kernel = "poly", C = 0.1 )
        self.fraud_model.fit( x, y_fraud )
        
        self.amount_model = xgboost.XGBRegressor( n_estimators = 200, max_depth = 5, min_child_weight = 1, 
                                                  learning_rate = 0.1, gamma = 0.0 )
        
        # train model using fraudlent transactions only
        fraud_ind = ( y_fraud == 1 )
        self.amount_model.fit( x[fraud_ind], y_amount[fraud_ind] )
        self.trained = True
        
    def predict( self, x ):
        """ Takes X values and returns a tuple - fraud prediction and fraud amount """
        
        if self.trained == 0:
            raise Exception("Model must be trained")
        
        ret1 = self.fraud_model.predict( x )
        ret2 = self.amount_model.predict( x )
        
        # for non-fraudlent transaction the fraud amount is 0
        ret2 = ret2 * ret1
        
        return ret1, ret2

In [9]:
df=pd.read_csv("data.csv")

fr0 = df[df["Class"] == 0].sample( n = 40000 ).copy()
fr1 = df[df["Class"] == 1].copy()

drop_arr = [ "Time", "V13", "V15", "V24", "V25" ]

df=pd.concat([fr0, fr1]).drop(drop_arr, 1)

# shuffle the data frame
df = df.sample(frac=1).reset_index(drop=True)

Vfeat = df.drop(["Amount", "Class"], 1).values
fraud = df["Class"].values
amoun = df["Amount"].values

In [10]:
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, accuracy_score, f1_score, mean_absolute_error, mean_squared_error

kf = KFold( n_splits = 4, shuffle = True )

rec, acc, f1, ab_err, sq_err = [], [], [], [], []

for train_index, test_index in kf.split( Vfeat ):                
    
    xtrain, xtest = Vfeat[train_index], Vfeat[test_index]
    
    yfraud_train, yfraud_test = fraud[train_index], fraud[test_index]
    
    yamoun_train, yamoun_test = amoun[train_index], amoun[test_index]
     
    fd = FraudDetector()
    fd.fit( xtrain, yfraud_train, yamoun_train )
    
    yfraud_pred, yamoun_pred = fd.predict( xtest )
            
    rec.append( recall_score( yfraud_pred, yfraud_test ) )
    acc.append( accuracy_score( yfraud_pred, yfraud_test ) )
    f1.append( f1_score( yfraud_pred, yfraud_test ) )
    
    ab_err.append( mean_absolute_error( yamoun_pred, yamoun_test ) )
    sq_err.append( mean_squared_error( yamoun_pred, yamoun_test ) )

print("Recall: %.2f\tAccuracy: %.2f\tF1 Score: %.2f\t|   Abs. Err.: %.1f\t Sq. Err.: %.1f Root of Sq. Err.: %.1f"
     % ( np.mean(rec), np.mean(acc), np.mean(f1), np.mean(ab_err), np.mean(sq_err), np.mean(sq_err)**0.5 )
     )

Recall: 0.96	Accuracy: 1.00	F1 Score: 0.86	|   Abs. Err.: 90.7	 Sq. Err.: 82127.7 Root of Sq. Err.: 286.6
