# Fraud Detection

In [124]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import boto3
import os
from sagemaker.amazon.amazon_estimator import get_image_uri
import sagemaker
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
import numpy as np

import sagemaker
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

## Investigate Data

In [125]:
data = pd.read_csv('../Data/full.csv');

data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


What's the percentage of fraud vs non-fraud in the dataset?

In [126]:
nonfrauds, frauds = data.groupby('isFraud').size()
print('Number of frauds: ', frauds)
print('Number of non-frauds: ', nonfrauds)
print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))

Number of frauds:  8213
Number of non-frauds:  6354407
Percentage of fradulent data: 0.12908204481801522


## Process Data

In [127]:
train, validation, test = np.split(data.sample(frac=1, random_state=1729), [int(0.7 * len(data)), int(0.9*len(data))])

In [128]:
print(train.shape)
print(validation.shape)
print(test.shape)

(4453834, 11)
(1272524, 11)
(636262, 11)


In [129]:
train.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
4734074,332,CASH_OUT,133853.35,C913507260,0.0,0.0,C1540196676,290239.96,424093.32,0,0
3114157,236,PAYMENT,11753.05,C1826965934,358115.0,346361.95,M1135314828,0.0,0.0,0,0
4914186,349,TRANSFER,3662944.04,C2058851291,0.0,0.0,C540952107,3801547.66,7464491.7,0,0
6291553,664,CASH_OUT,94020.02,C1134688769,214968.0,120947.98,C433697726,0.0,94020.02,0,0
4391453,321,PAYMENT,22714.83,C527730183,20626.0,0.0,M378158807,0.0,0.0,0,0


Ignoring nameOrig and nameDest. These string values aren't categorical and may not correlate with isFraud. Step is hour in the month, without knowing the month it's difficult to understand weekday vs hour of the day information.

In [130]:
keep_list = ["type", "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest", "isFraud", "isFlaggedFraud"]
train = train[keep_list]
test = test[keep_list]
validation = validation[keep_list]

train = pd.get_dummies(train, columns=["type"])
test = pd.get_dummies(test, columns=["type"])
validation = pd.get_dummies(validation, columns=["type"])

train.to_csv('train.csv', index=False, header=False)
test.to_csv('test.csv', index=False, header=False)
validation.to_csv('validation.csv', index=False, header=False)

In [None]:
from io import StringIO

sess = sagemaker.Session()
role = get_execution_role()
client = boto3.client('sagemaker')
bucket = sess.default_bucket()

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('test/test.csv')).upload_file('test.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('validation/validation.csv')).upload_file('validation.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
validation.head()

In [None]:
train_labels = np.array(train["isFraud"]).astype("float32")
train_features = np.array(train.drop("isFraud", axis=1)).astype("float32")

In [None]:
test_labels = np.array(test["isFraud"]).astype("float32")
test_features  = np.array(test.drop("isFraud", axis=1)).astype("float32")

In [None]:
validation_labels = np.array(validation["isFraud"]).astype("float32")
validation_features  = np.array(validation.drop("isFraud", axis=1)).astype("float32")

In [None]:
train_features.shape

## Training Linear Model

In [None]:
def get_base_estimator(clf, sess, role):

    container = get_image_uri(boto3.Session().region_name, clf)

    est = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, clf),
                                    sagemaker_session=sess)
    return est

In [None]:
def get_estimator(clf, sess, role):
    
    container = get_image_uri(boto3.Session().region_name, clf)

    
    if clf == 'xgboost':
        est = get_base_estimator(clf, sess, role)
        est.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        num_round=100)
        
    elif clf == 'linear-learner':
        
        est = sagemaker.LinearLearner(role=sagemaker.get_execution_role(),
                                               train_instance_count=1,
                                               train_instance_type='ml.m4.xlarge',
                                               predictor_type='binary_classifier',
                                               num_classes=2)

    elif clf == 'knn':
        est = sagemaker.KNN(role=sagemaker.get_execution_role(),
                                              k = 10,
                                               train_instance_count=1,
                                               train_instance_type='ml.m4.xlarge',
                                               predictor_type='classifier',
                                                sample_size = 200)
        

        
        
    elif clf == 'factorization-machines':
        est = sagemaker.FactorizationMachines(role=sagemaker.get_execution_role(),
                                               train_instance_count=1,
                                               train_instance_type='ml.m4.xlarge',
                                               predictor_type='binary_classifier',
                                                num_factors = 2)
        
        
    return est

In [None]:
sess = sagemaker.Session()
role = get_execution_role()
client = boto3.client('sagemaker')
bucket = sess.default_bucket()

s3_input_train = sagemaker.s3_input(s3_data='s3://{}/train'.format(bucket), content_type='csv')
s3_input_test = sagemaker.s3_input(s3_data='s3://{}/validation'.format(bucket), content_type='csv')

In [None]:
import sagemaker
from sagemaker.amazon.amazon_estimator import RecordSet
import boto3

# instantiate the LinearLearner estimator object
linear = sagemaker.LinearLearner(role=sagemaker.get_execution_role(),
                                               train_instance_count=1,
                                               train_instance_type='ml.m4.xlarge',
                                               predictor_type='binary_classifier',
                                               num_classes=2)



In [None]:
# wrap data in RecordSet objects
train_records = linear.record_set(train_features, train_labels, channel='train')
test_records = linear.record_set(test_features, test_labels, channel='test')

# start a training job
linear.fit([train_records, test_records])

## Host Linear Model

In [None]:
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

## Validate Linear Model

In [None]:
runtime= boto3.client('runtime.sagemaker')

In [None]:
import io

def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=',', fmt='%g')
    return csv.getvalue().decode().rstrip()

In [None]:
import sys

# Function to generate prediction through sample data
def do_predict_linear(data, endpoint_name, content_type):
    
    payload = np2csv(data)
    response = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType=content_type, 
                                   Body=payload)
    result = json.loads(response['Body'].read().decode())
    preds =  [r['score'] for r in result['predictions']]

    return preds

# Function to iterate through a larger data set and generate batch predictions
def batch_predict_linear(data, batch_size, endpoint_name, content_type):
    items = len(data)
    arrs = []
    
    for offset in range(0, items, batch_size):
        if offset+batch_size < items:
            datav = data.iloc[offset:(offset+batch_size),:].as_matrix()
            results = do_predict_linear(datav, endpoint_name, content_type)
            arrs.extend(results)
        else:
            datav = data.iloc[offset:items,:].as_matrix()
            arrs.extend(do_predict_linear(datav, endpoint_name, content_type))
        sys.stdout.write('.')
    return(arrs)

In [None]:
preds_train_lin = batch_predict_linear(train.iloc[:,1:], 100, linear_predictor.endpoint , 'text/csv')

In [None]:
preds_val_lin = batch_predict_linear(validation.iloc[:,1:], 100, linear_predictor.endpoint , 'text/csv')

In [None]:
preds_test_lin = batch_predict_linear(test.iloc[:,1:], 100, linear_predictor.endpoint , 'text/csv')

In [153]:
from sklearn.metrics import roc_auc_score

print("Training AUC", roc_auc_score(train_labels, preds_train_lin)) ##0.9091
print("Validation AUC", roc_auc_score(validation_labels, preds_val_lin) )###0.8998
print("Test AUC", roc_auc_score(test_labels, preds_test_lin) )###0.9033

Training AUC 0.8621622676796726
Validation AUC 0.8639127885509111
Test AUC 0.8660627507749715


In [None]:
import pandas as pd

pd.crosstab(np.where(validation_labels[0] == 0, 1, 0), preds_val_lin, rownames=['actuals'], colnames=['predictions'])

## Hyper Parameter Tuning

In [159]:
def get_tuner(clf, est):
        
    if clf == 'xgboost':
        objective_metric_name = 'validation:auc'

        hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                        'min_child_weight': ContinuousParameter(1, 10),
                        'alpha': ContinuousParameter(0, 2),
                        'max_depth': IntegerParameter(1, 10)}
        
    elif clf == 'knn':
        
        objective_metric_name = 'test:mse'

        hyperparameter_ranges = {'k': IntegerParameter(1, 1024),
                        'sample_size': IntegerParameter(256, 20000000)}
        
    elif clf == 'linear-learner':
        objective_metric_name = 'test:recall'
        
        hyperparameter_ranges = {'l1': ContinuousParameter(0.0000001,1),
                            'use_bias': CategoricalParameter([True, False])}
        
    elif clf == 'factorization-machines':
        objective_metric_name = 'test:binary_classification_accuracy'
        
        hyperparameter_ranges = {'linear_lr': IntegerParameter(1, 1000)}
        
    tuner = HyperparameterTuner(est,
                    objective_metric_name,
                    hyperparameter_ranges,
                    max_jobs=20,
                    max_parallel_jobs=3)
    
    return tuner

In [160]:
def run_training_job(clf):

    # build the estimator
    est = get_estimator(clf, sess, role)

    # get the hyperparameter tuner config 
    # set this to look for recall somehow 
    if clf == 'xgboost':
        
        tuner = get_tuner(clf, est)
        
        tuner.fit({'train': s3_input_train, 'validation': s3_input_test}) 
        
        return tuner

    else:
        # set the records
        train_records = est.record_set(train_features, train_labels, channel='train')
        test_records = est.record_set(test_features, test_labels, channel='test')

        tuner = get_tuner(clf, est)
        
        tuner.fit([train_records, test_records])
        
        return tuner
    

In [161]:
def magic_loop(models_to_run):
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(run_training_job, models_to_run)
    pool.close() 
    pool.join()

In [162]:
clfs = ['linear-learner', 'factorization-machines', 'knn']
magic_loop(clfs)

MaybeEncodingError: Error sending result: '[<sagemaker.tuner.HyperparameterTuner object at 0x7fa365181048>]'. Reason: 'AttributeError("Can't pickle local object 'lazy_call.<locals>._handler'",)'