# Fraud Detection

In [220]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import boto3
import os
from sagemaker.amazon.amazon_estimator import get_image_uri
import sagemaker
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
import numpy as np

import sagemaker
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

## Investigate Data

In [221]:
data = pd.read_csv('../Data/full.csv');

data.head()
data["isDetectedFraud"] = data["isFraud"] && data["isFlaggedFraud"]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


What's the percentage of fraud vs non-fraud in the dataset?

In [222]:
nonfrauds, frauds = data.groupby('isFraud').size()
print('Number of frauds: ', frauds)
print('Number of non-frauds: ', nonfrauds)
print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))

nonfrauds, frauds = data.groupby('isFlaggedFraud').size()
print('Number of flagged frauds: ', frauds)
print('Number of non-flagged-frauds: ', nonfrauds)
print('Percentage of flagged fradulent data:', 100.*frauds/(frauds + nonfrauds))

nonfrauds, frauds = data.groupby('isReallyFraud').size()
print('Number of detected frauds: ', frauds)
print('Number of non-detected-frauds: ', nonfrauds)
print('Percentage of detected fradulent data:', 100.*frauds/(frauds + nonfrauds))

Number of frauds:  8213
Number of non-frauds:  6354407
Percentage of fradulent data: 0.12908204481801522


## Process Data

In [223]:
train, validation, test = np.split(data.sample(frac=1, random_state=1729), [int(0.7 * len(data)), int(0.9*len(data))])

In [224]:
print(train.shape)
print(validation.shape)
print(test.shape)

(4453834, 11)
(1272524, 11)
(636262, 11)


In [225]:
train.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
4734074,332,CASH_OUT,133853.35,C913507260,0.0,0.0,C1540196676,290239.96,424093.32,0,0
3114157,236,PAYMENT,11753.05,C1826965934,358115.0,346361.95,M1135314828,0.0,0.0,0,0
4914186,349,TRANSFER,3662944.04,C2058851291,0.0,0.0,C540952107,3801547.66,7464491.7,0,0
6291553,664,CASH_OUT,94020.02,C1134688769,214968.0,120947.98,C433697726,0.0,94020.02,0,0
4391453,321,PAYMENT,22714.83,C527730183,20626.0,0.0,M378158807,0.0,0.0,0,0


Ignoring nameOrig and nameDest. These string values aren't categorical and may not correlate with isFraud. Step is hour in the month, without knowing the month it's difficult to understand weekday vs hour of the day information.

In [226]:
keep_list = ["type", "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest", "isFraud", "isFlaggedFraud"]
train = train[keep_list]
test = test[keep_list]
validation = validation[keep_list]

train = pd.get_dummies(train, columns=["type"])
test = pd.get_dummies(test, columns=["type"])
validation = pd.get_dummies(validation, columns=["type"])

train.to_csv('train.csv', index=False, header=False)
test.to_csv('test.csv', index=False, header=False)
validation.to_csv('validation.csv', index=False, header=False)

In [227]:
from io import StringIO

sess = sagemaker.Session()
role = get_execution_role()
client = boto3.client('sagemaker')
bucket = sess.default_bucket()

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('test/test.csv')).upload_file('test.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join('validation/validation.csv')).upload_file('validation.csv')

In [228]:
train.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
4734074,133853.35,0.0,0.0,290239.96,424093.32,0,0,0,1,0,0,0
3114157,11753.05,358115.0,346361.95,0.0,0.0,0,0,0,0,0,1,0
4914186,3662944.04,0.0,0.0,3801547.66,7464491.7,0,0,0,0,0,0,1
6291553,94020.02,214968.0,120947.98,0.0,94020.02,0,0,0,1,0,0,0
4391453,22714.83,20626.0,0.0,0.0,0.0,0,0,0,0,0,1,0


In [229]:
test.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
649918,132520.8,11508.0,144028.8,0.0,0.0,0,0,1,0,0,0,0
6078014,5927.64,58852.27,52924.63,0.0,0.0,0,0,0,0,0,1,0
3352247,2899.46,6054.0,3154.54,964608.59,967508.05,0,0,0,1,0,0,0
2875453,193469.02,0.0,0.0,248357.55,441826.56,0,0,0,1,0,0,0
5729113,93088.86,107246.0,14157.14,0.0,93088.86,0,0,0,1,0,0,0


In [230]:
validation.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
1236972,18154.02,0.0,0.0,6756.71,24910.73,0,0,0,1,0,0,0
5672574,13438.08,0.0,0.0,0.0,0.0,0,0,0,0,0,1,0
3172712,5628.1,10120.0,4491.9,0.0,0.0,0,0,0,0,0,1,0
928972,170548.46,265128.14,94579.68,2614875.46,2785423.92,0,0,0,1,0,0,0
1181561,33420.31,31643.0,0.0,0.0,33420.31,0,0,0,1,0,0,0


In [231]:
train_labels = np.array(train["isFraud"]).astype("float32")
train_features = np.array(train.drop("isFraud", axis=1)).astype("float32")

In [232]:
test_labels = np.array(test["isFraud"]).astype("float32")
test_features  = np.array(test.drop("isFraud", axis=1)).astype("float32")

In [233]:
validation_labels = np.array(validation["isFraud"]).astype("float32")
validation_features  = np.array(validation.drop("isFraud", axis=1)).astype("float32")

In [234]:
train_features.shape

(4453834, 11)

## Training Linear Model

In [235]:
def get_base_estimator(clf, sess, role):

    container = get_image_uri(boto3.Session().region_name, clf)

    est = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, clf),
                                    sagemaker_session=sess)
    return est

In [236]:
def get_estimator(clf, sess, role):
    
    container = get_image_uri(boto3.Session().region_name, clf)

    
    if clf == 'xgboost':
        est = get_base_estimator(clf, sess, role)
        est.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        num_round=100)
        
    elif clf == 'linear-learner':
        
        est = sagemaker.LinearLearner(role=sagemaker.get_execution_role(),
                                               train_instance_count=1,
                                               train_instance_type='ml.m4.xlarge',
                                               predictor_type='binary_classifier',
                                               num_classes=2)

    elif clf == 'knn':
        est = sagemaker.KNN(role=sagemaker.get_execution_role(),
                                              k = 10,
                                               train_instance_count=1,
                                               train_instance_type='ml.m4.xlarge',
                                               predictor_type='classifier',
                                                sample_size = 200)
        

        
        
    elif clf == 'factorization-machines':
        est = sagemaker.FactorizationMachines(role=sagemaker.get_execution_role(),
                                               train_instance_count=1,
                                               train_instance_type='ml.m4.xlarge',
                                               predictor_type='binary_classifier',
                                                num_factors = 2)
        
        
    return est

In [237]:
sess = sagemaker.Session()
role = get_execution_role()
client = boto3.client('sagemaker')
bucket = sess.default_bucket()

s3_input_train = sagemaker.s3_input(s3_data='s3://{}/train'.format(bucket), content_type='csv')
s3_input_test = sagemaker.s3_input(s3_data='s3://{}/validation'.format(bucket), content_type='csv')

In [238]:
import sagemaker
from sagemaker.amazon.amazon_estimator import RecordSet
import boto3

# instantiate the LinearLearner estimator object
linear = sagemaker.LinearLearner(role=sagemaker.get_execution_role(),
                                               train_instance_count=1,
                                               train_instance_type='ml.m4.xlarge',
                                               predictor_type='binary_classifier',
                                               num_classes=2)



In [239]:
# wrap data in RecordSet objects
train_records = linear.record_set(train_features, train_labels, channel='train')
test_records = linear.record_set(test_features, test_labels, channel='test')

# start a training job
linear.fit([train_records, test_records])

2019-06-13 15:35:08 Starting - Starting the training job...
2019-06-13 15:35:10 Starting - Launching requested ML instances......
2019-06-13 15:36:30 Starting - Preparing the instances for training......
2019-06-13 15:37:32 Downloading - Downloading input data......
2019-06-13 15:38:27 Training - Training image download completed. Training in progress.
[31mDocker entrypoint called with argument(s): train[0m
[31m[06/13/2019 15:38:30 INFO 139879623128896] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma': u'0.01', u'lr_scheduler_minimum

## Host Linear Model

In [240]:
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

---------------------------------------------------------------------------------------------------!

## Validate Linear Model

In [241]:
runtime= boto3.client('runtime.sagemaker')

In [242]:
import io

def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=',', fmt='%g')
    return csv.getvalue().decode().rstrip()

In [243]:
import sys

# Function to generate prediction through sample data
def do_predict_linear(data, endpoint_name, content_type):
    
    payload = np2csv(data)
    response = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType=content_type, 
                                   Body=payload)
    result = json.loads(response['Body'].read().decode())
    preds =  [r['predicted_label'] for r in result['predictions']]

    return preds

# Function to iterate through a larger data set and generate batch predictions
def batch_predict_linear(data, batch_size, endpoint_name, content_type):
    items = len(data)
    arrs = []
    
    for offset in range(0, items, batch_size):
        if offset+batch_size < items:
            datav = data.iloc[offset:(offset+batch_size),:].as_matrix()
            results = do_predict_linear(datav, endpoint_name, content_type)
            arrs.extend(results)
        else:
            datav = data.iloc[offset:items,:].as_matrix()
            arrs.extend(do_predict_linear(datav, endpoint_name, content_type))
        sys.stdout.write('.')
    return(arrs)

In [244]:
preds_train_lin = batch_predict_linear(train.iloc[:,1:], 100, linear_predictor.endpoint , 'text/csv')



........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................



In [245]:
preds_val_lin = batch_predict_linear(validation.iloc[:,1:], 100, linear_predictor.endpoint , 'text/csv')

....



........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................



In [246]:
preds_test_lin = batch_predict_linear(test.iloc[:,1:], 100, linear_predictor.endpoint , 'text/csv')

.................



........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................



## Validation Metrics

- AUC
- Accuracy
- Precision
- Recall

In [247]:
from sklearn.metrics import roc_auc_score

print("Training AUC", roc_auc_score(train_labels, preds_train_lin))
print("Validation AUC", roc_auc_score(validation_labels, preds_val_lin) )
print("Test AUC", roc_auc_score(test_labels, preds_test_lin) )

Training AUC 0.5163216220002299
Validation AUC 0.5142146660996758
Test AUC 0.5225667400967046


In [248]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(test_labels.tolist(), preds_test_lin).ravel()
(tn, fp, fn, tp)

(608024, 27411, 754, 73)

In [181]:
from sklearn import svm, datasets
from sklearn.model_selection import cross_val_score
print(preds_test_lin[3])
clf = svm.SVC(gamma='scale', random_state=0)
cross_val_score(clf, preds_test_lin.reshape(-1, 1), test_labels.tolist().reshape(-1, 1), scoring='recall_macro', cv=5)  

0.0018178928876295686


AttributeError: 'list' object has no attribute 'reshape'

In [249]:
print("accuracy: ", (tp+tn)/len(preds_test_lin))

accuracy:  0.9557336443163351


In [250]:
result = [a for a in preds_test_lin if a not in [0]]
print("precision: ", (tp)/len(result))

precision:  0.00265609081647504


In [251]:
result = [a for a in test_labels.tolist() if a not in [0]]
print("recall: ", (tp)/len(result))

recall:  0.08827085852478839


## Hyper Parameter Tuning

In [252]:
def get_tuner(clf, est):
        
    if clf == 'xgboost':
        objective_metric_name = 'validation:auc'

        hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                        'min_child_weight': ContinuousParameter(1, 10),
                        'alpha': ContinuousParameter(0, 2),
                        'max_depth': IntegerParameter(1, 10)}
        
    elif clf == 'knn':
        
        objective_metric_name = 'test:mse'

        hyperparameter_ranges = {'k': IntegerParameter(1, 1024),
                        'sample_size': IntegerParameter(256, 20000000)}
        
    elif clf == 'linear-learner':
        objective_metric_name = 'test:recall'
        
        hyperparameter_ranges = {'l1': ContinuousParameter(0.0000001,1),
                            'use_bias': CategoricalParameter([True, False])}
        
    elif clf == 'factorization-machines':
        objective_metric_name = 'test:binary_classification_accuracy'
        
        hyperparameter_ranges = {'linear_lr': IntegerParameter(1, 1000)}
        
    tuner = HyperparameterTuner(est,
                    objective_metric_name,
                    hyperparameter_ranges,
                    max_jobs=20,
                    max_parallel_jobs=3)
    
    return tuner

In [253]:
def run_training_job(clf):

    # build the estimator
    est = get_estimator(clf, sess, role)

    # get the hyperparameter tuner config 
    # set this to look for recall somehow 
    if clf == 'xgboost':
        
        tuner = get_tuner(clf, est)
        
        tuner.fit({'train': s3_input_train, 'validation': s3_input_test}) 
        
        return tuner

    else:
        # set the records
        train_records = est.record_set(train_features, train_labels, channel='train')
        test_records = est.record_set(test_features, test_labels, channel='test')

        tuner = get_tuner(clf, est)
        
        tuner.fit([train_records, test_records])
        
        return tuner
    

In [254]:
def magic_loop(models_to_run):
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(run_training_job, models_to_run)
    pool.close() 
    pool.join()

In [None]:
clfs = ['linear-learner', 'factorization-machines', 'knn']
magic_loop(clfs)