In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import boto3
import os
from sagemaker.amazon.amazon_estimator import get_image_uri
import sagemaker
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
import numpy as np

import sagemaker
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

In [None]:
# Load the full transaction table.
data = pd.read_csv('../Data/full.csv')

# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame at 80% and 90% to get an 80/10/10 train/validation/test split.
# (Cutting twice at 80% leaves the middle "validation" slice empty.)
shuffled = data.sample(frac=1, random_state=1729)
train_end = int(0.8 * len(shuffled))
validation_end = int(0.9 * len(shuffled))

train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:validation_end]
test = shuffled.iloc[validation_end:]

In [None]:
%matplotlib inline

data["isFraud"].hist()

In [None]:
# Boolean mask of the rows labelled as fraud. Named `is_fraud` so the
# builtin `filter` is not shadowed for the rest of the notebook.
is_fraud = data["isFraud"] == 1

# Per-column count of non-null values among the fraud rows only. Boolean
# indexing avoids building a full-size frame of NaNs just to count.
data[is_fraud].count()

In [None]:
# Preview the first five rows of the training split.
train.head(n=5)

In [None]:
# Columns kept for modelling. The label (isFraud) is listed first because the
# CSVs below are written without a header and SageMaker's built-in XGBoost
# treats the first column of a headerless CSV as the target.
keep_list = ["isFraud", "step", "type", "amount", "oldbalanceOrg", "newbalanceOrig",
             "oldbalanceDest", "newbalanceDest", "isFlaggedFraud"]

# One-hot encode the transaction type. The dtype is pinned so the CSVs contain
# 0/1 regardless of pandas version (pandas 2.x defaults to bool -> True/False).
train = pd.get_dummies(train[keep_list], columns=["type"], dtype=np.uint8)
test = pd.get_dummies(test[keep_list], columns=["type"], dtype=np.uint8)
validation = pd.get_dummies(validation[keep_list], columns=["type"], dtype=np.uint8)

# Each split is encoded independently, so a transaction type that is absent
# from one split would otherwise drop a column there. Force every split onto
# the training schema: missing dummies become 0, same column order everywhere.
test = test.reindex(columns=train.columns, fill_value=0)
validation = validation.reindex(columns=train.columns, fill_value=0)

# Headerless, index-free CSVs for upload to S3.
train.to_csv('train.csv', index=False, header=False)
test.to_csv('test.csv', index=False, header=False)
validation.to_csv('validation.csv', index=False, header=False)

In [None]:
from io import StringIO

# SageMaker session, execution role, low-level client and default bucket
# used by the rest of the notebook.
sess = sagemaker.Session()
role = get_execution_role()
client = boto3.client('sagemaker')
bucket = sess.default_bucket()

# Upload each local split to <bucket>/<split>/<split>.csv. A single S3 bucket
# handle is reused for all three uploads, and the object keys are written
# directly (a one-argument os.path.join returns its argument unchanged).
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for split_name in ('train', 'test', 'validation'):
    local_file = '{}.csv'.format(split_name)
    s3_bucket.Object('{}/{}'.format(split_name, local_file)).upload_file(local_file)

In [None]:
# First five rows of the training split as it currently stands.
train.head(n=5)

In [None]:
# First five rows of the test split as it currently stands.
test.head(n=5)

In [None]:
# First five rows of the validation split as it currently stands.
validation.head(n=5)

In [None]:
# Split the training frame into a label vector and a feature matrix,
# both cast to float32.
train_labels = train["isFraud"].to_numpy().astype(np.float32)
train_features = train.drop(columns="isFraud").to_numpy().astype(np.float32)

In [None]:
# Split the test frame into a label vector and a feature matrix,
# both cast to float32.
test_labels = test["isFraud"].to_numpy().astype(np.float32)
test_features = test.drop(columns="isFraud").to_numpy().astype(np.float32)

In [None]:
# Split the validation frame into a label vector and a feature matrix,
# both cast to float32.
validation_labels = validation["isFraud"].to_numpy().astype(np.float32)
validation_features = validation.drop(columns="isFraud").to_numpy().astype(np.float32)

In [None]:
# (rows, columns) of the training feature matrix.
np.shape(train_features)

In [None]:
def get_base_estimator(clf, sess, role):
    """Build a generic SageMaker ``Estimator`` for the built-in algorithm ``clf``.

    Args:
        clf: Algorithm name passed to ``get_image_uri`` and used as the
            S3 output prefix.
        sess: SageMaker session attached to the estimator.
        role: IAM role handed to the estimator.

    Returns:
        A ``sagemaker.estimator.Estimator`` configured for a single
        ``ml.m4.xlarge`` training instance, writing its output under
        ``s3://<bucket>/<clf>/output``. ``bucket`` is read from the
        notebook's global namespace.
    """
    region = boto3.Session().region_name
    image = get_image_uri(region, clf)
    artifacts_uri = 's3://{}/{}/output'.format(bucket, clf)

    return sagemaker.estimator.Estimator(
        image,
        role,
        train_instance_count=1,
        train_instance_type='ml.m4.xlarge',
        output_path=artifacts_uri,
        sagemaker_session=sess,
    )

In [None]:
def get_estimator(clf, sess, role):
    """Return a configured SageMaker estimator for the algorithm named ``clf``.

    Args:
        clf: One of ``'xgboost'``, ``'linear-learner'``, ``'knn'`` or
            ``'factorization-machines'``.
        sess: SageMaker session the estimator should use.
        role: IAM execution role the estimator should use.

    Returns:
        The estimator object for ``clf`` with its starting hyperparameters set.

    Raises:
        ValueError: If ``clf`` is not one of the supported algorithm names.
    """
    if clf == 'xgboost':
        # XGBoost goes through the generic Estimator built from its image URI.
        est = get_base_estimator(clf, sess, role)
        est.set_hyperparameters(max_depth=5,
                                eta=0.2,
                                gamma=4,
                                min_child_weight=6,
                                subsample=0.8,
                                silent=0,
                                objective='binary:logistic',
                                num_round=100)
        return est

    # Settings shared by the first-party estimator classes. The caller's role
    # and session are used so every algorithm runs under the same context
    # instead of each estimator looking the role up again on its own.
    common = dict(role=role,
                  train_instance_count=1,
                  train_instance_type='ml.m4.xlarge',
                  sagemaker_session=sess)

    if clf == 'linear-learner':
        # NOTE(review): num_classes is only meaningful for
        # predictor_type='multiclass_classifier'; kept as in the original
        # configuration -- confirm it can be dropped.
        return sagemaker.LinearLearner(predictor_type='binary_classifier',
                                       num_classes=2,
                                       **common)

    if clf == 'knn':
        return sagemaker.KNN(k=10,
                             predictor_type='classifier',
                             sample_size=200,
                             **common)

    if clf == 'factorization-machines':
        return sagemaker.FactorizationMachines(predictor_type='binary_classifier',
                                               num_factors=2,
                                               **common)

    # Fail loudly instead of falling through to an unbound local variable.
    raise ValueError('unsupported classifier: {!r}'.format(clf))

In [None]:
# TODO: add k-fold cross validation here
sess = sagemaker.Session()
role = get_execution_role()
client = boto3.client('sagemaker')
bucket = sess.default_bucket()

# CSV input descriptors for the train/ and test/ prefixes of the bucket.
s3_input_train, s3_input_test = (
    sagemaker.s3_input(s3_data=uri_template.format(bucket), content_type='csv')
    for uri_template in ('s3://{}/train', 's3://{}/test')
)

In [None]:
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import RecordSet

# Stand-alone LinearLearner estimator. Despite the variable name it is
# configured with predictor_type='binary_classifier'.
multiclass_estimator = sagemaker.LinearLearner(
    predictor_type='binary_classifier',
    num_classes=2,
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
    role=sagemaker.get_execution_role(),
)



In [None]:
# Turn the feature/label arrays into RecordSet objects, one per channel.
train_records = multiclass_estimator.record_set(
    train_features, labels=train_labels, channel='train')
test_records = multiclass_estimator.record_set(
    test_features, labels=test_labels, channel='test')

# Launch the training job on both channels.
multiclass_estimator.fit(records=[train_records, test_records])

In [None]:
def get_tuner(clf, est):
    """Build a ``HyperparameterTuner`` for estimator ``est``.

    The objective metric and the search ranges are selected by algorithm name.

    Args:
        clf: One of ``'xgboost'``, ``'knn'``, ``'linear-learner'`` or
            ``'factorization-machines'``.
        est: The estimator to tune.

    Returns:
        A ``HyperparameterTuner`` limited to 20 jobs, at most 3 in parallel.

    Raises:
        ValueError: If ``clf`` is not one of the supported algorithm names.
    """
    # NOTE(review): each objective metric is prefixed with a channel name
    # ('validation:' / 'test:'); presumably the tuning job must be fitted with
    # a channel of that name for the metric to be emitted -- confirm.
    if clf == 'xgboost':
        objective_metric_name = 'validation:auc'

        hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                                 'min_child_weight': ContinuousParameter(1, 10),
                                 'alpha': ContinuousParameter(0, 2),
                                 'max_depth': IntegerParameter(1, 10)}

    elif clf == 'knn':
        objective_metric_name = 'test:accuracy'

        hyperparameter_ranges = {'k': IntegerParameter(1, 1024),
                                 'sample_size': IntegerParameter(256, 20000000)}

    elif clf == 'linear-learner':
        objective_metric_name = 'test:recall'

        hyperparameter_ranges = {'l1': ContinuousParameter(0.0000001, 1),
                                 'use_bias': CategoricalParameter([True, False])}

    elif clf == 'factorization-machines':
        objective_metric_name = 'test:binary_classification_accuracy'

        # NOTE(review): bias_wd is searched over integers here; verify the
        # type and upper bound against the algorithm's tunable ranges.
        hyperparameter_ranges = {'bias_wd': IntegerParameter(1, 1000)}

    else:
        # Fail loudly instead of hitting an unbound local variable below.
        raise ValueError('unsupported classifier: {!r}'.format(clf))

    tuner = HyperparameterTuner(est,
                                objective_metric_name,
                                hyperparameter_ranges,
                                max_jobs=20,
                                max_parallel_jobs=3)

    return tuner

In [None]:
def run_training_job(clf):
    """Build the estimator and tuner for ``clf`` and start its tuning job.

    Reads ``sess``, ``role``, the S3 inputs and the feature/label arrays from
    the notebook's global namespace.

    Args:
        clf: Algorithm name understood by ``get_estimator`` and ``get_tuner``.

    Returns:
        The ``HyperparameterTuner`` whose ``fit`` was called.
    """
    est = get_estimator(clf, sess, role)
    tuner = get_tuner(clf, est)

    if clf == 'xgboost':
        # XGBoost reads CSV from S3. Its tuning objective is 'validation:auc',
        # so the held-out data has to arrive on a channel named 'validation'.
        tuner.fit({'train': s3_input_train, 'validation': s3_input_test})
    else:
        # The other algorithms take RecordSets built from the in-memory
        # arrays. Their tuning objectives are all 'test:*' metrics, so the
        # held-out data goes on the 'test' channel.
        train_records = est.record_set(train_features, train_labels, channel='train')
        test_records = est.record_set(test_features, test_labels, channel='test')
        tuner.fit([train_records, test_records])

    return tuner
    

In [None]:
def magic_loop(models_to_run):
    """Start a tuning job for every algorithm name in ``models_to_run``.

    Jobs are launched concurrently from worker threads. Threads are used
    rather than processes because the work is network-bound and the returned
    tuner objects hold live AWS sessions, which cannot be reliably pickled
    back from a child process.

    Args:
        models_to_run: Iterable of algorithm names accepted by
            ``run_training_job``.

    Returns:
        A list with the result of ``run_training_job`` for each name, in
        input order. Empty if ``models_to_run`` is empty.
    """
    from multiprocessing.pool import ThreadPool

    models = list(models_to_run)
    if not models:
        # A pool cannot be created with zero workers; nothing to do anyway.
        return []

    workers = min(len(models), multiprocessing.cpu_count())
    # The context manager tears the pool down even if a job raises.
    with ThreadPool(processes=workers) as pool:
        return pool.map(run_training_job, models)

In [None]:
# Algorithms to tune, in submission order.
clfs = [
    'xgboost',
    'linear-learner',
    'factorization-machines',
    'knn',
]
magic_loop(clfs)