In [None]:
import re

import boto3
from sagemaker import get_execution_role

# Name of the S3 bucket that the cells below read from and write to.
bucket = 'baby-data'

# Role returned by get_execution_role(); handed to the estimator when it is built.
role = get_execution_role()

In [None]:
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.predictor import csv_serializer 

In [None]:
# Fetch mergedData.csv from the bucket and parse the response body as CSV.
s3 = boto3.client('s3')
obj = s3.get_object(Key='mergedData.csv', Bucket=bucket)
data = pd.read_csv(obj['Body'])

In [None]:
# Remove the 'id' column from the frame.
# errors='ignore' keeps this cell safe to re-run: `data` is rebound here, so on
# a second execution the column is already gone and a plain drop would raise
# KeyError.
data = data.drop(['id'], axis=1, errors='ignore')

In [None]:
# Ordered 70/30 split: the first 70% of rows go to training, the remainder to
# validation. Rows are not shuffled.
split_at = int(0.7 * len(data))
train_data = data.iloc[:split_at]
validation_data = data.iloc[split_at:]

In [None]:
# Write both splits to local CSV files without the index or a header row.
for frame, filename in ((train_data, 'train.csv'),
                        (validation_data, 'validation.csv')):
    frame.to_csv(filename, header=False, index=False)

In [None]:
# Upload the local CSV files to the bucket under the train/ and validation/ keys.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for local_file, s3_key in (('train.csv', 'train/train.csv'),
                           ('validation.csv', 'validation/validation.csv')):
    s3_bucket.Object(s3_key).upload_file(local_file)

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the 'xgboost' image URI for the region of the default boto3 session.
region = boto3.Session().region_name
container = get_image_uri(region, 'xgboost')

# CSV input channels pointing at the train/ and validation/ prefixes.
s3_input_train = sagemaker.s3_input(
    s3_data='s3://{}/train'.format(bucket),
    content_type='csv')
s3_input_validation = sagemaker.s3_input(
    s3_data='s3://{}/validation'.format(bucket),
    content_type='csv')

In [None]:
# Build the estimator, set its hyperparameters, and start training.
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/output'.format(bucket),
    sagemaker_session=sess)

# Hyperparameters collected in one place so they are easy to scan and tweak.
hyperparameters = dict(
    objective='binary:logistic',
    eval_metric='error',
    alpha=1.5,
    eta=.05,
    max_depth=8,
    min_child_weight=3.7,
    num_round=100)
xgb.set_hyperparameters(**hyperparameters)

# Channel names map to the S3 inputs defined earlier.
channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(channels)

In [None]:
# Deploy the trained estimator on one ml.m4.xlarge instance; the returned
# predictor is what the evaluation cells below call.
xgb_predictor = xgb.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1)

In [None]:
# Serialise requests as CSV. No deserializer is installed; predict() below
# decodes the response itself.
xgb_predictor.serializer = csv_serializer
xgb_predictor.content_type = 'text/csv'
xgb_predictor.deserializer = None

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` in batches and return the predictions as one array.

    Parameters
    ----------
    data : numpy.ndarray
        Rows to score; batches are taken along axis 0.
    rows : int, optional
        Maximum number of rows sent per request (default 500).
    predictor : object, optional
        Any object with a ``predict(batch)`` method that returns UTF-8 encoded
        bytes of delimited numbers. Defaults to the notebook-level
        ``xgb_predictor``, looked up when the function is called.

    Returns
    -------
    numpy.ndarray
        1-D float array with the predictions in input order. Empty input
        yields an empty array and no request is sent.

    Raises
    ------
    ValueError
        If a response contains a token that cannot be parsed as a float.
    """
    if predictor is None:
        predictor = xgb_predictor

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score; avoid sending an empty payload.
        return np.array([], dtype=float)

    # Ceiling division: just enough batches that none exceeds `rows`.
    n_batches = int(np.ceil(n_rows / float(rows)))

    scores = []
    for batch in np.array_split(data, n_batches):
        payload = predictor.predict(batch).decode('utf-8')
        # Accept newline-separated as well as comma-separated responses.
        tokens = payload.replace('\n', ',').split(',')
        scores.extend(float(token) for token in tokens if token.strip())

    return np.array(scores, dtype=float)

In [None]:
# Score the validation rows. `.values` replaces DataFrame.as_matrix(), which
# was deprecated and later removed from pandas; it returns the same ndarray.
# [:, 1:] drops the first column before sending rows to the endpoint --
# presumably the 'CSGM' label; TODO confirm the column order of the CSV.
predictions = predict(validation_data.values[:, 1:])

In [None]:
# Confusion matrix: actual 'CSGM' values against the rounded predictions.
predicted_labels = np.round(predictions)
pd.crosstab(
    index=validation_data['CSGM'],
    columns=predicted_labels,
    rownames=['actuals'],
    colnames=['predictions'])

In [None]:
# ROC-AUC Score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt   

In [None]:
# Area under the ROC curve for the validation predictions.
validation_auc = roc_auc_score(validation_data['CSGM'], predictions)
print("Validation AUC", validation_auc)

In [None]:
# ROC curve for the validation predictions. The AUC value is printed by the
# preceding cell and also appears in the legend, so it is not printed again.
fpr, tpr, thresholds = metrics.roc_curve(validation_data['CSGM'], predictions)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")

# Render the figure built above. The previous print(plt.figure()) opened a new
# empty figure and printed its repr instead.
plt.show()

In [None]:
# Delete the endpoint behind xgb_predictor once evaluation is finished.
endpoint_name = xgb_predictor.endpoint
sagemaker.Session().delete_endpoint(endpoint_name)