In [None]:
# setup
# S3 bucket that receives the train/validation/test CSV files and the
# training output of every model fitted in this notebook.
bucket = 'baby-data'

import boto3
import re
from sagemaker.amazon.amazon_estimator import get_image_uri
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.predictor import csv_serializer 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt  
from sqlalchemy import create_engine

# Accumulator for the (actual label, predicted probability) pairs gathered
# from every held-out recording.
pred_data = pd.DataFrame(columns=['actual', 'predicted'])

# Low-level S3 client.
s3 = boto3.client('s3')


In [None]:
import sys
# Install the PyMySQL driver with the interpreter that runs this kernel
# (sys.executable), so the package lands in the kernel's own environment.
# It backs the 'mysql+pymysql://' URL used when the database engine is created.
!{sys.executable} -m pip install pymysql

In [None]:
import config # this is a file on my local computer that has my database login information
from urllib.parse import quote

# Percent-encode the credentials before placing them in the URL: a raw '@',
# ':', '/' or '%' inside the username or password would otherwise be parsed
# as URL structure and break (or silently misroute) the connection.
db_user = quote(str(config.login['username']), safe='')
db_password = quote(str(config.login['password']), safe='')

db_engine = create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.format(
    db_user, db_password, config.login['host'], '3306', config.login['database']))

# Connection reused by the query cell below.
connection = db_engine.connect()

In [None]:
# query data from database
# Each generated-feature row is joined to its raw row to pick up the label
# (isCSGM) and the recording it belongs to; rows are ordered by raw timestamp.
query = ("Select Raw.isCSGM, Raw.recording_id, Generated.* "
         "FROM Accelerometer_Generated Generated "
         "INNER JOIN Accelerometer_Raw Raw "
         "ON Raw.id = Generated.raw_id "
         "ORDER BY Raw.timestamp")
data = pd.read_sql(query, connection)

# DataFrame.drop returns a new frame, so the result has to be assigned back;
# otherwise the database keys and calibrated columns stay in `data` and end up
# being used as model features.
data = data.drop(['id', 'raw_id', 'left_arm_x_calibrated', 'left_arm_y_calibrated','left_arm_z_calibrated',
                  'right_arm_x_calibrated', 'right_arm_y_calibrated','right_arm_z_calibrated',
                  'left_leg_x_calibrated', 'left_leg_y_calibrated','left_leg_z_calibrated',
                  'right_leg_x_calibrated', 'right_leg_y_calibrated','right_leg_z_calibrated'], axis=1)

# Show a small preview instead of the whole frame.
data.head()

In [None]:
# Distinct recording ids, in order of first appearance; each one is held out
# once as the test set in the evaluation loop.
recording_id_list = data['recording_id'].unique()

In [None]:
# Leave-one-recording-out evaluation.
# For every recording: hold it out as the test set, train an XGBoost model on
# the first 70% of each remaining recording (validating on the last 30%),
# predict the held-out recording and accumulate (actual, predicted) pairs.

# Continue from previously saved predictions when the file exists; start from
# an empty frame on a fresh run instead of failing.
try:
    pred_data = pd.read_csv('./pred_data.csv')
except FileNotFoundError:
    pred_data = pd.DataFrame(columns=['actual', 'predicted'])

# `role` is not defined anywhere else in this notebook; fall back to the
# execution role of the notebook instance unless one was already provided.
if 'role' not in globals():
    role = sagemaker.get_execution_role()

# At least one recording must remain for training after holding one out.
if len(recording_id_list) < 2:
    raise ValueError('leave-one-out evaluation needs at least two recordings')

# Loop-invariant setup, created once instead of on every iteration.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
container = get_image_uri(boto3.Session().region_name, 'xgboost')
s3_input_train = sagemaker.s3_input(s3_data = 's3://{}/train'.format(bucket), content_type = 'csv')
s3_input_validation = sagemaker.s3_input(s3_data = 's3://{}/validation'.format(bucket), content_type = 'csv')
sess = sagemaker.Session()


def predict(data, rows=500, predictor=None):
    """Score a feature matrix against a deployed endpoint in batches.

    data: 2-D array of feature rows (no label column).
    rows: approximate number of rows sent per request.
    predictor: endpoint predictor to call; defaults to the notebook-level
        `xgb_predictor` so existing `predict(data)` calls keep working.

    Returns a 1-D numpy array parsed from the comma-separated responses.
    """
    if predictor is None:
        predictor = xgb_predictor
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    responses = [predictor.predict(array).decode('utf-8') for array in split_array]
    return np.fromstring(','.join(responses), sep=',')


for recording_id in recording_id_list:
    # hold out one baby's data for testing
    test_data = data.loc[data['recording_id'] == recording_id]
    test_data = test_data.drop(['recording_id'], axis=1)
    test_data = test_data.reset_index(drop=True)

    # we want 70% of each baby's data in training and 30% of each baby's data in validation
    train_parts = []
    validation_parts = []
    for inner_recording_id in recording_id_list:
        # make sure we aren't using the test data
        if inner_recording_id == recording_id:
            continue
        # Select the *inner* recording; filtering on the held-out id here
        # would train and validate on the very data the model is tested on.
        innerBabyData = data.loc[data['recording_id'] == inner_recording_id]
        innerBabyData = innerBabyData.reset_index(drop=True)
        split_at = int(0.7 * len(innerBabyData))
        train_parts.append(innerBabyData.iloc[:split_at])
        validation_parts.append(innerBabyData.iloc[split_at:])

    train_data = pd.concat(train_parts).drop(['recording_id'], axis=1)
    validation_data = pd.concat(validation_parts).drop(['recording_id'], axis=1)

    # Headerless CSVs with the label (isCSGM) in the first column.
    train_data.to_csv('train.csv', index=False, header=False)
    validation_data.to_csv('validation.csv', index=False, header=False)
    # test.csv has to be written before it can be uploaded below.
    test_data.to_csv('test.csv', index=False, header=False)

    # copy the file to S3
    s3_bucket.Object('train/train.csv').upload_file('train.csv')
    s3_bucket.Object('validation/validation.csv').upload_file('validation.csv')
    s3_bucket.Object('test/test.csv').upload_file('test.csv')

    # train
    xgb = sagemaker.estimator.Estimator(container,
                                       role,
                                       train_instance_count=1,
                                       train_instance_type='ml.m4.xlarge',
                                       output_path='s3://{}/output'.format(bucket),
                                       sagemaker_session=sess)
    xgb.set_hyperparameters(objective='binary:logistic',
                            eval_metric='error',
                            alpha=1.5,
                            eta=.05,
                            max_depth=8,
                            min_child_weight=3.7,
                            num_round=100)

    xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

    # evaluate model
    xgb_predictor = xgb.deploy(initial_instance_count=1,
                               instance_type='ml.m4.xlarge')
    try:
        xgb_predictor.content_type = 'text/csv'
        xgb_predictor.serializer = csv_serializer
        xgb_predictor.deserializer = None

        # Column 0 is the label, so only the remaining columns are sent.
        predictions = predict(test_data.values[:, 1:], predictor=xgb_predictor)
    finally:
        # delete endpoint, even when prediction fails, so that no billable
        # endpoint is left running
        sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

    # concat actual data and prediction
    predictionDF = pd.DataFrame(data=predictions)
    addData = pd.concat([test_data['isCSGM'], predictionDF], axis=1, ignore_index=True)
    addData = addData.rename(columns={0: "actual", 1: "predicted"})

    # add to pred_data (updated every iteration so partial results survive
    # a failure in a later fold)
    pred_data = pd.concat([pred_data, addData])

    # format() works for both string and numeric recording ids
    print('finished: {}'.format(recording_id))

In [None]:
# save pred_data to notebook
pred_data.to_csv('pred_data.csv', index=False, header=True)
# boto3.Session().resource('s3').Bucket(bucket).Object('eval/pred_data.csv').upload_file('pred_data.csv')

In [None]:
## Evaluate predictions from all models

In [None]:
# confusion matrix: actual labels (rows) against predictions rounded to the
# nearest integer (columns)
actual_labels = pred_data['actual']
rounded_predictions = pred_data['predicted'].round(0)
pd.crosstab(index=actual_labels,
            columns=rounded_predictions,
            rownames=['actuals'],
            colnames=['predictions'])

In [None]:
# ROC-AUC Score
# Computed over the pooled predictions of all held-out recordings.
y_true = list(pred_data['actual'])
y_score = list(pred_data['predicted'])

print("Validation AUC", roc_auc_score(y_true, y_score))
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Draw on an explicit figure/axes pair so that exactly one figure is created.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")

# Render the plot; calling plt.figure() here would open a second, empty
# figure and print only its repr.
plt.show()