In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import datetime as dt
import pickle as pkl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

import sagemaker
import boto3
from sagemaker.predictor import csv_serializer
from sagemaker import get_execution_role

In [None]:
# Execution role reported by the SageMaker SDK; it is handed to the estimator further down.
role = get_execution_role()

# Region of the default boto3 session; used later to look up the algorithm image.
boto_session = boto3.Session()
region = boto_session.region_name

In [None]:
# Look up the row keyed 'validation' in the DynamoDB table; a later cell reads the
# S3 prefix out of this response.
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
dynamo_bucket_table = dynamodb.Table('taxi_training_data_location')
lookup_key = {'bucketid': 'validation'}
response = dynamo_bucket_table.get_item(Key=lookup_key)

In [None]:
# Bucket that holds the train/validation/test data and receives the model output.
bucket = 'aws-emr-resources-507786327009-us-east-1'

# DynamoDB's get_item leaves the 'Item' key out of the response entirely when no row
# matches the requested key, so check for it explicitly and fail with an actionable
# message instead of a bare "KeyError: 'Item'". The exception type is unchanged.
item = response.get('Item')
if item is None or 'prefix' not in item:
    raise KeyError(
        "DynamoDB lookup returned no item with a 'prefix' attribute; "
        "check the table name and key used in get_item"
    )

# Key prefix under which the data folders live inside the bucket.
prefix = str(item['prefix'])

In [None]:
# Training and validation channels: both are CSV data under the same bucket/prefix.
train_uri = 's3://{}/{}/train'.format(bucket, prefix)
validation_uri = 's3://{}/{}/validation/'.format(bucket, prefix)

s3_input_train = sagemaker.s3_input(s3_data=train_uri, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validation_uri, content_type='csv')

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# SageMaker session object that is passed to the estimator below.
sess = sagemaker.Session()

# Image URI for the 'xgboost' algorithm, version '0.90-1', in the notebook's region.
container = get_image_uri(region, 'xgboost', '0.90-1')

# Training output is written next to the input data, under <prefix>/output.
model_output_path = 's3://{}/{}/output'.format(bucket, prefix)

# Estimator describing the training job: one ml.m4.xlarge instance.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=model_output_path,
    sagemaker_session=sess,
)


In [None]:
# Hyperparameters for the training job, collected in one mapping so they can be
# inspected or tweaked before being applied to the estimator.
xgb_hyperparameters = {
    'max_depth': 2,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'reg:squarederror',
    'num_round': 70,
    'eval_metric': 'rmse',
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Start training, passing the two S3 inputs under their channel names.
data_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(data_channels)

# Test: deploy the trained model and score a sample of the test data

In [None]:
# Deploy the trained model to a single-instance endpoint.
endpoint_instance_type = 'ml.m4.xlarge'
xgb_predictor = xgb.deploy(instance_type=endpoint_instance_type,
                           initial_instance_count=1)

# Requests are serialized as CSV; with no deserializer set, predict() hands back the
# raw response, which the predict() helper decodes itself.
xgb_predictor.serializer = csv_serializer
xgb_predictor.content_type = 'text/csv'
xgb_predictor.deserializer = None

In [None]:
# File name of the test CSV to download; it is appended to the test folder URI and
# later used as the local path passed to pd.read_csv.
# NOTE(review): this looks like a generated part-file name with an embedded UUID,
# while `prefix` is looked up dynamically -- the constant may go stale if the test
# data is regenerated. Confirm, or discover the file name from the bucket instead.
test_file_ = 'part-00000-67622f1f-1e3b-4629-8da4-e92d79a27399-c000.csv'

In [None]:
# Full S3 URI of the test file: <bucket>/<prefix>/test/test.csv/<file name>.
test_uri_template = 's3://{}/{}/test/test.csv/{}'
test_bucket_ = test_uri_template.format(bucket, prefix, test_file_)

In [None]:
# Copy the test file from S3 into the current working directory with the AWS CLI.
# IPython substitutes the Python variable `test_bucket_` for $test_bucket_ before
# the command is passed to the shell.
!aws s3 cp $test_bucket_ .

In [None]:
# Read the downloaded CSV and keep only its first SAMPLE_ROWS rows.
SAMPLE_ROWS = 100
test_data_df = pd.read_csv(test_file_).iloc[:SAMPLE_ROWS]

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` in batches and return the parsed predictions.

    The input is split into batches of at most ``rows`` rows, each batch is sent
    through ``predictor.predict`` and the textual responses are parsed into one
    flat float array, concatenated in batch order.

    Parameters
    ----------
    data : numpy.ndarray
        Array with one row per record to score (only ``data.shape[0]`` and
        row-wise splitting are relied on here).
    rows : int, default 500
        Maximum number of rows sent in a single request.
    predictor : object, optional
        Any object exposing ``predict(batch)`` that returns the response as
        ``bytes`` (UTF-8) or ``str``. Defaults to the notebook-level
        ``xgb_predictor``, which keeps existing ``predict(data)`` calls working.

    Returns
    -------
    numpy.ndarray
        1-D float array of predictions; empty when ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``rows`` is smaller than 1, or a response token is not a number.
    """
    if rows < 1:
        raise ValueError('rows must be a positive integer, got {!r}'.format(rows))
    if predictor is None:
        # Fall back to the endpoint predictor created earlier in the notebook.
        predictor = xgb_predictor

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score: skip the endpoint call entirely.
        return np.array([], dtype=float)

    # Ceiling division: exactly as many batches as needed, so an input whose length
    # is a multiple of `rows` does not trigger an extra request.
    n_batches = int(-(-n_rows // rows))

    values = []
    for batch in np.array_split(data, n_batches):
        raw = predictor.predict(batch)
        text = raw.decode('utf-8') if isinstance(raw, bytes) else str(raw)
        # Accept both comma- and newline-separated responses; drop empty tokens
        # left behind by trailing separators.
        values.extend(
            float(token)
            for token in text.replace('\n', ',').split(',')
            if token.strip()
        )

    return np.array(values, dtype=float)

In [None]:
# Send every column except the last one to the endpoint.
# NOTE(review): this assumes the last column of the test CSV is the target and that
# the file has a header row (pd.read_csv consumed the first line as column names).
# Confirm this matches the column layout used for the training data.
test_features = test_data_df.values[:, :-1]
predictions = predict(test_features)
print(predictions)