In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import datetime as dt
import pickle as pkl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

import sagemaker
import boto3
from sagemaker.predictor import csv_serializer
from sagemaker import get_execution_role

In [77]:
role = get_execution_role()
region = boto3.Session().region_name  

In [78]:
bucket = 'aws-emr-resources-507786327009-us-east-1'
prefix = 'taxidata_v6'

In [79]:
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

In [80]:
from sagemaker.amazon.amazon_estimator import get_image_uri

sess = sagemaker.Session()

container = get_image_uri(region, 'xgboost', '0.90-1')

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)


In [81]:
xgb.set_hyperparameters(max_depth=2,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='reg:squarederror',
                        num_round=70,
                        eval_metric='rmse')

In [None]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2020-04-05 11:39:51 Starting - Starting the training job...
2020-04-05 11:39:53 Starting - Launching requested ML instances....

# Test

In [67]:
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None

---------------!

In [29]:
test_file_ = 'part-00000-67622f1f-1e3b-4629-8da4-e92d79a27399-c000.csv'

In [26]:
test_bucket_ = 's3://{}/{}/test/test.csv/part-00000-3af93797-4b2e-4e1e-a3ea-b060bd2b218b-c000.csv'.format(bucket, prefix)

In [27]:
!aws s3 cp $test_bucket_ .

download: s3://aws-emr-resources-507786327009-us-east-1/taxidata2/test/test.csv/part-00000-3af93797-4b2e-4e1e-a3ea-b060bd2b218b-c000.csv to ./part-00000-3af93797-4b2e-4e1e-a3ea-b060bd2b218b-c000.csv


In [68]:
test_data_df = pd.read_csv(test_file_).head(100)

In [72]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [73]:
predictions = predict(test_data_df.values[:, :-1])
print(predictions)

[113.1368866  113.1368866  181.86711121 157.83950806 126.46881866
 108.34696198 113.86780548  74.41907501  75.03672028 117.08781433
 114.34994507 123.1421051  172.13899231 172.13899231  98.21212769
  73.9143219   84.66641998 113.72432709 103.52446747 105.35676575
 103.52446747  73.9143219  166.65112305 111.9879837  111.9879837
 102.25985718 125.74923706 139.85775757 122.38574982 160.06062317
  88.25262451 113.1368866  109.02374268  73.9143219  112.94224548
 112.94224548 118.9678421  118.46308899 157.38226318  80.04853058
  85.30313873 112.38722992 107.36790466  78.02746582  73.9143219
  78.02746582 130.6973877  130.6973877  136.1852417  109.61118317
 113.72432709 109.61118317 109.61118317 117.08781433 123.59664154
 154.00265503 113.72432709  78.02746582  73.9143219  172.13899231
  74.41907501 113.72432709  74.41907501  73.78356934  70.83407593
 113.86851501 103.52446747  73.9143219   78.53221893  78.02746582
  84.66641998  98.21212769 166.65112305 120.96929932 120.96929932
 135.9752807

In [74]:
print(test_data_df.values[:, -1])

[0.   0.   0.   3.99 0.   2.19 0.   1.45 0.   0.   0.   0.   0.   4.06
 1.96 1.36 0.   0.   3.7  0.   0.   4.14 0.   0.   0.   1.96 0.   7.29
 0.   0.   0.   0.   1.56 0.   1.76 1.66 0.   0.   0.   0.   0.   6.
 5.08 0.   0.96 0.   0.   0.   6.2  1.2  0.   0.   0.   0.   0.   0.
 0.   0.   0.01 2.5  1.76 0.   0.   0.   3.   2.2  1.08 1.36 0.   0.
 0.   1.96 0.   1.16 1.56 2.   1.36 0.   0.   2.36 0.   4.46 2.   0.
 2.32 1.36 0.   4.26 0.   1.   0.   1.56 1.36 0.   2.86 2.   4.   1.26
 0.   4.  ]


In [75]:
test_data_df.head()

Unnamed: 0,95,56,1,1.52,1.1,9.3,2,1.2,8.0,0.0,...,20,10,41,2017.1,3.1,20.1,10.1,49,1.4194,0.0.1
0,95,95,1,0.98,1,8.3,2,1.0,7.0,0.0,...,20,10,12,2017,3,20,10,19,1.275,0.0
1,95,28,1,1.69,1,8.8,2,1.0,7.5,0.0,...,20,10,51,2017,3,20,10,57,1.0444,0.0
2,255,52,1,4.3,1,15.3,2,1.0,14.0,0.0,...,20,10,12,2017,3,20,10,25,2.0528,0.0
3,255,49,1,2.9,1,17.29,1,1.0,12.0,0.3325,...,20,10,49,2017,3,20,11,3,2.3111,3.99
4,226,82,1,3.06,1,12.8,2,1.0,11.5,0.0,...,20,10,9,2017,3,20,10,22,2.1111,0.0


In [53]:
a

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [55]:
a[:-1]

[0, 1, 2, 3, 4, 5, 6, 7, 8]