In [37]:
import boto3
import sagemaker

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import mxnet as mx

from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

%matplotlib inline

In [38]:
from sagemaker import get_execution_role
# Look up the IAM role this notebook runs under; the resulting ARN string is
# handed to the Estimator further down so the training job can assume it.
role = get_execution_role()
# Bare expression so the notebook echoes the resolved ARN.
role

'arn:aws:iam::166807176553:role/service-role/AWSGlueServiceSageMakerNotebookRole-User-Data'

In [39]:
# Keep the source location under its own name so `dataset` only ever refers
# to the loaded DataFrame.
dataset_uri = 's3://ufo-ml-project/ufo_fullset.csv'
dataset = pd.read_csv(dataset_uri)

# Preview the first rows of the raw sightings table.
dataset.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,1977-03-31,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,1982-11-15,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,1992-12-07,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2011-02-21,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,1991-03-09,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [40]:
# Sanity check: (row count, column count) of the frame loaded above.
dataset.shape

(18000, 15)

In [41]:
# Columns used as model inputs; identifying fields (names, timestamps) are
# intentionally left out of the feature set.
feature_columns = [
    'shape',
    'duration',
    'witnesses',
    'weather',
    'latitude',
    'longitude',
    'physicalEvidence',
    'contact',
]
X = dataset[feature_columns]

# Target kept as a one-column DataFrame (double brackets), not a Series.
y = dataset[['researchOutcome']]

In [42]:
# One-hot encode the categorical features.
#
# * `astype` with a mapping returns a new frame, so nothing is assigned into
#   `X` while it is still a slice of `dataset` (the per-column assignment
#   form triggers pandas' SettingWithCopyWarning).
# * `dtype=np.uint8` pins the indicator columns to 0/1 integers. Older pandas
#   used uint8 by default, but pandas >= 2.0 emits booleans, which would be
#   written to the training CSVs as True/False instead of 1/0.
# * Remaining string columns (physicalEvidence, contact) are encoded by
#   get_dummies as well, exactly as before.
categorical_dtypes = {'shape': 'category', 'weather': 'category'}
X = pd.get_dummies(X.astype(categorical_dtypes), dtype=np.uint8)

print(X)

       duration  witnesses   latitude   longitude  shape_box  shape_circle  \
0             4          1  47.329444 -122.578889          0             1   
1             4          1  52.664913   -1.034894          0             0   
2            49          1  38.951667  -92.333889          0             1   
3            13          1  41.496944  -71.367778          0             0   
4            17          1  47.606389 -122.330833          0             1   
...         ...        ...        ...         ...        ...           ...   
17995        95         10  42.033333  -87.733333          0             0   
17996        55         10  43.004444  -71.348889          0             1   
17997        39         10  36.866389  -83.888889          0             1   
17998        28         10  35.385833  -94.398333          0             0   
17999        38         10  29.883056  -97.941111          0             1   

       shape_disk  shape_light  shape_oval  shape_pyramid  ... 

In [43]:
# Integer class ids for the three research outcomes (matches num_class=3 in
# the training hyperparameters).
dependent_label_map = {"explained": 0, "probable": 1, "unexplained": 2}

# Direct dictionary indexing: an outcome missing from the map raises KeyError
# rather than being silently encoded.
encoded_outcomes = [dependent_label_map[outcome] for outcome in y['researchOutcome']]
y = pd.DataFrame(encoded_outcomes, columns=['researchOutcome'])
print(y)

       researchOutcome
0                    0
1                    0
2                    0
3                    0
4                    0
...                ...
17995                2
17996                0
17997                0
17998                2
17999                0

[18000 rows x 1 columns]


In [44]:
# Place the label in the first column, followed by the encoded features.
# The headerless CSVs written from this frame depend on that ordering
# (SageMaker's built-in XGBoost treats the first CSV column as the target).
data = pd.concat([y, X], axis='columns')
data.head()

Unnamed: 0,researchOutcome,duration,witnesses,latitude,longitude,shape_box,shape_circle,shape_disk,shape_light,shape_oval,...,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy,physicalEvidence_N,physicalEvidence_Y,contact_N,contact_Y
0,0,4,1,47.329444,-122.578889,0,1,0,0,0,...,0,0,0,1,0,0,1,0,1,0
1,0,4,1,52.664913,-1.034894,0,0,1,0,0,...,0,0,1,0,0,0,0,1,1,0
2,0,49,1,38.951667,-92.333889,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,0,13,1,41.496944,-71.367778,0,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
4,0,17,1,47.606389,-122.330833,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [45]:
# Shuffle once with a fixed seed, then cut 70% / 20% / 10% into
# train / validation / test.
# Positional slices replace np.split(DataFrame, ...): the numpy call routes
# through DataFrame.swapaxes, which recent pandas releases deprecate, while
# iloc slicing yields the same three partitions.
shuffled = data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled))
validation_end = int(0.9 * len(shuffled))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Headerless, index-free CSVs for the training job; the label is the first
# column of `data`. The test partition stays in memory for later evaluation.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

In [46]:
bucket = 'ufo-ml-project'
prefix = 'processed-data'

# S3 keys always use '/', so build them with string formatting rather than
# os.path.join (which would insert the OS path separator, e.g. '\\' on
# Windows). One bucket handle is reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/train/train.csv'.format(prefix)).upload_file('train.csv')
s3_bucket.Object('{}/validation/validation.csv'.format(prefix)).upload_file('validation.csv')

# Training channels point at the "folder" prefixes. Both end with '/' so a
# prefix match cannot pick up sibling keys such as 'train-old/...'.
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

In [50]:
from sagemaker import estimator
import sagemaker

# Resolve the XGBoost image for the region this session actually runs in.
# The training job is launched in the session's region, so a hard-coded
# region would pull a mismatched image URI when the notebook is run elsewhere.
# 'us-west-2' remains the fallback if no region is configured.
region = boto3.Session().region_name or 'us-west-2'
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")

# Three-way classification (see dependent_label_map); multi:softmax makes the
# model emit the predicted class id directly.
hyperparameters = {"num_class": 3, "num_round": 50, "objective": "multi:softmax"}

xgboost_estimator = estimator.Estimator(
    role=role,
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='ml.c4.xlarge',
    output_path='s3://ufo-ml-project/classifier-results/',
    volume_size=5,
)

In [51]:
%%time
xgboost_estimator.fit({'train': s3_input_train, 'validation': s3_input_validation})

2021-01-14 22:11:40 Starting - Starting the training job...
2021-01-14 22:12:03 Starting - Launching requested ML instancesProfilerReport-1610662300: InProgress
......
2021-01-14 22:13:03 Starting - Preparing the instances for training......
2021-01-14 22:14:04 Downloading - Downloading input data...
2021-01-14 22:14:25 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV inp

In [53]:
%time
xgb_predictor = xgboost_estimator.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    serializer = CSVSerializer())

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
-----------------!

In [62]:
from sklearn.metrics import accuracy_score

def predict(data, rows=500, predictor=None):
    """Score `data` against an endpoint in batches and return the predictions.

    Parameters
    ----------
    data : numpy.ndarray
        2-D feature matrix, one observation per row (no label column).
    rows : int, default 500
        Upper bound on the number of rows sent in a single request.
    predictor : object, optional
        Anything exposing ``predict(batch) -> bytes``. Defaults to the
        notebook-level ``xgb_predictor`` created by ``deploy()``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one prediction per input row, in input order.
        Empty input yields an empty array without calling the endpoint.
    """
    if predictor is None:
        # Resolved lazily so the function can be defined before deployment.
        predictor = xgb_predictor

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score; avoid sending an empty payload to the endpoint.
        return np.array([], dtype=float)

    # Smallest number of batches such that none exceeds `rows` rows.
    n_batches = int(np.ceil(n_rows / float(rows)))

    values = []
    for batch in np.array_split(data, n_batches):
        response = predictor.predict(batch).decode('utf-8')
        # Accept comma- or newline-delimited responses and ignore empty
        # fields (e.g. from a trailing delimiter).
        tokens = response.replace('\n', ',').split(',')
        values.extend(float(token) for token in tokens if token.strip())

    return np.array(values, dtype=float)

# Column 0 of the held-out frame is the label; everything after it is the
# feature matrix sent to the endpoint.
test_matrix = test_data.to_numpy()
predictions = predict(test_matrix[:, 1:])
true_results = test_data['researchOutcome'].to_numpy()

# Fraction of held-out rows whose predicted class equals the true class.
accuracy_score(y_true=true_results, y_pred=predictions)

0.9361111111111111

In [63]:
# Tear down the hosted endpoint once evaluation is done; after this call
# xgb_predictor can no longer serve predictions.
xgb_predictor.delete_endpoint()