# Inspired by these blog posts:
https://towardsdatascience.com/xgboost-in-amazon-sagemaker-28e5e354dbcd
https://github.com/awslabs/amazon-sagemaker-examples/blob/master/hyperparameter_tuning/xgboost_random_log/hpo_xgboost_random_log.ipynb
https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_abalone.ipynb

https://aws.amazon.com/blogs/machine-learning/amazon-sagemaker-automatic-model-tuning-now-supports-random-search-and-hyperparameter-scaling/
https://sagemaker.readthedocs.io/en/stable/tuner.html

Train xgboost with Python SDK

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
import io
import os
import datetime as dt
import pickle as pkl
import boto3

import sagemaker
from sagemaker import get_execution_role
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner, IntegerParameter
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer, json_deserializer

In [2]:
# Execution role returned by the SageMaker SDK and the region of the current boto3 session.
role = get_execution_role()
region = boto3.Session().region_name

# S3 bucket and object key of the raw data set.
bucket = 'kaggle.sf.crime'
filename = 'raw_data.csv'

In [9]:

# Read the raw CSV directly from S3, parsing the 'Dates' column as datetimes.
raw_data_uri = 's3://{}/{}'.format(bucket, filename)
raw_data = pd.read_csv(raw_data_uri, parse_dates=['Dates'], low_memory=False)

## preprocess the data

In [10]:
# Hour of day for each record; stored on raw_data so later cells can read raw_data.hour.
hour = raw_data.Dates.dt.hour
raw_data['hour'] = hour

# One-hot encode the calendar month. The dummy columns are reindexed to 1..12
# before being labelled so the twelve month columns always exist, in calendar
# order, even when a month is missing from the data. Assigning twelve labels
# positionally to fewer columns would raise a length-mismatch error.
MONTH_LABELS = ['Jan', 'Feb', 'March', 'April', 'May', 'June',
                'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month = (
    pd.get_dummies(raw_data.Dates.dt.month)
    .reindex(columns=range(1, 13), fill_value=0)
)
month.columns = MONTH_LABELS

# One-hot encode the police district.
neighborhood = pd.get_dummies(raw_data.PdDistrict)


In [11]:
# Encode the crime category labels as integer class ids (cData).
CrimeData = preprocessing.LabelEncoder().fit(raw_data['Category'])
cData = CrimeData.transform(raw_data['Category'])

In [12]:
# Feature frame: month dummies, district dummies and hour, followed by the
# encoded target and the two coordinate columns.
concat_df = pd.concat([month, neighborhood, raw_data['hour']], axis=1).assign(
    crime=cData,
    Y=raw_data['Y'],
    X=raw_data['X'],
)

In [13]:
# Ensure the target variable is in the first column.
cols = list(concat_df.columns)
cols.remove('crime')
cols.insert(0, 'crime')
concat_df = concat_df.loc[:, cols]

# Number of distinct crime classes.
len(concat_df['crime'].unique())

## Split into training/validation sets (80/20), then upload both CSV files to the S3 bucket with `upload_file`

In [16]:
# 80/20 train/validation split with a fixed random seed.
training, validation = train_test_split(
    concat_df,
    train_size=0.8,
    random_state=3,
)



In [93]:
# Write both splits locally as CSV without index or header row.
for split_frame, local_path in ((training, 'training.csv'),
                                (validation, 'validation.csv')):
    split_frame.to_csv(local_path, index=False, header=False)

In [95]:
# Upload the local CSV files to their train/ and validation/ keys in the bucket.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('train/train.csv').upload_file('training.csv')
s3_bucket.Object('validation/validation.csv').upload_file('validation.csv')

In [3]:
# Training input channels pointing at the train/ and validation/ prefixes in S3.
train_prefix = 's3://{}/train/'.format(bucket)
validation_prefix = 's3://{}/validation/'.format(bucket)

s3_input_train = sagemaker.s3_input(s3_data=train_prefix, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validation_prefix, content_type='csv')

# Train the SageMaker XGBoost model with hyperparameter tuning

In [32]:
sess = sagemaker.Session()
container = get_image_uri(region, 'xgboost', '0.90-1')

# Estimator for the XGBoost container; model artifacts go to the output/ prefix.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/output/'.format(bucket),
    sagemaker_session=sess,
)

# Hyperparameters held fixed across all tuning jobs.
# num_class must be specified for a multiclass target, otherwise training does
# not work. (Author's note: the learning rate could not be changed, for reasons
# that were not established.)
static_hyperparameters = {
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'num_round': 100,  # 30-53 minutes when tuning two hyperparameters, very long
    'colsample_bytree': 1,
    'gamma': 1.2,
    'seed': 2,
    'num_class': len(concat_df['crime'].unique()),
}
xgb.set_hyperparameters(**static_hyperparameters)

# Metric the tuner optimizes.
objective_metric_name = 'validation:merror'

In [33]:
# Search space explored by the tuner.
hyperparameter_ranges = dict(
    subsample=ContinuousParameter(0.5, 1),
    max_depth=IntegerParameter(3, 10),
)

In [34]:
# Random-search tuner over hyperparameter_ranges, minimizing the validation
# objective. (The variable is called `gridsearch`, but the strategy is 'Random'.)
# max_parallel_jobs was 10, above the max_jobs budget of 5. It is capped to the
# budget because more jobs than max_jobs can never run concurrently.
gridsearch = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges,
    objective_type='Minimize',
    max_jobs=5,
    max_parallel_jobs=5,
    early_stopping_type='Auto',
    strategy='Random',
)

# Launch the tuning job on the train and validation channels.
gridsearch.fit(
    {'train': s3_input_train, 'validation': s3_input_validation},
    include_cls_metadata=False,
)

In [35]:
# Check the current status of the tuning job.
tuning_job_description = boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=gridsearch.latest_tuning_job.job_name
)
tuning_job_description['HyperParameterTuningJobStatus']

'InProgress'

In [50]:
# Per-training-job results of the tuning job as a DataFrame.
tuning_analytics = sagemaker.HyperparameterTuningJobAnalytics(
    gridsearch.latest_tuning_job.job_name
)
tuning_analytics.dataframe()

Unnamed: 0,FinalObjectiveValue,TrainingElapsedTimeSeconds,TrainingEndTime,TrainingJobName,TrainingJobStatus,TrainingStartTime,subsample
0,0.726957,140.0,2020-02-04 20:21:00+00:00,sagemaker-xgboost-200204-2016-004-ac2e1966,Stopped,2020-02-04 20:18:40+00:00,0.826949
1,0.714692,2984.0,2020-02-04 21:08:26+00:00,sagemaker-xgboost-200204-2016-003-888ebeff,Completed,2020-02-04 20:18:42+00:00,0.91856
2,0.716856,3227.0,2020-02-04 21:12:11+00:00,sagemaker-xgboost-200204-2016-002-a6dfadb9,Completed,2020-02-04 20:18:24+00:00,0.75084
3,0.729605,118.0,2020-02-04 20:21:00+00:00,sagemaker-xgboost-200204-2016-001-6bf72778,Stopped,2020-02-04 20:19:02+00:00,0.501434


## Automatically attach to the best training job from the tuning job (`gridsearch`)

In [56]:
# Re-attach to the tuning job, which consists of multiple training jobs.
xgb_tuning_job_name = gridsearch.latest_tuning_job.job_name
xgb_tuner = HyperparameterTuner.attach(xgb_tuning_job_name)

# Look up the name of the best training job in the tuning job and attach an
# estimator to it.
xgb_best_job = xgb_tuner.best_training_job()
print(xgb_best_job)

xgb_tuned = sagemaker.estimator.Estimator.attach(xgb_best_job)

sagemaker-xgboost-200204-2016-003-888ebeff
2020-02-04 21:08:26 Starting - Preparing the instances for training
2020-02-04 21:08:26 Downloading - Downloading input data
2020-02-04 21:08:26 Training - Training image download completed. Training in progress.
2020-02-04 21:08:26 Uploading - Uploading generated training model
2020-02-04 21:08:26 Completed - Training job completed[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value merror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter _tuning_objective_metric value validation:merror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sag

## Alternative: manually attach to the best training job from the tuning job (`gridsearch`)
# Training job picked by hand as the one with the best eval metric.
job_name = 'sagemaker-xgboost-200204-1841-002-fc901d44'

# Attach an estimator to that training job.
xgb_tuned = sagemaker.estimator.Estimator.attach(job_name)

In [6]:
# Deploy the attached model behind a single-instance endpoint.
xgb_tuned.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
)

-----------------------!

<sagemaker.predictor.RealTimePredictor at 0x7f9b1b9e8a20>

In [7]:
# Build a predictor for the deployed endpoint and configure it for CSV requests.
# deploy() names the endpoint after the estimator's training job when no
# endpoint name is passed. The name is therefore taken from xgb_tuned, not from
# job_name, which exists only if the manual-attach cell was run and may not
# match the estimator that was actually deployed.
endpoint_name = xgb_tuned.latest_training_job.name
endpt_predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)
endpt_predictor.content_type = 'text/csv'
endpt_predictor.serializer = csv_serializer
# No deserializer: predict() hands back the raw response for the caller to decode.
endpt_predictor.deserializer = None

In [17]:
# Feature columns are every column except the leading target column.
select_cols = cols[1:]
print(select_cols)

# Send the last ten validation rows to the endpoint; the response bytes are
# decoded and split on commas into one string per prediction.
arr_val = validation[select_cols].tail(10).values
raw_response = endpt_predictor.predict(arr_val)
pred_result = raw_response.decode("utf-8").split(',')

['Jan', 'Feb', 'March', 'April', 'May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'hour', 'Y', 'X']


In [31]:
# Parse the predictions as floats, then truncate them to integer class ids.
pred_result = [float(value) for value in pred_result]
results = list(map(int, pred_result))
results

[16, 20, 21, 7, 19, 16, 20, 7, 16, 16]