In [None]:
%%sh
pip -q install pip --upgrade
pip -q install sagemaker awscli boto3 pandas --upgrade

In [None]:
# Ask the notebook front end to restart the kernel (presumably so the packages
# installed above are the ones imported below). The script calls the front end's
# `Jupyter.notebook` JavaScript API; the HTML object must stay the last expression
# of the cell so that it is rendered.
from IPython.core.display import HTML

restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
HTML(restart_script)

In [None]:
import os, sys, time
import numpy as np 
import pandas as pd
import boto3, sagemaker

# Report the SDK versions in use, one per line
print(boto3.__version__, sagemaker.__version__, sep='\n')

# SageMaker session, its default bucket, and the key prefix used for this demo
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = 'DEMO-xgboost-script'

# Execution role returned by the SDK; handed to the estimator further down
role = sagemaker.get_execution_role()

In [None]:
%%sh
# Download the archive (wget re-fetches only when the remote copy is newer than the
# local one), then unpack it, overwriting any previously extracted files.
ARCHIVE_URL=https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
wget --timestamping "$ARCHIVE_URL"
unzip -o "$(basename "$ARCHIVE_URL")"

In [None]:
data = pd.read_csv('./bank-additional/bank-additional-full.csv', sep=';')

# Remove dots in strings and column names
# to avoid Spark problems when baselining in SM Model Monitor.
# Column names use a literal (non-regex) replacement: recent pandas releases default
# `str.replace` to regex=False, under which the old pattern '\.' never matched.
data.columns = data.columns.str.replace('.', '_', regex=False)
# Cell values: raw-string regex, so the backslash is not an invalid string escape.
data = data.replace(to_replace=r'\.', value='_', regex=True)

# One-hot encode. The dtype is fixed so the indicator columns (and the label) are
# written to CSV as 0/1; recent pandas releases default to bool, i.e. True/False.
data = pd.get_dummies(data, dtype=np.uint8)
data = data.drop(columns=['y_no'])

# Move labels to first column, which is what SM Model Monitor expects
data = pd.concat([data['y_yes'], data.drop(columns=['y_yes'])], axis=1)

# Shuffle reproducibly, then split into training and validation (95/5).
# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes and also produced an unused empty third piece.
shuffled = data.sample(frac=1, random_state=123)
split_at = int(0.95 * len(shuffled))
train_data = shuffled.iloc[:split_at]
val_data = shuffled.iloc[split_at:]

# Save to CSV files
train_data.to_csv('training.csv', index=False, header=True, sep=',') # Need to keep column names
val_data.to_csv('validation.csv', index=False, header=True, sep=',')

In [None]:
# Preview the first five rows of the training split (label column comes first)
train_data.head(n=5)

In [None]:
# S3 location handed to the estimator as its output path
output = f"s3://{bucket}/{prefix}/output/"
print(output)

# Train and deploy on SageMaker

In [None]:
# Upload both CSV files through the SageMaker session, each under its own key
# prefix, and show the locations returned by the upload calls.
training = sess.upload_data(path="training.csv", key_prefix="{}/training".format(prefix))
validation = sess.upload_data(path="validation.csv", key_prefix="{}/validation".format(prefix))
print(training, validation, sep="\n")

In [None]:
from sagemaker.xgboost import XGBoost

# XGBoost framework estimator that runs `xgb.py` as its entry point.
# `instance_count` / `instance_type` are the SageMaker Python SDK 2.x parameter
# names; the former `train_instance_count` / `train_instance_type` spellings are
# deprecated.
xgb_estimator = XGBoost(
    entry_point='xgb.py',
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    framework_version='0.90-2',
    py_version='py3',
    output_path=output,
    hyperparameters={
        'max-depth': 5,
        'eval-metric': 'error',
    },
)

In [None]:
# Start training, passing the uploaded training and validation sets as named channels
xgb_estimator.fit(inputs=dict(training=training, validation=validation))

In [None]:
# Endpoint name = demo prefix followed directly by a UTC timestamp
deploy_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
xgb_endpoint_name = prefix + deploy_timestamp

# Deploy the trained model behind that endpoint on a single instance
xgb_predictor = xgb_estimator.deploy(
    endpoint_name=xgb_endpoint_name,
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
# Show the generated endpoint name; the invocation cell below refers to it
print(str(xgb_endpoint_name))

In [None]:
# Runtime client used to call the endpoint directly
smrt = boto3.client('sagemaker-runtime')

# Predict samples from the validation set: take the first 100 rows, drop the label
# column, and serialise them as header-less CSV without a trailing newline.
sample_features = val_data.iloc[:100].drop(columns=['y_yes'])
payload = sample_features.to_csv(header=False, index=False).rstrip()

print(payload)

In [None]:
# Send the CSV payload to the endpoint and print the raw response body
response = smrt.invoke_endpoint(
    ContentType='text/csv',
    Body=payload.encode('utf8'),
    EndpointName=xgb_endpoint_name,
)

raw_predictions = response['Body'].read()
print(raw_predictions)

In [None]:
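# Clean-up: uncomment and run the line below once you are done, so the endpoint
# deployed above does not keep running.
#sess.delete_endpoint(endpoint_name=xgb_endpoint_name)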