#### Training an XGBoost Classifier

Our approach was to develop an XGBClassifier to predict the "NEXT ACTION".

In [1]:
import boto3
import sagemaker
import json
from sagemaker import get_execution_role
from sagemaker.sklearn import SKLearnModel
#from sagemaker.estimator import Estimator
from sagemaker import image_uris
from datetime import datetime
import pytz
import ast
from sagemaker.local import LocalSession
from sagemaker.model import Model
from sagemaker.tuner import HyperparameterTuner
from sklearn.metrics import classification_report
import numpy as np

  import scipy.sparse


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# S3 location of the project artifacts (datasets live under s3://{Bucket}/{Prefix}/...).
Bucket = 'sagemaker-us-east-1-254050731868'
Prefix = 'starbucks-capstone'

# AWS context shared by the rest of the notebook.
role = get_execution_role()
region = boto3.Session().region_name
sess = sagemaker.Session()

# Low-level runtime client used to call the deployed endpoint directly.
sagemaker_runtime = boto3.client(
    'sagemaker-runtime',
    region_name=region,
)

# To experiment in local mode instead, replace `sess` with:
#   sess = LocalSession()
#   sess.config = {'local': {'local_code': True}}

#### Read the Best Model

In [3]:
# Resolve the scikit-learn 1.2-1 container image URI for the current region.
image_uri = image_uris.retrieve(framework="sklearn", region=region, version="1.2-1")

# Re-attach to the already-finished hyperparameter tuning job by name.
tuning_job_name = 'tn-stb-event-sep2124-194742'
tuning_job = HyperparameterTuner.attach(tuning_job_name, sagemaker_session=sess)
best_job = tuning_job.best_training_job()

# Fetch the best estimator once and reuse it: calling best_estimator() twice
# repeated the attach step (the recorded output showed its log printed twice).
best_estimator = tuning_job.best_estimator()
best_hyperparameters = best_estimator.hyperparameters()
model_data = best_estimator.model_data  # S3 URI of the model.tar.gz artifact

print('model data: ', model_data)
print('Best Job: ', best_job)
print('Best HyperParameters' ,best_hyperparameters)


2024-09-21 19:54:33 Starting - Found matching resource for reuse
2024-09-21 19:54:33 Downloading - Downloading the training image
2024-09-21 19:54:33 Training - Training image download completed. Training in progress.
2024-09-21 19:54:33 Uploading - Uploading generated training model
2024-09-21 19:54:33 Completed - Resource reused by training job: tn-stb-event-sep2124-194742-011-e01417b1

2024-09-21 19:54:33 Starting - Found matching resource for reuse
2024-09-21 19:54:33 Downloading - Downloading the training image
2024-09-21 19:54:33 Training - Training image download completed. Training in progress.
2024-09-21 19:54:33 Uploading - Uploading generated training model
2024-09-21 19:54:33 Completed - Resource reused by training job: tn-stb-event-sep2124-194742-011-e01417b1
model data:  s3://sagemaker-us-east-1-254050731868/tn-stb-event-sep2124-194742-009-f3051caa/output/model.tar.gz
Best Job:  tn-stb-event-sep2124-194742-009-f3051caa
Best HyperParameters {'_tuning_objective_metric': 't

#### Deploying the model

In [4]:
# Wrap the best tuning artifact in a deployable scikit-learn model object.
# The entry-point script and its requirements file are shipped with the model.
model = SKLearnModel(
    entry_point='code/train.py',
    dependencies=['code/requirements.txt'],
    model_data=model_data,
    framework_version='1.2-1',
    role=role,
    sagemaker_session=sess,
)

In [5]:
# Host the model on a single ml.t2.medium instance behind the 'ep-starbucks' endpoint.
predictor = model.deploy(
    endpoint_name='ep-starbucks',
    instance_type='ml.t2.medium',
    initial_instance_count=1,
)

-----------------!

In [6]:
import pandas as pd

# Train/test splits stored as CSV under the project prefix; the first CSV
# column is used as the index.
train_path = f's3://{Bucket}/{Prefix}/dataset/train.csv'
test_path = f's3://{Bucket}/{Prefix}/dataset/test.csv'

train = pd.read_csv(train_path, index_col=0)
test  = pd.read_csv(test_path, index_col=0)

# 'event' is the target column; everything else is a feature.
X_train = train.drop(columns='event')
y_train = train['event']

X_test  = test.drop(columns='event')
y_test  = test['event']

# Fixed seed so a Restart & Run All sends the same rows to the endpoint.
SAMPLE_SEED = 42

# Build request payloads as lists of records (one dict per row).
# json.loads is used for both: the text produced by to_json() is JSON, not a
# Python literal, so ast.literal_eval would fail on null / true / false.
one_sample = json.loads(
    X_test.sample(1, random_state=SAMPLE_SEED).to_json(orient='records')
)
multiple_samples = json.loads(
    X_test.sample(10, random_state=SAMPLE_SEED).to_json(orient='records')
)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



#### Testing One Sample

In [7]:
# Score a single record through the deployed endpoint.
endpoint_name = 'ep-starbucks'
payload = json.dumps(one_sample)

response = sagemaker_runtime.invoke_endpoint(
    Body=payload,
    ContentType='application/json',
    EndpointName=endpoint_name,
)

# The body is a byte stream holding a JSON document.
raw_body = response['Body'].read()
result = json.loads(raw_body.decode())

print(f"Prediction of next events and probabilities: {result}")

Prediction of next events and probabilities: {'offer completed': {'0': 0.5074740648269653}, 'offer received': {'0': 0.10439745336771011}, 'offer viewed': {'0': 0.09287305176258087}, 'transaction': {'0': 0.2952553927898407}}


#### Testing Multiple Samples

In [8]:
# Score several records in one request through the deployed endpoint.
endpoint_name = 'ep-starbucks'
payload = json.dumps(multiple_samples)

response = sagemaker_runtime.invoke_endpoint(
    Body=payload,
    ContentType='application/json',
    EndpointName=endpoint_name,
)

# The body is a byte stream holding a JSON document.
raw_body = response['Body'].read()
result = json.loads(raw_body.decode())

print(f"Prediction of next events and probabilities: {result}")

Prediction of next events and probabilities: {'offer completed': {'0': 0.19169123470783234, '1': 0.35546839237213135, '2': 0.4267372190952301, '3': 0.04460670426487923, '4': 0.09158255904912949, '5': 0.005726163741201162, '6': 0.31745645403862, '7': 0.021664030849933624, '8': 0.47849637269973755, '9': 0.007402719929814339}, 'offer received': {'0': 0.10461727529764175, '1': 0.3411425054073334, '2': 0.2173684984445572, '3': 0.11153170466423035, '4': 0.5586006045341492, '5': 0.07815248519182205, '6': 0.5117647051811218, '7': 0.16847339272499084, '8': 0.16314852237701416, '9': 0.24545027315616608}, 'offer viewed': {'0': 0.5800002217292786, '1': 0.05237748473882675, '2': 0.047427013516426086, '3': 0.11416944116353989, '4': 0.08814860880374908, '5': 0.7870533466339111, '6': 0.054051000624895096, '7': 0.6684191823005676, '8': 0.0929161086678505, '9': 0.6000007390975952}, 'transaction': {'0': 0.12369128316640854, '1': 0.2510116696357727, '2': 0.308467298746109, '3': 0.7296921610832214, '4': 0.

#### Predict all the test samples through the endpoint

In [9]:
# Score the whole test set through the endpoint in fixed-size batches.
endpoint_name = 'ep-starbucks'

batch_size = 500  # rows per request
batch_results = []
for i in range(0, len(X_test), batch_size):
    # Positional slice of the next batch of rows.
    X_test_batch = X_test.iloc[i:i + batch_size]
    # to_json already yields the JSON text to send; no loads/dumps round trip needed.
    payload = X_test_batch.to_json(orient='records')
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=payload,
    )
    # The response maps each class name to {row position: probability}.
    result = pd.DataFrame(json.loads(response['Body'].read().decode()))
    batch_results.append(result)

# Stack the per-batch frames into one frame with a fresh 0..n-1 index.
# NOTE(review): this relies on the endpoint returning rows in request order — confirm.
results = pd.concat(batch_results).reset_index(drop=True)
assert len(results) == len(X_test), "endpoint returned a different number of rows than were sent"
results

Unnamed: 0,offer completed,offer received,offer viewed,transaction
0,0.359530,0.421374,0.059271,0.159825
1,0.591181,0.097414,0.042019,0.269387
2,0.054186,0.139624,0.623495,0.182695
3,0.175706,0.114152,0.598048,0.112094
4,0.027842,0.633491,0.076225,0.262442
...,...,...,...,...
46918,0.529892,0.007429,0.081100,0.381579
46919,0.035760,0.426389,0.112181,0.425670
46920,0.446557,0.283921,0.081939,0.187582
46921,0.067514,0.455138,0.096399,0.380949


In [11]:
# Predicted class for each row = position of the column holding the highest
# probability. The positions index into results.columns, so
# results.columns[position] recovers the event name if labels are needed.
y_pred = results.apply(np.argmax, axis='columns')

In [12]:
# Encode the true event labels as the position of the matching probability
# column in `results`. This reads the columns from `results` rather than from
# the leftover loop variable `result`, and derives the class count from the
# data instead of hard-coding 4.
label_to_position = {label: position for position, label in enumerate(results.columns)}
y = y_test.map(label_to_position)

In [13]:
# Per-class precision / recall / F1 as a table, one row per class or average.
# The integer labels are column positions in `results`; their count is taken
# from the columns instead of a hard-coded 4.
class_names = list(results.columns)
report = classification_report(
    y,
    y_pred,
    labels=range(len(class_names)),
    target_names=class_names,
    output_dict=True,
)
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
offer completed,0.283614,0.707753,0.404953,5314.0
offer received,0.502248,0.590424,0.542778,9837.0
offer viewed,0.538857,0.830563,0.653641,9083.0
transaction,0.788466,0.281414,0.414785,22689.0
accuracy,0.500778,0.500778,0.500778,0.500778
macro avg,0.528296,0.602538,0.504039,46923.0
weighted avg,0.622971,0.500778,0.48674,46923.0


In [14]:
# Tear down the endpoint created by model.deploy(...) now that evaluation is done.
predictor.delete_endpoint()