# Build Model with XGBoost Container

In the example we will use one of the Sagemaker training containers to train our model and write a trained model artefact to S3

In [27]:
# setting up SageMaker parameters
import sagemaker
import boto3

boto_session = boto3.Session()
region = boto_session.region_name
bucket_name = "telco-churn-seoul" #sagemaker.Session().default_bucket()
bucket_prefix = "xgboost-example"  # Location in the bucket to store our files
sgmk_session = sagemaker.Session()
sgmk_client = boto_session.client("sagemaker")
sgmk_role = sagemaker.get_execution_role()

print(sgmk_role)

arn:aws:iam::320389841409:role/service-role/AmazonSageMaker-ExecutionRole-20201022T141998


In [28]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("data/train.csv")
df_valid = pd.read_csv("data/validation.csv")
df_test = pd.read_csv("data/test.csv")

df_train.head()

Unnamed: 0,Senior Citizen,Tenure,Monthly Charges,Total Charges,Churn,Gender_Female,Gender_Male,Partner_No,Partner_Yes,Dependents_No,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,0,2,19.65,31.2,0,0,1,1,0,1,...,0,1,0,0,0,1,0,0,1,0
1,0,9,75.6,661.55,0,0,1,1,0,1,...,0,1,0,0,1,0,0,0,1,0
2,0,17,35.65,646.05,0,1,0,0,1,1,...,0,1,0,0,0,1,0,1,0,0
3,1,16,96.15,1529.2,1,1,0,1,0,1,...,1,1,0,0,0,1,0,0,1,0
4,0,2,79.95,174.45,0,0,1,1,0,1,...,0,1,0,0,0,1,0,0,1,0


## Load data to S3

For the XGboost training job it needs to pull the data from S3 directly.


In [29]:
# Upload CSV files to S3 for SageMaker training
train_uri = sgmk_session.upload_data(
    path="data/train.csv",
    bucket=bucket_name,
    key_prefix=bucket_prefix,
)
val_uri = sgmk_session.upload_data(
    path="data/validation.csv",
    bucket=bucket_name,
    key_prefix=bucket_prefix,
)


# Define the data input channels for the training job:
s3_input_train = sagemaker.inputs.TrainingInput(train_uri, content_type="csv")
s3_input_validation = sagemaker.inputs.TrainingInput(val_uri, content_type="csv")

print(f"{s3_input_train.config}\n\n{s3_input_validation.config}")

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://telco-churn-seoul/xgboost-example/train.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://telco-churn-seoul/xgboost-example/validation.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


## Determine the image you will be using

We'll be using SageMaker's built-in XGBoost Algorithm: Benefiting from performance-optimized, pre-implemented functionality like multi-instance parallelization, and support for multiple input formats.

In general to use the pre-built algorithms, we'll need to:

* Refer to the [Common Parameters docs](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html) to see the high-level configuration and what features each algorithm has
* Refer to the [algorithm docs](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) to understand the detail of the data formats and (hyper)-parameters it supports

From these docs, we'll understand what data format we need to upload to S3 (next), and how to get the container image URI of the algorithm... which is listed on the Common Parameters page but can also be extracted through the SDK:

In [30]:
# specify container
training_image = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.0-1")

print(training_image)

366743142698.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3


In [None]:
# Instantiate an XGBoost estimator object
estimator = sagemaker.estimator.Estimator(
    image_uri=training_image,  # XGBoost algorithm container
    instance_type="ml.m5.xlarge",  # type of training instance
    instance_count=1,  # number of instances to be used
    role=sgmk_role,  # IAM role to be used
    max_run=20*60,  # Maximum allowed active runtime
    use_spot_instances=True,  # Use spot instances to reduce cost
    max_wait=30*60,  # Maximum clock time (including spot delays)
)

# define its hyperparameters
estimator.set_hyperparameters(
    num_round=150,     # int: [1,300]
    max_depth=5,     # int: [1,10]
    alpha=2.5,         # float: [0,5]
    eta=0.5,           # float: [0,1]
    objective="binary:logistic",
)

# start a training (fitting) job
estimator.fit({ "train": s3_input_train, "validation": s3_input_validation })


# Find the trained model artefact

In [34]:
sm_boto3 = boto_session.client("sagemaker")

sm_boto3.describe_training_job(TrainingJobName=estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']

's3://sagemaker-ap-northeast-2-320389841409/sagemaker-xgboost-2021-06-16-12-41-25-643/output/model.tar.gz'

# 04 Batch Transform

Return to this section to create a batch transform job from the trained model.

In [36]:
sm_transformer = estimator.transformer(1, "ml.m4.xlarge")


In [37]:
data_to_score = "data/test.csv"

In [41]:
import pandas as pd
df = pd.read_csv(data_to_score)

In [42]:
df.head()

Unnamed: 0,Senior Citizen,Tenure,Monthly Charges,Total Charges,Churn,Gender_Female,Gender_Male,Partner_No,Partner_Yes,Dependents_No,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,0,1,59.85,59.85,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,1,0
1,0,42,20.75,844.45,0,1,0,1,0,1,...,0,0,0,1,0,1,1,0,0,0
2,0,55,79.4,4238.45,0,0,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,1,66,104.9,6891.45,0,1,0,0,1,0,...,1,0,1,0,0,1,0,0,1,0
4,1,47,86.05,3865.6,0,0,1,1,0,1,...,0,1,0,0,0,1,0,1,0,0


In [48]:
df.drop("Churn",inplace=True, axis=1)
df.head()

Unnamed: 0,Senior Citizen,Tenure,Monthly Charges,Total Charges,Gender_Female,Gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,0,1,59.85,59.85,0,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
1,0,42,20.75,844.45,1,0,1,0,1,0,...,0,0,0,1,0,1,1,0,0,0
2,0,55,79.4,4238.45,0,1,0,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,1,66,104.9,6891.45,1,0,0,1,0,1,...,1,0,1,0,0,1,0,0,1,0
4,1,47,86.05,3865.6,0,1,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0


In [49]:
df.to_csv("data/score.csv", header=False, index=False)

In [50]:
data_uri = sgmk_session.upload_data(
    path="data/score.csv",
    bucket=bucket_name,
    key_prefix=bucket_prefix,
)

In [51]:
data_uri

's3://telco-churn-seoul/xgboost-example/score.csv'

In [None]:
# start a transform job
sm_transformer.transform(data_uri, content_type="text/csv", split_type="Line")
sm_transformer.wait()

# Inspect the results

You retrieve the results written to the Transform job output_path 

In [53]:
import json
import io
from urllib.parse import urlparse

def get_csv_output_from_s3(s3uri, file_name):
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    prefix = parsed_url.path[1:]
    s3 = boto3.resource("s3")
    obj = s3.Object(bucket_name, "{}/{}".format(prefix, file_name))
    return obj.get()["Body"].read().decode("utf-8")

In [55]:
sm_transformer.output_path

's3://sagemaker-ap-northeast-2-320389841409/sagemaker-xgboost-2021-06-17-01-37-31-570'

In [56]:
output = get_csv_output_from_s3(sm_transformer.output_path, "{}.out".format("score.csv"))
output_df = pd.read_csv(io.StringIO(output), sep=",", header=None)
output_df.head(8)

Unnamed: 0,0
0,0.079208
1,0.013604
2,0.008018
3,0.049226
4,0.396948
5,0.017079
6,0.002958
7,0.003726
