# Score data with a batch transform job

This notebook shows an alternative way to run a batch transform: instead of reusing the estimator from the training notebook, we load the serialised model artefact from S3 and create the transform job from it.

In [3]:
import sagemaker
import boto3

# AWS plumbing: a single boto3 session supplies the region and the low-level SageMaker client.
boto_session = boto3.Session()
sgmk_client = boto_session.client("sagemaker")
region = boto_session.region_name

# SageMaker SDK session (used for the S3 upload) and the execution role passed to the model.
sgmk_session = sagemaker.Session()
sgmk_role = sagemaker.get_execution_role()

# S3 location used by this example.
bucket_name = "telco-churn-seoul"  # alternative: sagemaker.Session().default_bucket()
bucket_prefix = "xgboost-example"  # Location in the bucket to store our files


## Load Serialised Model

Note that this model artefact was produced by the XGBoost training job in Notebook 01

We load it into a container to create the transform job

In [4]:
# S3 URI of the model.tar.gz written by the training job. The bucket name embeds the
# original author's account id and region, so point this at your own training output.
model_artefact = 's3://sagemaker-ap-northeast-2-320389841409/sagemaker-xgboost-2021-06-16-12-41-25-643/output/model.tar.gz'

# Create the Inference Script

This script is used by the model container to load the serialised model object and prepare it for inference.
It is the bare minimum required. There is a great deal more that can be done to customise how
inference is performed, including pre-processing. See the documentation here: https://sagemaker.readthedocs.io/en/stable/frameworks/xgboost/using_xgboost.html

In [25]:
%%writefile inference.py

import pickle as pkl
import os

def model_fn(model_dir):
    """Load the serialised model object from the model directory.

    Args:
        model_dir: Directory containing the unpacked model artefact; the
            file named "xgboost-model" inside it is read.

    Returns:
        The object obtained by unpickling the "xgboost-model" file.
    """
    model_path = os.path.join(model_dir, "xgboost-model")
    # NOTE: unpickling executes arbitrary code, so only use this loader with
    # artefacts produced by your own, trusted training jobs.
    with open(model_path, "rb") as model_file:
        return pkl.load(model_file)


Overwriting inference.py


In [26]:
from sagemaker.xgboost.model import XGBoostModel

In [27]:
# Wrap the trained artefact in a SageMaker XGBoost model object.
# entry_point is the inference.py written by the %%writefile cell above, which supplies model_fn.
# NOTE(review): framework_version presumably has to match the XGBoost container version
# used by the training job that produced the artefact -- confirm against Notebook 01.
xgboost_model = XGBoostModel(
    model_data=model_artefact,
    role=sgmk_role,
    entry_point="inference.py",
    framework_version="1.0-1"
)

In [28]:
# Create a batch transformer for the model, backed by a single ml.m4.xlarge instance.
# No output_path is passed here; the results are read back later from transformer.output_path.
transformer = xgboost_model.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge"
)

# Prepare the data

In [29]:
# Local path (relative to the notebook's working directory) of the CSV with the rows to score.
data_to_score = "data/test.csv"

In [43]:
import pandas as pd
# NOTE(review): the first column loads as "Unnamed: 0", which looks like a row index saved by
# an earlier to_csv call. It is carried through into the scoring file as a feature, so confirm
# the model was trained with this column present.
df = pd.read_csv(data_to_score)

In [44]:
df.head()

Unnamed: 0,Senior Citizen,Tenure,Monthly Charges,Total Charges,Churn,Gender_Female,Gender_Male,Partner_No,Partner_Yes,Dependents_No,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,0,1,59.85,59.85,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,1,0
1,0,42,20.75,844.45,0,1,0,1,0,1,...,0,0,0,1,0,1,1,0,0,0
2,0,55,79.4,4238.45,0,0,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,1,66,104.9,6891.45,0,1,0,0,1,0,...,1,0,1,0,0,1,0,0,1,0
4,1,47,86.05,3865.6,0,0,1,1,0,1,...,0,1,0,0,0,1,0,1,0,0


In [45]:
# Remove the target column so that only feature columns are sent for scoring.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this cell
# idempotent: running it a second time no longer raises KeyError for the missing column.
df = df.drop(columns=["Churn"], errors="ignore")
df.head()

Unnamed: 0,Senior Citizen,Tenure,Monthly Charges,Total Charges,Gender_Female,Gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,0,1,59.85,59.85,0,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
1,0,42,20.75,844.45,1,0,1,0,1,0,...,0,0,0,1,0,1,1,0,0,0
2,0,55,79.4,4238.45,0,1,0,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,1,66,104.9,6891.45,1,0,0,1,0,1,...,1,0,1,0,0,1,0,0,1,0
4,1,47,86.05,3865.6,0,1,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0


In [46]:
# Write only the raw values: no header row and no pandas index column.
# NOTE(review): presumably the model container expects header-less CSV containing just the
# feature values -- confirm against the SageMaker XGBoost input format documentation.
df.to_csv("data/score.csv", header=False, index=False)

In [47]:
# Upload the scoring file under s3://<bucket_name>/<bucket_prefix>/ and keep the returned
# S3 URI, which is passed to the transform job as its input location.
data_uri = sgmk_session.upload_data(
    path="data/score.csv",
    bucket=bucket_name,
    key_prefix=bucket_prefix,
)

In [48]:
data_uri

's3://telco-churn-seoul/xgboost-example/score.csv'

# Run the transform job

In [None]:
# Start a batch transform job over the uploaded CSV.
# split_type="Line" asks SageMaker to split the input file on newlines, so each row is
# treated as a separate record.
transformer.transform(data_uri, content_type="text/csv", split_type="Line")
# Block until the job has finished so the output can be read in the next section.
transformer.wait()

..........

# Inspect the results

In [None]:
import json
import io
from urllib.parse import urlparse

def get_csv_output_from_s3(s3uri, file_name):
    """Download a text object from S3 and return its contents as a string.

    Args:
        s3uri: S3 URI of the "folder" holding the object, e.g.
            "s3://bucket/some/prefix". A trailing slash is tolerated, and a
            bare "s3://bucket" URI refers to the bucket root.
        file_name: Name of the object beneath that prefix.

    Returns:
        The object's body decoded as UTF-8.
    """
    parsed_url = urlparse(s3uri)
    # Strip surrounding slashes so that a trailing "/" on the URI, or a URI with no
    # prefix at all, does not yield keys such as "prefix//file" or "/file".
    prefix = parsed_url.path.strip("/")
    object_key = "{}/{}".format(prefix, file_name) if prefix else file_name
    s3 = boto3.resource("s3")
    obj = s3.Object(parsed_url.netloc, object_key)
    return obj.get()["Body"].read().decode("utf-8")


In [None]:
# The result object is fetched from the transformer's output location under the
# input file's name with ".out" appended.
output = get_csv_output_from_s3(
    s3uri=transformer.output_path,
    file_name="{}.out".format("score.csv"),
)
# The output file has no header row, so let pandas number the columns.
output_df = pd.read_csv(io.StringIO(output), sep=",", header=None)
output_df.head(8)