# Score data with a batch transform job

This is an alternative approach to running a batch transform: we load the previously serialised model artefact and use it to create the transform job.

In [8]:
import sagemaker
import boto3

# --- AWS sessions, clients and the execution role used throughout the notebook ---
boto_session = boto3.Session()
sgmk_session = sagemaker.Session()
sgmk_client = boto_session.client("sagemaker")
sgmk_role = sagemaker.get_execution_role()
region = boto_session.region_name  # region configured for the default boto3 session

# --- S3 locations ---
# Alternative bucket choice: sagemaker.Session().default_bucket()
bucket_name = "telco-churn-seoul"
bucket_prefix = "xgboost-example"  # Location in the bucket to store our files


## Load Serialised Model

Note that this model artefact was produced by the XGBoost training job in Notebook 01.

We will use this model artefact to create the batch transform job.

In [9]:
# S3 URI of the trained model archive (model.tar.gz) that the batch transform model is built from.
# NOTE(review): this is hard-coded to one specific training-job output — update it after
# re-training or when running against a different bucket.
model_artefact = 's3://sagemaker-ap-northeast-2-320389841409/sagemaker-xgboost-2021-06-16-12-41-25-643/output/model.tar.gz'

In [10]:
# S3 location that the transform job writes its prediction files to.
batch_output = "s3://{}/{}/batch-inference".format(
    bucket_name,
    bucket_prefix,
)

# Create the Inference Script

This script will be used by the model container to load the serialised object and prepare it for inference.
This is the bare minimum requirement. There is a great deal more that can be done to customise how the
inference process is performed, including pre-processing. See the documentation here: https://sagemaker.readthedocs.io/en/stable/frameworks/xgboost/using_xgboost.html

In [14]:
# Resolve the container image URI for XGBoost version 1.0-1 in the current region.
container = sagemaker.image_uris.retrieve(framework="xgboost", region=region, version="1.0-1")


In [15]:
# Pair the existing model artefact with the XGBoost container image; nothing is
# trained here, the Model object only describes what to deploy.
model = sagemaker.model.Model(
    image_uri=container,
    model_data=model_artefact,
    role=sgmk_role,
)

In [16]:
# Transformer for the model: a single ml.m4.xlarge instance, results written to batch_output.
transformer = model.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=batch_output,
)


# Prepare the data

In [17]:
# Local CSV file containing the records to be scored.
data_to_score = "data/test.csv"

In [18]:
import pandas as pd
# Load the records to score into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_to_score)

In [19]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,Senior Citizen,Tenure,Monthly Charges,Total Charges,Churn,Gender_Female,Gender_Male,Partner_No,Partner_Yes,Dependents_No,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,0,1,59.85,59.85,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,1,0
1,0,42,20.75,844.45,0,1,0,1,0,1,...,0,0,0,1,0,1,1,0,0,0
2,0,55,79.4,4238.45,0,0,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,1,66,104.9,6891.45,0,1,0,0,1,0,...,1,0,1,0,0,1,0,0,1,0
4,1,47,86.05,3865.6,0,0,1,1,0,1,...,0,1,0,0,0,1,0,1,0,0


In [20]:
# Remove the label so that only model features are sent for scoring.
# The frame is reassigned rather than mutated in place, and errors="ignore" is
# used, so this cell can be re-run safely and also accepts scoring data that
# has no "Churn" column at all (the in-place drop raised KeyError in both cases).
df = df.drop(columns=["Churn"], errors="ignore")
df.head()

Unnamed: 0,Senior Citizen,Tenure,Monthly Charges,Total Charges,Gender_Female,Gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,0,1,59.85,59.85,0,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
1,0,42,20.75,844.45,1,0,1,0,1,0,...,0,0,0,1,0,1,1,0,0,0
2,0,55,79.4,4238.45,0,1,0,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,1,66,104.9,6891.45,1,0,0,1,0,1,...,1,0,1,0,0,1,0,0,1,0
4,1,47,86.05,3865.6,0,1,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0


In [21]:
# Write the features as a bare CSV: no header row and no index column.
df.to_csv(
    "data/score.csv",
    index=False,
    header=False,
)

In [22]:
# Upload the scoring file to S3 under the project prefix; upload_data returns the object's S3 URI.
data_uri = sgmk_session.upload_data(path="data/score.csv", bucket=bucket_name, key_prefix=bucket_prefix)

In [23]:
# Show the S3 URI returned by the upload.
data_uri

's3://telco-churn-seoul/xgboost-example/score.csv'

# Run the transform job

In [None]:
# start a transform job (earlier form of the call, kept for reference only; the next cell runs the job)
#transformer.transform(data_uri, content_type="text/csv", split_type="Line")


In [24]:
# Start the batch transform over the uploaded CSV, splitting the input by line,
# then block until the job has finished.
transformer.transform(
    data_uri,
    data_type="S3Prefix",
    content_type="text/csv",
    split_type="Line",
)
transformer.wait()

..............................
[34m[2021-06-18:04:13:22:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-06-18:04:13:22:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-06-18:04:13:22:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

  

In [25]:
import json
import io
from urllib.parse import urlparse

def get_csv_output_from_s3(s3uri, file_name, s3_resource=None):
    """Download a text object from S3 and return its contents as a string.

    Args:
        s3uri: S3 URI of the "folder" holding the object, for example
            ``s3://bucket/some/prefix``. A trailing slash is tolerated and
            the prefix may be empty (``s3://bucket``).
        file_name: Name of the object relative to ``s3uri``.
        s3_resource: Optional object exposing ``Object(bucket, key)`` in the
            way a boto3 S3 resource does. When omitted, a new
            ``boto3.resource("s3")`` is created.

    Returns:
        The object's body decoded as UTF-8.

    Errors raised by the S3 request (for example when the object does not
    exist) are not caught here and propagate to the caller.
    """
    parsed_url = urlparse(s3uri)
    # Strip surrounding slashes so that "s3://bucket/prefix/" does not yield a
    # "prefix//file" key and "s3://bucket" does not yield a "/file" key.
    prefix = parsed_url.path.strip("/")
    key = "{}/{}".format(prefix, file_name) if prefix else file_name
    if s3_resource is None:
        s3_resource = boto3.resource("s3")
    s3_object = s3_resource.Object(parsed_url.netloc, key)
    return s3_object.get()["Body"].read().decode("utf-8")

In [26]:
# Show where the transform job writes its results.
transformer.output_path

's3://telco-churn-seoul/xgboost-example/batch-inference'

In [27]:
# The results are read from "<input file name>.out" under the transformer's output path.
scored_file_name = "{}.out".format("score.csv")
output = get_csv_output_from_s3(transformer.output_path, scored_file_name)

# The output has no header row, so the columns are numbered.
output_buffer = io.StringIO(output)
output_df = pd.read_csv(output_buffer, sep=",", header=None)
output_df.head(8)

Unnamed: 0,0
0,0.079208
1,0.013604
2,0.008018
3,0.049226
4,0.396948
5,0.017079
6,0.002958
7,0.003726
