# SageMaker XGBoost demo

## Import libraries

In [1]:
import sagemaker
import boto3
import numpy as np
from sagemaker import get_execution_role
from sagemaker.session import Session

from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

import pandas as pd
from sklearn.model_selection import train_test_split

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/jovyan/.config/sagemaker/config.yaml


## Setup AWS resources

In [2]:
# Region in which every AWS resource of this demo is created.
region = "ap-southeast-1"
boto_session = boto3.Session(region_name=region)

# One SageMaker session, built once, is shared by the upload, training and deployment cells.
sagemaker_session = sagemaker.Session(boto_session=boto_session)

# IAM role handed to the training job (see the Estimator below).
role = "arn:aws:iam::891377389768:role/cloudos-687e76105c45270e09d6eec1-session-role"

# S3 bucket and key prefix under which the CSV splits and the training output are stored.
bucket = "demo-sagemaker-lifebit-ai"
prefix = "sagemaker/xgboost-example"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/jovyan/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/jovyan/.config/sagemaker/config.yaml


## Fetch dataset

In [4]:
# Load the diabetes CSV straight from its URL into a DataFrame.
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

# Remove the listed columns when they exist; names absent from the frame are skipped.
cols_to_delete = ["paticipant_id"]
df = df.drop(columns=cols_to_delete, errors="ignore")

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Split training/testing data

In [6]:
outcome_col = "Outcome"

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


def _label_first(frame, label=outcome_col):
    """Return `frame` with the `label` column moved to the first position."""
    ordered = [label] + [name for name in frame.columns if name != label]
    return frame[ordered]


train_df = _label_first(train_df)
test_df = _label_first(test_df)

# Write both splits without index or header row.
# NOTE(review): label-first, header-less CSV is presumably the layout the built-in
# XGBoost container expects — confirm against the algorithm documentation.
for split_frame, csv_name in ((train_df, "train.csv"), (test_df, "test.csv")):
    split_frame.to_csv(csv_name, index=False, header=False)

## Stage data for SageMaker

In [7]:
# Upload both splits and keep the S3 URIs returned by upload_data, so the training
# inputs always follow `bucket` / `prefix` instead of a hand-copied literal URI.
s3_train = sagemaker_session.upload_data("train.csv", bucket=bucket, key_prefix=prefix + "/train")
s3_test = sagemaker_session.upload_data("test.csv", bucket=bucket, key_prefix=prefix + "/test")

# Describe each uploaded file as a CSV input channel for the training job.
train_input = TrainingInput(s3_data=s3_train, content_type="text/csv")
test_input = TrainingInput(s3_data=s3_test, content_type="text/csv")

## Train the model

In [8]:
# Resolve the XGBoost (version 1.5-1) container image for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto_session.region_name,
    version="1.5-1",
)

# Model artifacts are written next to the staged data, under the same prefix.
training_output = f"s3://{bucket}/{prefix}/output"

xgb = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=training_output,
    sagemaker_session=sagemaker_session,
)

# Binary classifier returning a logistic score; 50 boosting rounds.
hyperparameters = {
    "objective": "binary:logistic",
    "num_round": 50,
    "max_depth": 5,
    "eta": 0.2,
    "subsample": 0.8,
}
xgb.set_hyperparameters(**hyperparameters)

# The held-out split is supplied as the "validation" channel.
channels = {"train": train_input, "validation": test_input}
xgb.fit(channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-07-23-06-18-45-588


2025-07-23 06:18:45 Starting - Starting the training job...
2025-07-23 06:19:01 Starting - Preparing the instances for training...
2025-07-23 06:19:21 Downloading - Downloading input data...
2025-07-23 06:20:07 Downloading - Downloading the training image......
2025-07-23 06:21:13 Training - Training image download completed. Training in progress.
2025-07-23 06:21:13 Uploading - Uploading generated training model...
2025-07-23 06:21:26 Completed - Training job completed
..Training seconds: 124
Billable seconds: 124


## Deploy model

In [9]:
# Deploy the trained model to a single-instance endpoint; the predictor sends
# requests as CSV and parses CSV responses.
predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-07-23-06-21-57-094
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-07-23-06-21-57-094
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-07-23-06-21-57-094


------!

## Set up data for prediction

### Without SageMaker
Without SageMaker, we would have to run the notebook on the same machine as the model, or set up a server to handle communication between the notebook and the model. That would involve:
- Configuring a GPU-accelerated server
- Configuring the model so that the server can access it
- Running an HTTP server
- Setting up a domain name and TLS encryption
- Exposing API endpoints to communicate with the model via the server
- Writing request management code on both the client and the server side

### With SageMaker
With the model deployed, SageMaker takes care of generating API endpoints, managing requests, etc.  
From the perspective of a user, the predictor is just a function (`predictor.predict()`).  
However, it is running in AWS on ML-optimized compute instances.

In [13]:
def predict(series, outcome_col=outcome_col, binary=True, threshold=0.5):
    """Score a single row with the deployed endpoint.

    Args:
        series: Row (pandas Series) containing at least every column of ``df``
            other than ``outcome_col``; extra entries are ignored.
        outcome_col: Name of the label column, which is left out of the request.
        binary: When True, return a 0/1 label; otherwise return the score.
        threshold: Score strictly above which the binary label is 1.

    Returns:
        ``int`` 0 or 1 when ``binary`` is True, else the score rounded to
        two decimals.
    """
    feature_cols = [col for col in df.columns if col != outcome_col]
    row = series[feature_cols].astype(str).tolist()
    prediction_raw = predictor.predict([row])  # Remote call to the SageMaker endpoint
    # The score is read from the first field of the first row of the response.
    prediction = float(prediction_raw[0][0])
    if binary:
        # Compare the unrounded score: rounding first would turn e.g. 0.504 into
        # 0.5 and label it 0 even though it lies above the threshold.
        return int(prediction > threshold)
    # Rounding is applied for display only.
    return np.round(prediction, 2)

## Evaluate results

The model's `predict()` method returns the probability that its input belongs to the positive class it was trained to detect.  
In this case, the model predicts the presence of diabetes from 8 input variables.

In [20]:
outcome_prediction_col = "PredictedOutcome"
# Call predict once per test row and store the returned labels in a new column.
predicted_labels = test_df.apply(predict, axis=1)
test_df = test_df.assign(**{outcome_prediction_col: predicted_labels})

As a rough estimate, the model predicts diabetes with about 71% accuracy on the held-out test rows.

In [21]:
# Accuracy = share of test rows whose predicted label equals the actual outcome.
n_rows = len(test_df)
n_correct = (test_df[outcome_col].values == test_df[outcome_prediction_col].values).sum()
np.round(n_correct / n_rows, 3)

0.714

From here, we can investigate the rows where the predicted outcome differs from the actual outcome:

In [22]:
# Keep only the rows where the prediction disagrees with the recorded outcome.
is_wrong = test_df[outcome_col] != test_df[outcome_prediction_col]
test_df.loc[is_wrong]

Unnamed: 0,Outcome,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,PredictedOutcome
668,0,6,98,58,33,190,34.0,0.43,43,1
473,0,7,136,90,0,0,29.9,0.21,50,1
204,0,6,103,72,32,190,37.7,0.324,55,1
336,0,0,117,0,0,0,33.8,0.932,44,1
568,0,4,154,72,29,126,31.3,0.338,37,1
148,0,5,147,78,0,0,33.7,0.218,65,1
667,1,10,111,70,27,0,27.5,0.141,40,0
212,0,7,179,95,31,0,34.2,0.164,60,1
199,1,4,148,60,27,318,30.9,0.15,29,0
213,1,0,140,65,26,130,42.6,0.431,24,0


## Cleanup

In [25]:
# Remove what the deploy step created: the model first, then the endpoint.
for release in (predictor.delete_model, predictor.delete_endpoint):
    release()

INFO:sagemaker:Deleting model with name: sagemaker-xgboost-2025-07-23-06-21-57-094
INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2025-07-23-06-21-57-094
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2025-07-23-06-21-57-094
