In [1]:
import boto3
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter
from sagemaker.estimator import Estimator
from sagemaker import image_uris
from sagemaker.serializers import CSVSerializer

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Your S3 bucket and folder
bucket_name = 'renisa-ml-demo-bucket'
s3_prefix = 'data/'

# List of local files you want to upload
files_to_upload = ['bank.csv', 'bank-full.csv', 'bank-additional-full.csv', 'bank-names.txt', 'bank-additional-names.txt']

# Initialize S3 client
s3 = boto3.client('s3')

# Upload each file
for file_name in files_to_upload:
    s3_key = s3_prefix + file_name
    s3.upload_file(file_name, bucket_name, s3_key)
    print(f"Uploaded {file_name} to s3://{bucket_name}/{s3_key}")

Uploaded bank.csv to s3://renisa-ml-demo-bucket/data/bank.csv
Uploaded bank-full.csv to s3://renisa-ml-demo-bucket/data/bank-full.csv
Uploaded bank-additional-full.csv to s3://renisa-ml-demo-bucket/data/bank-additional-full.csv
Uploaded bank-names.txt to s3://renisa-ml-demo-bucket/data/bank-names.txt
Uploaded bank-additional-names.txt to s3://renisa-ml-demo-bucket/data/bank-additional-names.txt


In [3]:
df = pd.read_csv('s3://renisa-ml-demo-bucket/data/bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Load the dataset from S3
df = pd.read_csv('s3://renisa-ml-demo-bucket/data/bank-full.csv', sep=';')

# ---- Step 1: Inspect the data ----
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nTarget distribution:\n", df['y'].value_counts())

# ---- Step 2: Convert target variable 'y' to binary ----
df['y'] = df['y'].map({'yes': 1, 'no': 0})

# ---- Step 3: Identify categorical and numerical columns ----
categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('y')

print("\nCategorical features:", categorical_cols)
print("Numerical features:", numerical_cols)

# ---- Step 4: One-hot encode categorical variables ----
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# ---- Step 5: Normalize numerical features ----
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

# ---- Step 6: Train-test split ----
X = df_encoded.drop('y', axis=1)
y = df_encoded['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)

Shape: (45211, 17)
Columns: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']

Target distribution:
 y
no     39922
yes     5289
Name: count, dtype: int64

Categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical features: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

Train shape: (36168, 42)
Test shape: (9043, 42)


In [5]:
# Create a folder to store CSVs
os.makedirs('bank_data', exist_ok=True)

# Combine X and y
train_df = pd.concat([y_train.reset_index(drop=True), X_train.reset_index(drop=True)], axis=1)
val_df   = pd.concat([y_test.reset_index(drop=True),  X_test.reset_index(drop=True)],  axis=1)

# Save as CSVs
train_df.to_csv('bank_data/train.csv', index=False, header=False)
val_df.to_csv('bank_data/validation.csv', index=False, header=False)

print(train_df.iloc[:, 0].unique())

[0 1]


In [6]:
prefix = 'xgboost-bank-demo'

train_key = f'{prefix}/train/train.csv'
val_key = f'{prefix}/validation/validation.csv'

s3 = boto3.client('s3')
s3.upload_file('bank_data/train.csv', bucket_name, train_key)
s3.upload_file('bank_data/validation.csv', bucket_name, val_key)

s3_train_path = f's3://{bucket_name}/{train_key}'
s3_val_path = f's3://{bucket_name}/{val_key}'

print(f"Uploaded training data to: {s3_train_path}")
print(f"Uploaded validation data to: {s3_val_path}")

Uploaded training data to: s3://renisa-ml-demo-bucket/xgboost-bank-demo/train/train.csv
Uploaded validation data to: s3://renisa-ml-demo-bucket/xgboost-bank-demo/validation/validation.csv


In [7]:
boto_session = boto3.Session(region_name='us-east-1')
session = sagemaker.Session(boto_session=boto_session)
role = sagemaker.get_execution_role()

region=session.boto_region_name
print(region)

us-east-1


In [8]:
# Get the built-in XGBoost image URI
# In SageMaker, an image is basically a Docker container that has pre-installed code for training or inference
# Here we're asking Sagemaker: “Give me the URI of the official XGBoost container 
# (pre-built Docker image) in us-east-1 for version 1.3-1.”
container = image_uris.retrieve(
    framework="xgboost",
    region="us-east-1",
    version="1.3-1"  
)

# Define the XGBoost Estimator
# 
xgb = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket_name}/{prefix}/output',
    sagemaker_session=session
)

xgb.set_hyperparameters(
    objective="binary:logistic",
    num_round=100
)

hyperparameter_ranges = {
    "max_depth": IntegerParameter(3, 10), 
    # Controls the maximum depth of each decision tree.
    # Larger values make the model more complex, but risk overfitting.
    
    "eta": ContinuousParameter(0.01, 0.3), 
    # Learning rate (shrinkage). Smaller values make learning slower but more precise.
    # Often paired with more boosting rounds.
    
    "min_child_weight": IntegerParameter(1, 10), 
    # Minimum sum of instance weight (hessian) needed in a child.
    # Larger values can make the model more conservative (less complex splits).
    
    "subsample": ContinuousParameter(0.5, 1.0), 
    # Fraction of the training data used to grow each tree.
    # Prevents overfitting; lower values add randomness.
    
    "colsample_bytree": ContinuousParameter(0.5, 1.0) 
    # Fraction of features (columns) to consider at each tree.
    # Helps prevent overfitting and speeds up training.
}

tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name="validation:auc",  # Or "validation:logloss"
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=[{
        "Name": "validation:auc",
        "Regex": ".*\\[.*\\].*validation-auc:([0-9\\.]+)"
    }],
    max_jobs=10,
    max_parallel_jobs=2,
    objective_type="Maximize"
)

# Train
tuner.fit({
    "train": TrainingInput(s3_train_path, content_type="text/csv"),
    "validation": TrainingInput(s3_val_path, content_type="text/csv")
})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...............................................................................................!


In [9]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import StringDeserializer

endpoint_name = "xgboost-bank-marketing-endpoint"

# Check if endpoint exists
sm_client = boto3.client('sagemaker', region_name='us-east-1')
existing_endpoints = sm_client.list_endpoints(NameContains=endpoint_name)['Endpoints']
endpoint_names = [ep['EndpointName'] for ep in existing_endpoints]

if endpoint_name in endpoint_names:
    print(f"Endpoint '{endpoint_name}' already exists. Reusing it.")
    predictor = Predictor(endpoint_name=endpoint_name, sagemaker_session=session)
else:
    print(f"Creating endpoint '{endpoint_name}' from best estimator...")
    # Fetch the best estimator from the tuner
    best_estimator = tuner.best_estimator()

    predictor = best_estimator.deploy(
        initial_instance_count=1,
        instance_type="ml.m5.large",
        endpoint_name=endpoint_name
    )

# Set up serialization/deserialization
predictor.serializer = CSVSerializer()
predictor.deserializer = StringDeserializer()

Endpoint 'xgboost-bank-marketing-endpoint' already exists. Reusing it.


In [10]:
sample = [[30, 1, 1, 1, 1, 1, 1, 1787, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0]]
# Call endpoint with SageMaker SDK
response = predictor.predict(sample)
print("Prediction:", response)

Prediction: {"predictions": [{"score": 0.8477307558059692}]}


In [11]:
import io
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Ensure all values are float, not bool or object
X_test_fixed = X_test.astype(float)

# Convert to CSV format for endpoint input
csv_buffer = io.StringIO()
X_test_fixed.to_csv(csv_buffer, header=False, index=False)
payload = csv_buffer.getvalue()

# Call endpoint with boto3
runtime = boto3.client('sagemaker-runtime', region_name='us-east-1')
response = runtime.invoke_endpoint(
    EndpointName='xgboost-bank-marketing-endpoint',  # Update if different
    ContentType='text/csv',
    Body=payload
)

# Parse predictions
result = response['Body'].read().decode('utf-8')
predictions = [float(x) for x in result.strip().split('\n') if x]
predicted_labels = [1 if p > 0.5 else 0 for p in predictions]

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, predicted_labels))
print("Precision:", precision_score(y_test, predicted_labels))
print("Recall:", recall_score(y_test, predicted_labels))
print("F1 Score:", f1_score(y_test, predicted_labels))
print("ROC AUC:", roc_auc_score(y_test, predictions))


Accuracy: 0.8822293486674776
Precision: 0.49519890260631
Recall: 0.34120982986767484
F1 Score: 0.40402909904868495
ROC AUC: 0.8478550874572244


In [None]:
# --- Step: Setup SageMaker Clarify for Bias & Explainability Analysis ---
# SHapley Additive exPlanations
import boto3
from sagemaker.clarify import SageMakerClarifyProcessor
from sagemaker import Session

boto_session = boto3.Session(region_name="us-east-1")
sagemaker_session = Session(boto_session=boto_session)

from sagemaker.clarify import (
    DataConfig,
    ModelConfig,
    BiasConfig,
    SHAPConfig
)

# Combine test features and labels into one DataFrame for Clarify input
clarify_df = df.loc[y_test.index].copy()
clarify_df["y"] = clarify_df["y"].map({"yes": 1, "no": 0})

# Save test set to CSV (required by Clarify)
clarify_test_path = "bank_data/clarify_test.csv"
clarify_df.to_csv(clarify_test_path, index=False)

# Upload test set to S3
clarify_s3_key = f"{prefix}clarify_test.csv"
clarify_s3_path = f"s3://{bucket_name}/{clarify_s3_key}"
s3.upload_file(clarify_test_path, bucket_name, clarify_s3_key)

# Initialize Clarify processor (use low-cost instance)
clarify_processor = SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.t3.medium",
    sagemaker_session=sagemaker_session
)

# Configure input/output data for Clarify
data_config = DataConfig(
    s3_data_input_path=clarify_s3_path,
    s3_output_path=f"s3://{bucket_name}/{prefix}clarify-output",
    label="y",
    headers=clarify_df.columns.tolist(),
    dataset_type="text/csv"
)

# Configure model endpoint details for post-training & explainability
model_config = ModelConfig(
    model_name="xgboost-bank-marketing-endpoint",
    instance_type="ml.t3.medium",
    instance_count=1,
    content_type="text/csv",
    accept_type="text/csv"
)

# Define bias config 
bias_config = BiasConfig(
    facet_name="marital",
    label_values_or_threshold=[1]
)

# Define SHAP config to explain model predictions
shap_config = SHAPConfig(
    baseline=X_test.sample(10, random_state=42).astype(float).values.tolist(),
    num_samples=100
)

# --- Run Pre-training Bias Analysis (on dataset only) ---
clarify_processor.run_pre_training_bias(
    data_config,
    bias_config
)

# --- Run Post-training Bias Analysis (on predictions from model) ---
clarify_processor.run_post_training_bias(
    data_config,
    model_config,
    bias_config
)

# --- Run Explainability (SHAP) ---
clarify_processor.run_explainability(
    data_config=data_config,
    model_config=model_config,
    shap_config=shap_config
)