# Assignment 4.1: Model Store

## Setup

In [1]:
import os
import json
from time import gmtime, strftime

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sklearn.metrics import accuracy_score, roc_auc_score

# --- SageMaker session and region --------------------------------------------
sess = sagemaker.Session()
region = sess.boto_region_name

# --- Execution role ----------------------------------------------------------
# Ask the SDK for the execution role first. If that lookup raises, fall back to
# the SAGEMAKER_ROLE_ARN environment variable and fail loudly when it is unset.
try:
    role = sagemaker.get_execution_role()
except Exception:
    role = os.environ.get("SAGEMAKER_ROLE_ARN")
    if not role:
        raise RuntimeError(
            "Set SAGEMAKER_ROLE_ARN environment variable or run in SageMaker Studio."
        )

# --- Storage locations -------------------------------------------------------
# Everything this notebook writes to S3 lives under `prefix` in the session's
# default bucket.
bucket = sess.default_bucket()
prefix = "assignment4-model-store"

# --- AWS service clients, all created for the session's region ---------------
s3, sagemaker_client, sts_client = (
    boto3.client(service_name, region_name=region)
    for service_name in ("s3", "sagemaker", "sts")
)

# ARN of the calling identity; recorded later as model creator/owner.
account_arn = sts_client.get_caller_identity()["Arn"]

# UTC timestamp (second resolution) used to give this run's resources unique names.
run_id = strftime("%Y%m%d-%H%M%S", gmtime())

## Data preparation

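Download the UCI Breast Cancer Wisconsin (Diagnostic) dataset from the AWS public sample-data S3 bucket, shuffle it, split it roughly 70/15/15 into train/validation/test sets, and upload the splits to the session's default S3 bucket for training and batch inference.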

In [2]:
# Fetch the raw WDBC file from the region-specific AWS sample-data bucket.
filename = "wdbc.csv"
s3.download_file(
    Bucket=f"sagemaker-example-files-prod-{region}",
    Key="datasets/tabular/breast_cancer/wdbc.csv",
    Filename=filename,
)

data = pd.read_csv(filename, header=None)

# The file is read without a header row, so names are assigned explicitly:
# id, diagnosis, then the same ten measurements repeated for the
# "mean", "se" and "worst" groups (layout per the UCI dataset documentation).
measurements = [
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave points", "symmetry", "fractal_dimension",
]
data.columns = ["id", "diagnosis"] + [
    f"{measurement}_{group}"
    for group in ("mean", "se", "worst")
    for measurement in measurements
]

# Binary target: M (malignant) -> 1, anything else (B, benign) -> 0.
data["diagnosis"] = data["diagnosis"].eq("M").astype(int)

# Deterministic shuffle, then a positional 70% / 15% / remainder split.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

n_rows = len(data)
train_size = int(0.7 * n_rows)
val_size = int(0.15 * n_rows)
val_end = train_size + val_size

train_data = data.iloc[:train_size]
val_data = data.iloc[train_size:val_end]
test_data = data.iloc[val_end:]

# Feature matrices and labels kept in memory for later evaluation.
non_feature_cols = ["id", "diagnosis"]
X_val, y_val = val_data.drop(columns=non_feature_cols), val_data["diagnosis"]
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data["diagnosis"]

# Local CSV files.
train_file = "train_data.csv"
val_file = "validation_data.csv"
batch_file = "batch_data.csv"

csv_options = {"index": False, "header": False}

# Training/validation: label first, no header, no ID (XGBoost format).
train_data.drop(columns=["id"]).to_csv(train_file, **csv_options)
val_data.drop(columns=["id"]).to_csv(val_file, **csv_options)

# Batch transform: features ONLY (no label, no ID) for prediction.
X_test.to_csv(batch_file, **csv_options)

# Upload each file under its own sub-prefix.
train_key = f"{prefix}/train/{train_file}"
val_key = f"{prefix}/validation/{val_file}"
batch_key = f"{prefix}/batch/{batch_file}"

for local_path, key in (
    (train_file, train_key),
    (val_file, val_key),
    (batch_file, batch_key),
):
    s3.upload_file(local_path, bucket, key)

train_s3_uri = f"s3://{bucket}/{train_key}"
val_s3_uri = f"s3://{bucket}/{val_key}"
test_s3_uri = f"s3://{bucket}/{batch_key}"

print(f"Training data: {train_s3_uri}")
print(f"Validation data: {val_s3_uri}")
print(f"Batch data: {test_s3_uri}")
print(f"\nTrain size: {len(train_data)}, Val size: {len(val_data)}, Test size: {len(test_data)}")

train_s3_uri, val_s3_uri, test_s3_uri

Training data: s3://sagemaker-us-east-1-586589491781/assignment4-model-store/train/train_data.csv
Validation data: s3://sagemaker-us-east-1-586589491781/assignment4-model-store/validation/validation_data.csv
Batch data: s3://sagemaker-us-east-1-586589491781/assignment4-model-store/batch/batch_data.csv

Train size: 398, Val size: 85, Test size: 86


('s3://sagemaker-us-east-1-586589491781/assignment4-model-store/train/train_data.csv',
 's3://sagemaker-us-east-1-586589491781/assignment4-model-store/validation/validation_data.csv',
 's3://sagemaker-us-east-1-586589491781/assignment4-model-store/batch/batch_data.csv')

## Train XGBoost model

In [3]:
from sagemaker.inputs import TrainingInput

# The job is named after this run, and its artifacts go under a matching S3 prefix.
job_name = f"xgb-{run_id}"
output_path = f"s3://{bucket}/{prefix}/output/{job_name}"

# Built-in XGBoost container image for the current region.
training_image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=region, version="1.7-1"
)

xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=training_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=30,
    input_mode="File",
    output_path=output_path,
    sagemaker_session=sess,
)

# Hyperparameters collected in one place so they are easy to read and tune.
xgb_hyperparameters = {
    "objective": "binary:logistic",
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "num_round": 100,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# Both channels point at the headerless, label-first CSV files uploaded earlier.
train_input = TrainingInput(train_s3_uri, content_type="text/csv")
val_input = TrainingInput(val_s3_uri, content_type="text/csv")
channels = {"train": train_input, "validation": val_input}

xgb_estimator.fit(channels, job_name=job_name, logs=True)

INFO:sagemaker:Creating training-job with name: xgb-20260129-152007


2026-01-29 15:20:08 Starting - Starting the training job......
2026-01-29 15:21:02 Downloading - Downloading input data...
2026-01-29 15:21:27 Downloading - Downloading the training image......
2026-01-29 15:22:34 Training - Training image download completed. Training in progress.
  import pkg_resources[0m
[34m[2026-01-29 15:22:26.157 ip-10-2-94-115.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2026-01-29 15:22:26.215 ip-10-2-94-115.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2026-01-29:15:22:26:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2026-01-29:15:22:26:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2026-01-29:15:22:26:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2026-01-29:15:22:26:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2026-01-29:15:22:26:INFO] Determined 0 G

## Deploy endpoint and evaluate

In [4]:
def _parse_probabilities(raw):
    """Flatten an endpoint response into a 1-D float array of probabilities.

    ``CSVDeserializer`` returns a list of rows (each row a list of string
    cells). The scores may arrive one per row or several on one comma-separated
    row, so every non-empty cell of every row is kept, in order. Taking only
    the first cell of each row would silently drop scores in the second layout,
    and would raise ``IndexError`` on a blank line.

    Args:
        raw: The deserialized response. A list (nested or flat), or a raw
            ``str``/``bytes`` CSV body if no deserializer was applied.

    Returns:
        ``np.ndarray`` of floats, one entry per score found in the response.
    """
    if isinstance(raw, (bytes, bytearray)):
        raw = raw.decode("utf-8")
    if isinstance(raw, str):
        raw = [line.split(",") for line in raw.splitlines()]

    values = []
    for row in raw:
        cells = row if isinstance(row, (list, tuple)) else [row]
        # Skip empty cells (e.g. from a trailing newline or trailing comma).
        values.extend(float(cell) for cell in cells if str(cell).strip())
    return np.array(values)


predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
)

# Send and receive CSV.
predictor.serializer = sagemaker.serializers.CSVSerializer()
predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

# Features only, no header: one validation row per CSV line.
val_payload = X_val.to_csv(index=False, header=False)
val_probs_raw = predictor.predict(val_payload)

val_probs = _parse_probabilities(val_probs_raw)

# Fail with a clear message instead of an opaque length error inside sklearn.
if len(val_probs) != len(y_val):
    raise ValueError(
        f"Endpoint returned {len(val_probs)} predictions for {len(y_val)} validation rows."
    )

# Threshold the probabilities at 0.5 for accuracy; AUC uses the raw scores.
val_preds = (val_probs >= 0.5).astype(int)
val_accuracy = accuracy_score(y_val, val_preds)
val_auc = roc_auc_score(y_val, val_probs)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation AUC: {val_auc:.4f}")

val_accuracy, val_auc

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2026-01-29-15-23-20-220
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2026-01-29-15-23-20-220
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2026-01-29-15-23-20-220


------!Validation Accuracy: 0.9529
Validation AUC: 0.9917


(0.9529411764705882, 0.9916851441241685)

## Batch transform

In [5]:
# Batch predictions for this run are written under a run-specific S3 prefix.
batch_output_path = f"s3://{bucket}/{prefix}/batch-output/{run_id}"

transformer = xgb_estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=batch_output_path,
)

# The batch file is headerless CSV with one record per line, so split by line.
transform_request = {
    "data": test_s3_uri,
    "content_type": "text/csv",
    "split_type": "Line",
}
transformer.transform(**transform_request)

# Wait on the transform job before moving on.
transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2026-01-29-15-26-52-913
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2026-01-29-15-26-53-683


  import pkg_resources[0m
[34m[2026-01-29:15:32:09:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2026-01-29:15:32:09:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2026-01-29:15:32:09:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
  import pkg_resources[0m
[35m[2026-01-29:15:32:09:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2026-01-29:15:32:09:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2026-01-29:15:32:09:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;[0m
[35mworker_rlimit_nofile 4096;[0m
[35mevents {
  worker_connections 2048;[0m
[35m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log

## Part 1: Set up Model Package Group

In [6]:
# The group name carries the run id, so each run creates its own group.
model_package_group_name = f"xgboost-breast-cancer-{run_id}"
model_package_group_desc = "XGBoost classifier for breast cancer malignancy (UCI dataset)."

group_request = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageGroupDescription": model_package_group_desc,
}
create_group_response = sagemaker_client.create_model_package_group(**group_request)
print(create_group_response["ModelPackageGroupArn"])

# Read the group back and display it as the cell output.
describe_group = sagemaker_client.describe_model_package_group(
    ModelPackageGroupName=model_package_group_name
)
describe_group

arn:aws:sagemaker:us-east-1:586589491781:model-package-group/xgboost-breast-cancer-20260129-152007


{'ModelPackageGroupName': 'xgboost-breast-cancer-20260129-152007',
 'ModelPackageGroupArn': 'arn:aws:sagemaker:us-east-1:586589491781:model-package-group/xgboost-breast-cancer-20260129-152007',
 'ModelPackageGroupDescription': 'XGBoost classifier for breast cancer malignancy (UCI dataset).',
 'CreationTime': datetime.datetime(2026, 1, 29, 15, 32, 57, 481000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:586589491781:user-profile/d-5mfotqubn7ry/default-1768808396524',
  'UserProfileName': 'default-1768808396524',
  'DomainId': 'd-5mfotqubn7ry',
  'IamIdentity': {'Arn': 'arn:aws:sts::586589491781:assumed-role/LabRole/SageMaker',
   'PrincipalId': 'AROAYRE3VKJCQ7WVYUF4W:SageMaker'}},
 'ModelPackageGroupStatus': 'Completed',
 'ResponseMetadata': {'RequestId': '1bb96a03-d283-4ba6-98e2-88edfd9dedff',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1bb96a03-d283-4ba6-98e2-88edfd9dedff',
   'strict-transport-security': 'max-age=47304000; inclu

## Part 2: Set up Model Package

In [7]:
# Look up the finished training job to get its ARN and the exact image it ran with.
training_job_name = xgb_estimator.latest_training_job.name
training_job_desc = sagemaker_client.describe_training_job(
    TrainingJobName=training_job_name
)
training_job_arn = training_job_desc["TrainingJobArn"]
training_image = training_job_desc["AlgorithmSpecification"]["TrainingImage"]

# S3 location of the trained model artifact.
model_artifact = xgb_estimator.model_data

model_package_description = (
    "XGBoost model package for breast cancer malignancy prediction. "
    "Trained on UCI Breast Cancer Wisconsin (Diagnostic) dataset."
)

# Inference uses the same container image as training, pointed at the artifact.
inference_container = {
    "Image": training_image,
    "ModelDataUrl": model_artifact,
}
inference_specification = {
    "Containers": [inference_container],
    "SupportedContentTypes": ["text/csv"],
    "SupportedResponseMIMETypes": ["text/csv"],
}

model_package_input = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageDescription": model_package_description,
    "ModelApprovalStatus": "Approved",
    "InferenceSpecification": inference_specification,
}

create_model_package_response = sagemaker_client.create_model_package(**model_package_input)
model_package_arn = create_model_package_response["ModelPackageArn"]
print(model_package_arn)

# Read the package back and display it as the cell output.
describe_package = sagemaker_client.describe_model_package(
    ModelPackageName=model_package_arn
)
describe_package

arn:aws:sagemaker:us-east-1:586589491781:model-package/xgboost-breast-cancer-20260129-152007/1


{'ModelPackageGroupName': 'xgboost-breast-cancer-20260129-152007',
 'ModelPackageVersion': 1,
 'ModelPackageRegistrationType': 'Registered',
 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:586589491781:model-package/xgboost-breast-cancer-20260129-152007/1',
 'ModelPackageDescription': 'XGBoost model package for breast cancer malignancy prediction. Trained on UCI Breast Cancer Wisconsin (Diagnostic) dataset.',
 'CreationTime': datetime.datetime(2026, 1, 29, 15, 32, 58, 70000, tzinfo=tzlocal()),
 'InferenceSpecification': {'Containers': [{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1',
    'ImageDigest': 'sha256:b4f13edb198529c460692015797fa1ca6a8ff1ed64a149297174d922121b8fc4',
    'ModelDataUrl': 's3://sagemaker-us-east-1-586589491781/assignment4-model-store/output/xgb-20260129-152007/xgb-20260129-152007/output/model.tar.gz',
    'ModelDataETag': '310de3dee69e82e4cd9865a1a8d04917',
    'IsCheckpoint': False}],
  'SupportedContentTypes': ['text/csv'],
  

## Part 3: Write the Model Card

In [9]:
hyperparams = xgb_estimator.hyperparameters()

# Plain Python floats so the metrics serialize cleanly with json.dumps.
accuracy_value = float(val_accuracy)
auc_value = float(val_auc)

# Each top-level section of the card is built separately, then assembled below
# in the same key order so the serialized JSON is unchanged.
model_overview = {
    "model_description": "XGBoost binary classifier for breast cancer malignancy prediction.",
    "model_creator": account_arn,
    "model_artifact": [model_artifact],
    "algorithm_type": "XGBoost",
    "problem_type": "Binary classification",
    "model_owner": account_arn,
}

intended_uses = {
    "purpose_of_model": "Predict likelihood of malignancy from tumor features for educational purposes.",
    "intended_uses": "Educational and research use for breast cancer prediction.",
    "factors_affecting_model_efficiency": "Feature scaling and data quality.",
    "risk_rating": "Low",
    "explanations_for_risk_rating": "Academic model, not for production use.",
}

business_details = {
    "business_problem": "Classification of breast tumor malignancy.",
    "business_stakeholders": "Educational institution.",
    "line_of_business": "Education",
}

training_job_details = {
    "training_arn": training_job_arn,
    "training_datasets": [train_s3_uri],
    "training_environment": {
        "container_image": [training_image],
    },
    "user_provided_training_metrics": [
        {"name": "validation:accuracy", "value": accuracy_value},
        {"name": "validation:auc", "value": auc_value},
    ],
    # Hyperparameter values are stringified before being stored on the card.
    "user_provided_hyper_parameters": [
        {"name": hp_name, "value": str(hp_value)}
        for hp_name, hp_value in hyperparams.items()
    ],
}

training_details = {
    "objective_function": {
        "function": "Minimize",
        "notes": "Binary logistic loss on validation set",
    },
    "training_observations": f"Trained on {len(train_data)} samples with validation accuracy {val_accuracy:.4f}.",
    "training_job_details": training_job_details,
}

validation_evaluation = {
    "name": "validation_metrics",
    "evaluation_observation": "Metrics computed from endpoint predictions.",
    "datasets": [val_s3_uri],
    "metric_groups": [
        {
            "name": "binary_classification_metrics",
            "metric_data": [
                {"name": "accuracy", "type": "number", "value": accuracy_value},
                {"name": "roc_auc", "type": "number", "value": auc_value},
            ],
        }
    ],
}

additional_information = {
    "ethical_considerations": "Educational model, not intended for clinical decision-making.",
    "caveats_and_recommendations": "Model trained on UCI public dataset; generalization to real clinical settings not validated.",
}

# Create model card content following SageMaker schema
model_card_content = {
    "model_overview": model_overview,
    "intended_uses": intended_uses,
    "business_details": business_details,
    "training_details": training_details,
    "evaluation_details": [validation_evaluation],
    "additional_information": additional_information,
}

model_card_name = f"breast-cancer-xgboost-card-{run_id}"

create_model_card_response = sagemaker_client.create_model_card(
    ModelCardName=model_card_name,
    Content=json.dumps(model_card_content),
    ModelCardStatus="Draft",
)
print(create_model_card_response["ModelCardArn"])

# Read the card back and display it as the cell output.
describe_card = sagemaker_client.describe_model_card(
    ModelCardName=model_card_name
)
describe_card

arn:aws:sagemaker:us-east-1:586589491781:model-card/breast-cancer-xgboost-card-20260129-152007


{'ModelCardArn': 'arn:aws:sagemaker:us-east-1:586589491781:model-card/breast-cancer-xgboost-card-20260129-152007',
 'ModelCardName': 'breast-cancer-xgboost-card-20260129-152007',
 'ModelCardVersion': 1,
 'Content': '{"model_overview": {"model_description": "XGBoost binary classifier for breast cancer malignancy prediction.", "model_creator": "arn:aws:sts::586589491781:assumed-role/LabRole/SageMaker", "model_artifact": ["s3://sagemaker-us-east-1-586589491781/assignment4-model-store/output/xgb-20260129-152007/xgb-20260129-152007/output/model.tar.gz"], "algorithm_type": "XGBoost", "problem_type": "Binary classification", "model_owner": "arn:aws:sts::586589491781:assumed-role/LabRole/SageMaker"}, "intended_uses": {"purpose_of_model": "Predict likelihood of malignancy from tumor features for educational purposes.", "intended_uses": "Educational and research use for breast cancer prediction.", "factors_affecting_model_efficiency": "Feature scaling and data quality.", "risk_rating": "Low", "e

## Cleanup

In [None]:
# Delete the SageMaker Model resources as well as the endpoint, so nothing
# created by deploy() or transformer() is left behind in the account.
# Order matters: per the SageMaker SDK docs the predictor resolves its model
# names through the endpoint configuration, which delete_endpoint() removes by
# default, so the models are deleted first.
predictor.delete_model()
transformer.delete_model()

# Tear down the real-time endpoint (the billable resource).
predictor.delete_endpoint()