In [20]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
from sagemaker.model import Model
from sagemaker.transformer import Transformer

# ── Session, execution role and default S3 location ──────────
session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = session.default_bucket()
region = session.boto_region_name

# Container image URI for XGBoost 1.7-1 in the session's region
xgb_image = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.7-1',
)

# Low-level AWS clients reused by the later cells
s3_client = boto3.client('s3')
sm_client = boto3.client('sagemaker')

# Ask SageMaker where the baseline training job stored its model artifact
baseline_job = 'insurance-lapse-baseline-2026-02-12-15-18-48-636'
job_details = sm_client.describe_training_job(TrainingJobName=baseline_job)
artifacts = job_details['ModelArtifacts']
model_artifact = artifacts['S3ModelArtifacts']

# Echo the resolved configuration so the run is easy to audit
print(f"Bucket: {bucket}")
print(f"Region: {region}")
print(f"Model artifact: {model_artifact}")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Bucket: sagemaker-us-east-1-360138725243
Region: us-east-1
Model artifact: s3://sagemaker-us-east-1-360138725243/models/insurance-lapse-baseline-2026-02-12-15-18-48-636/output/model.tar.gz


In [None]:
# ── Batch Transform – Score entire validation set ──────────────
import time

# The epoch-seconds suffix gives each run of this cell its own model name
model_name = f'insurance-lapse-baseline-{int(time.time())}'

# Wrap the trained artifact in a SageMaker Model and create it in the service
xgb_model = Model(
    name=model_name,
    image_uri=xgb_image,
    model_data=model_artifact,
    role=role,
    sagemaker_session=session,
)
xgb_model.create(instance_type='ml.m5.large')

# Transformer settings: one ml.m5.large instance, CSV responses assembled
# line by line under the batch-predictions/ prefix of the default bucket
transformer_settings = dict(
    model_name=model_name,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/batch-predictions/',
    sagemaker_session=session,
    assemble_with='Line',
    accept='text/csv',
)
transformer = Transformer(**transformer_settings)

# input_filter="$[1:]" → skip column 0 (Lapsed), send only 25 features
# join_source="Input"  → append original input row to predictions for comparison
transform_job_args = dict(
    data=f's3://{bucket}/data/processed/validation/',
    content_type='text/csv',
    split_type='Line',
    input_filter='$[1:]',
    join_source='Input',
)
transformer.transform(**transform_job_args)

print('Batch Transform job started...')
transformer.wait()
print('Batch Transform complete!')


INFO:sagemaker:Creating model with name: insurance-lapse-baseline-1771102784
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2026-02-14-20-59-45-696


....................

In [None]:
# ── Cell 3: Download & Evaluate Batch Transform Results ──────────────
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Object key of the transform output inside the default bucket
batch_output_key = 'batch-predictions/validation.csv.out'
batch_output_path = f'{bucket}/{batch_output_key}'

# Fetch the object and parse it as header-less CSV straight from the response body
s3_client = boto3.client('s3')
response = s3_client.get_object(Bucket=bucket, Key=batch_output_key)
results = pd.read_csv(response['Body'], header=None)

print(f'Results shape: {results.shape}')
print(f'First few rows:')
results.head()

In [None]:
# ── Cell 4: Evaluate Batch Predictions ───────────────────────────────
# Layout of the joined output: column 0 is the actual Lapsed label,
# column 26 is the predicted probability.
predictions_prob = results[26]
actuals = results[0].astype(int)

# Probabilities at or above the chosen 0.30 cut-off are labelled 1
threshold = 0.30
predictions = predictions_prob.ge(threshold).astype(int)

# Quick overview of the scored probabilities
print(f'Batch Transform scored {len(predictions):,} records')
print(f'Threshold: {threshold}')
print(f'Prediction range: {predictions_prob.min():.4f} to {predictions_prob.max():.4f}')
print(f'Mean probability: {predictions_prob.mean():.4f}')
print()

# Per-class precision/recall followed by the raw confusion matrix
class_names = ['Not Lapsed', 'Lapsed']
report = classification_report(actuals, predictions, target_names=class_names)
print(report)
print('Confusion Matrix:')
matrix = confusion_matrix(actuals, predictions)
print(matrix)

In [None]:
# ── Verify threshold 0.20 matches previous results ─────────────
# Same probabilities as before, re-labelled with the lower 0.20 cut-off
threshold_020 = 0.20
preds_020 = predictions_prob.ge(threshold_020).astype(int)

print(f'Threshold: {threshold_020}')
report_020 = classification_report(actuals, preds_020, target_names=['Not Lapsed', 'Lapsed'])
print(report_020)

In [11]:
# ── Register Model in SageMaker Model Registry ───────────────
from sagemaker.model_metrics import ModelMetrics, MetricsSource
import json

# Define the model package group (like a "repository" for model versions)
model_package_group = 'insurance-lapse-prediction'

# Create the group if it doesn't exist, so the cell can be re-run safely.
try:
    sm_client.create_model_package_group(
        ModelPackageGroupName=model_package_group,
        ModelPackageGroupDescription='Policyholder lapse prediction models for P&C insurance'
    )
    print(f'Created model package group: {model_package_group}')
except sm_client.exceptions.ClientError as err:
    # ConflictException is a ClientError subclass, so it is still handled here.
    # NOTE(review): the service may report an existing group under a different
    # error code (e.g. a validation error whose message says the group already
    # exists) — confirm against the API docs. Anything else is re-raised.
    error_info = err.response.get('Error', {})
    group_exists = (
        error_info.get('Code') == 'ConflictException'
        or 'already exists' in error_info.get('Message', '').lower()
    )
    if not group_exists:
        raise
    print(f'Model package group already exists: {model_package_group}')

# New versions start out unapproved; named once so that the request and the
# status printed below cannot disagree.
initial_approval_status = 'PendingManualApproval'

# Register the baseline model as a new version in the group.
# The recall figures in the description and metadata are hard-coded strings,
# not values computed in this cell.
create_model_response = sm_client.create_model_package(
    ModelPackageGroupName=model_package_group,
    ModelPackageDescription='Baseline XGBoost model - threshold 0.30 (79% recall) or 0.20 (97% recall)',
    InferenceSpecification={
        'Containers': [{
            'Image': xgb_image,
            'ModelDataUrl': model_artifact
        }],
        'SupportedContentTypes': ['text/csv'],
        'SupportedResponseMIMETypes': ['text/csv'],
        'SupportedTransformInstanceTypes': ['ml.m5.large'],
        'SupportedRealtimeInferenceInstanceTypes': ['ml.m5.large']
    },
    ModelApprovalStatus=initial_approval_status,
    CustomerMetadataProperties={
        'project': 'policyholder-lapse-prediction',
        'algorithm': 'xgboost',
        'threshold_conservative': '0.30',
        'threshold_aggressive': '0.20',
        'recall_at_030': '0.79',
        'recall_at_020': '0.97',
        # Reuse the job name that model_artifact was looked up from, so the
        # metadata always matches the registered artifact.
        'training_job': baseline_job
    }
)

model_package_arn = create_model_response['ModelPackageArn']
print(f'\nModel registered successfully!')
print(f'Model Package ARN: {model_package_arn}')
print(f'Status: {initial_approval_status}')

Created model package group: insurance-lapse-prediction

Model registered successfully!
Model Package ARN: arn:aws:sagemaker:us-east-1:360138725243:model-package/insurance-lapse-prediction/1
Status: PendingManualApproval


In [15]:
# ── Approve the Model ────────────────────────────────────────
# Set the registered package's approval status to 'Approved'
approval_request = {
    'ModelPackageArn': model_package_arn,
    'ModelApprovalStatus': 'Approved',
}
sm_client.update_model_package(**approval_request)

# Read the package back and show what the registry now reports
details = sm_client.describe_model_package(ModelPackageName=model_package_arn)
print(f'Model: {model_package_arn}')
print(f'Status: {details["ModelApprovalStatus"]}')
print(f'Version: {details["ModelPackageVersion"]}')

Model: arn:aws:sagemaker:us-east-1:360138725243:model-package/insurance-lapse-prediction/1
Status: Approved
Version: 1


In [18]:
# ── Risk Tiering – Turn Predictions into Business Actions ────
# Apply risk tiers to our batch predictions
def assign_risk_tier(probability, high_threshold=0.60, medium_threshold=0.30):
    """Map a lapse probability to a risk tier label.

    Args:
        probability: Predicted lapse probability for a single customer.
        high_threshold: Inclusive lower bound of the 'High' tier (default 0.60).
        medium_threshold: Inclusive lower bound of the 'Medium' tier (default 0.30).

    Returns:
        'High' if probability >= high_threshold, 'Medium' if it is at least
        medium_threshold, otherwise 'Low'. A value for which both comparisons
        are false (NaN, for example) therefore also lands in 'Low'.

    Raises:
        ValueError: If medium_threshold is greater than high_threshold, which
            would make the 'Medium' tier unreachable.
    """
    if medium_threshold > high_threshold:
        raise ValueError(
            f'medium_threshold ({medium_threshold}) must not exceed '
            f'high_threshold ({high_threshold})'
        )
    if probability >= high_threshold:
        return 'High'
    if probability >= medium_threshold:
        return 'Medium'
    return 'Low'

# Build a results dataframe: one row per scored record with its actual label,
# rounded probability and assigned tier
scored_customers = pd.DataFrame({
    'actual_lapsed': actuals,
    'lapse_probability': predictions_prob.round(4),
    'risk_tier': predictions_prob.apply(assign_risk_tier)
})

# Display order of the tiers and the message printed for each one
tier_order = ['High', 'Medium', 'Low']
tier_actions = {
    'High': '  → Action: Personal agent call + retention offer',
    'Medium': '  → Action: Targeted email campaign + early renewal discount',
    'Low': '  → Action: Standard renewal process',
}

# Summary by tier, with rows in display order
tier_summary = scored_customers.groupby('risk_tier').agg(
    count=('lapse_probability', 'size'),
    avg_probability=('lapse_probability', 'mean'),
    actual_lapse_rate=('actual_lapsed', 'mean')
).round(4).reindex(tier_order)

# A tier with no customers is absent from the groupby result; reindex then
# inserts it as an all-NaN row, which also turns `count` into a float column.
# Restore an integer count (0 for empty tiers) so it prints as a whole number.
tier_summary['count'] = tier_summary['count'].fillna(0).astype('int64')

total_scored = len(scored_customers)

print('═══ Risk Tier Summary ═══')
print(f'Total customers scored: {total_scored:,}\n')
print(tier_summary)
print()

# Show how tiers map to business actions
print('═══ Recommended Actions ═══')
for tier in tier_order:
    count = tier_summary.loc[tier, 'count']
    rate = tier_summary.loc[tier, 'actual_lapse_rate']
    # Guard against an empty frame so the share is 0 rather than NaN
    pct = count / total_scored * 100 if total_scored else 0.0
    print(f'\n{tier} Risk: {count:,} customers ({pct:.1f}%)')
    if pd.isna(rate):
        # No customers in this tier, so there is no observed lapse rate
        print('  Actual lapse rate: n/a')
    else:
        print(f'  Actual lapse rate: {rate:.1%}')
    print(tier_actions[tier])

═══ Risk Tier Summary ═══
Total customers scored: 21,111

           count  avg_probability  actual_lapse_rate
risk_tier                                           
High        4691           0.7818             0.6012
Medium      6532           0.4458             0.1891
Low         9888           0.1248             0.0316

═══ Recommended Actions ═══

High Risk: 4,691 customers (22.2%)
  Actual lapse rate: 60.1%
  → Action: Personal agent call + retention offer

Medium Risk: 6,532 customers (30.9%)
  Actual lapse rate: 18.9%
  → Action: Targeted email campaign + early renewal discount

Low Risk: 9,888 customers (46.8%)
  Actual lapse rate: 3.2%
  → Action: Standard renewal process


In [19]:
# ── Cell 9: Save Scored Results & Clean Up ───────────────────────────
# Save scored customers to S3 (written to a local CSV first, then uploaded)
scored_customers.to_csv('/tmp/scored_customers.csv', index=False)
s3_client.upload_file(
    '/tmp/scored_customers.csv',
    bucket,
    'batch-predictions/scored_customers.csv'
)
print(f'Scored results saved to: s3://{bucket}/batch-predictions/scored_customers.csv')

# Clean up the model we created for Batch Transform
sm_client.delete_model(ModelName=model_name)
print(f'Cleaned up model: {model_name}')

# Count customers per tier from the scored data instead of hard-coding the
# figures, so the summary stays correct when the data changes.
# A tier with no customers is reported as 0.
tier_counts = scored_customers['risk_tier'].value_counts()
high_count = int(tier_counts.get('High', 0))
medium_count = int(tier_counts.get('Medium', 0))
low_count = int(tier_counts.get('Low', 0))

print('\n═══ Day 7 Complete! ═══')
print(f'✓ Batch Transform: {len(scored_customers):,} customers scored')
print(f'✓ Model Registry: Version 1 — Approved')
print(f'✓ Risk Tiering: High ({high_count}), Medium ({medium_count}), Low ({low_count})')

Scored results saved to: s3://sagemaker-us-east-1-360138725243/batch-predictions/scored_customers.csv
Cleaned up model: insurance-lapse-baseline-1771017218

═══ Day 7 Complete! ═══
✓ Batch Transform: 21,111 customers scored
✓ Model Registry: Version 1 — Approved
✓ Risk Tiering: High (4691), Medium (6532), Low (9888)
