## Section 1: Environment Setup

We'll reuse the model from Lab 5 or create a new one for demonstration purposes.

In [None]:
# ============================================================
# Universal SageMaker Configuration
# Compatible with Studio, Notebook Instances, and Local
# ============================================================

import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Make the repository root importable so that `utils.*` resolves below
project_root = os.path.abspath('../..')
if project_root not in sys.path:
    sys.path.append(project_root)

# SageMaker imports
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker.serverless import ServerlessInferenceConfig
from sagemaker.async_inference import AsyncInferenceConfig
from sagemaker.multidatamodel import MultiDataModel
from sagemaker.transformer import Transformer

# Data science imports
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime

# Configuration: prefer the shared project helper; if it cannot be imported,
# fall back to the defaults exposed by the SageMaker SDK.
try:
    from utils.sagemaker_config import get_sagemaker_config
    config = get_sagemaker_config(s3_prefix='lab6-endpoints')
    role, session, bucket, region = (
        config[key] for key in ('role', 'session', 'bucket', 'region')
    )
except ImportError:
    print("Using fallback configuration method")
    role = get_execution_role()
    session = sagemaker.Session()
    bucket, region = session.default_bucket(), session.boto_region_name

# Summarize the resolved settings (the role ARN is truncated to 50 characters)
for summary_line in (
    "Configuration complete.",
    f"Region: {region}",
    f"S3 Bucket: s3://{bucket}",
    f"IAM Role: {role[:50]}...",
):
    print(summary_line)

# Low-level boto3 clients reused by later cells
sm_client, s3_client = (
    boto3.client(service, region_name=region) for service in ('sagemaker', 's3')
)

---

## Section 2: Prepare Model Artifacts

For this lab, we'll use a pre-trained scikit-learn model. In a production scenario, this would come from your Model Registry.

### Model Artifact Requirements

A valid SageMaker model artifact must include:
- Serialized model file (e.g., `model.pkl`)
- Inference code in `code/` directory
- Dependencies in `code/requirements.txt`
- Packaged as `model.tar.gz`


In [None]:
# ============================================================
# Create a Simple Model Artifact
# ============================================================

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import joblib
import tarfile
import shutil

# Train a small demonstration classifier on synthetic data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X, y)

# Source of the inference handlers that get packaged under code/
inference_code = '''
import os
import json
import joblib
import numpy as np

def model_fn(model_dir):
    """Load the model"""
    return joblib.load(os.path.join(model_dir, "model.pkl"))

def input_fn(request_body, content_type):
    """Parse input data"""
    if content_type == "application/json":
        data = json.loads(request_body)
        return np.array(data["features"]).reshape(1, -1)
    raise ValueError(f"Unsupported content type: {content_type}")

def predict_fn(input_data, model):
    """Run prediction"""
    prediction = model.predict(input_data)
    probability = model.predict_proba(input_data)
    return {
        "prediction": int(prediction[0]),
        "probability": float(probability[0][1]),
        "confidence": float(max(probability[0]))
    }

def output_fn(prediction, accept_type):
    """Format output"""
    if accept_type == "application/json":
        return json.dumps(prediction), accept_type
    raise ValueError(f"Unsupported accept type: {accept_type}")
'''

# Lay out the artifact: model.pkl at the root, handlers + requirements in code/
model_dir = "model_artifact"
code_dir = os.path.join(model_dir, "code")
os.makedirs(code_dir, exist_ok=True)

joblib.dump(model, os.path.join(model_dir, "model.pkl"))

code_files = {
    "inference.py": inference_code,
    "requirements.txt": "scikit-learn==1.3.0\nnumpy==1.24.3\njoblib==1.3.1\n",
}
for file_name, file_contents in code_files.items():
    with open(os.path.join(code_dir, file_name), "w") as f:
        f.write(file_contents)

# Package the directory contents at the archive root
tar_path = "model.tar.gz"
with tarfile.open(tar_path, "w:gz") as tar:
    tar.add(model_dir, arcname=".")

# Upload the archive to S3
model_s3_key = "lab6-models/model.tar.gz"
model_s3_uri = f"s3://{bucket}/{model_s3_key}"
s3_client.upload_file(tar_path, bucket, model_s3_key)

print(f"Model artifact created and uploaded:")
print(f"   S3 URI: {model_s3_uri}")

# Remove the local working files now that the artifact lives in S3
shutil.rmtree(model_dir)
os.remove(tar_path)

---

## Section 3: Real-Time Endpoints with Auto-Scaling

Real-time endpoints provide synchronous predictions with low latency. They are ideal for user-facing applications requiring immediate responses.

### Key Characteristics

- **Latency**: Sub-100ms typical
- **Throughput**: Configurable via instance type and count
- **Availability**: Always-on (instances running continuously)
- **Cost**: Pay per instance hour, regardless of utilization

### Auto-Scaling Configuration

Auto-scaling adjusts instance count based on load:
- **Target Metric**: InvocationsPerInstance (default)
- **Scale-Out**: Add instances when metric exceeds threshold
- **Scale-In**: Remove instances when metric drops
- **Cool-Down**: Delay before scaling action


In [None]:
# ============================================================
# Deploy Real-Time Endpoint with Auto-Scaling
# ============================================================

from sagemaker.sklearn import SKLearnModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Create model object
# Note: inference.py is already in code/ directory inside model.tar.gz
sklearn_model = SKLearnModel(
    model_data=model_s3_uri,
    role=role,
    framework_version="1.2-1",
    py_version="py3",
    sagemaker_session=session
)

# Deploy endpoint
realtime_endpoint_name = f"realtime-endpoint-{int(time.time())}"

print(f"Deploying real-time endpoint: {realtime_endpoint_name}")
print("   Instance type: ml.t2.medium")
print("   Initial instance count: 1")
print("   Deployment in progress (4-6 minutes)...")

# The packaged input_fn/output_fn only accept "application/json", while the
# scikit-learn predictor defaults to NumPy (application/x-npy) serialization.
# Attach JSON (de)serializers so predict() can send the {"features": [...]}
# dict used by the test cells and return a parsed dict.
realtime_predictor = sklearn_model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
    endpoint_name=realtime_endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

print(f"\nEndpoint deployed successfully:")
print(f"   Name: {realtime_endpoint_name}")
print(f"   Status: InService")

In [None]:
# ============================================================
# Configure Auto-Scaling
# ============================================================

autoscaling_client = boto3.client('application-autoscaling', region_name=region)

# Resource ID for the endpoint
resource_id = f"endpoint/{realtime_endpoint_name}/variant/AllTraffic"

# Identifying arguments shared by both Application Auto Scaling calls
scalable_target = {
    'ServiceNamespace': 'sagemaker',
    'ResourceId': resource_id,
    'ScalableDimension': 'sagemaker:variant:DesiredInstanceCount',
}

# Register the variant's instance count as a scalable target (1 to 3 instances)
response = autoscaling_client.register_scalable_target(
    MinCapacity=1,
    MaxCapacity=3,
    **scalable_target
)

print("Scalable target registered:")
print(f"   Min instances: 1")
print(f"   Max instances: 3")

# Target-tracking settings for the scaling policy
target_tracking_config = {
    'TargetValue': 5.0,  # Target 5 invocations per instance
    'PredefinedMetricSpecification': {
        'PredefinedMetricType': 'SageMakerVariantInvocationsPerInstance'
    },
    'ScaleInCooldown': 300,  # 5 minutes
    'ScaleOutCooldown': 60   # 1 minute
}

# Attach the scaling policy to the registered target
response = autoscaling_client.put_scaling_policy(
    PolicyName=f"{realtime_endpoint_name}-scaling-policy",
    PolicyType='TargetTrackingScaling',
    TargetTrackingScalingPolicyConfiguration=target_tracking_config,
    **scalable_target
)

print("\nAuto-scaling policy configured:")
print(f"   Target: 5 invocations per instance")
print(f"   Scale-out cooldown: 60 seconds")
print(f"   Scale-in cooldown: 300 seconds")

In [None]:
# ============================================================
# Test Real-Time Endpoint
# ============================================================

# One random 20-feature record, wrapped in the payload shape input_fn expects
test_features = np.random.randn(20).tolist()
test_data = {"features": test_features}

print("Testing real-time endpoint...")
print(f"\nInput: {len(test_features)} features")

# Time a single round trip (client-side wall clock, in milliseconds)
start_time = time.time()
result = realtime_predictor.predict(test_data)
latency_ms = (time.time() - start_time) * 1000

print(f"\nPrediction result:")
print(json.dumps(result, indent=2))
print(f"\nLatency: {latency_ms:.2f} ms")

# Repeat the call to collect a small latency sample
print("\nRunning 10 consecutive invocations...")
latencies = []
for _ in range(10):
    call_started = time.time()
    realtime_predictor.predict(test_data)
    call_ms = (time.time() - call_started) * 1000
    latencies.append(call_ms)

p50_ms, p95_ms = np.percentile(latencies, [50, 95])
print(f"Average latency: {np.mean(latencies):.2f} ms")
print(f"P50 latency: {p50_ms:.2f} ms")
print(f"P95 latency: {p95_ms:.2f} ms")

---

## Section 4: Serverless Endpoints

Serverless endpoints automatically scale from zero to handle traffic spikes without managing instances. They are ideal for development, intermittent workloads, or cost optimization.

### Key Characteristics

- **No Infrastructure Management**: AWS handles all scaling
- **Cost Model**: Pay only for inference time (per request)
- **Cold Start**: First request after idle period takes ~1-2 seconds
- **Concurrency**: Auto-scales to handle concurrent requests
- **Memory Configuration**: 1 GB to 6 GB per request

### When to Use Serverless

- Development and testing environments
- Unpredictable or bursty traffic patterns
- Cost-sensitive applications with low QPS
- Prototypes and MVPs

### When NOT to Use Serverless

- Consistent high traffic (real-time cheaper)
- Ultra-low latency requirements (<500ms)
- Very large models (>6 GB memory)


In [None]:
# ============================================================
# Deploy Serverless Endpoint
# ============================================================

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Configure serverless inference
serverless_config = ServerlessInferenceConfig(
    memory_size_in_mb=2048,  # 2 GB memory
    max_concurrency=10        # Handle up to 10 concurrent requests
)

# Create new model object for serverless
serverless_model = SKLearnModel(
    model_data=model_s3_uri,
    role=role,
    framework_version="1.2-1",
    py_version="py3",
    sagemaker_session=session
)

# Deploy serverless endpoint
serverless_endpoint_name = f"serverless-endpoint-{int(time.time())}"

print(f"Deploying serverless endpoint: {serverless_endpoint_name}")
print("   Memory: 2048 MB")
print("   Max concurrency: 10")
print("   Deployment in progress (4-6 minutes)...")

# The packaged handlers only speak "application/json"; the scikit-learn
# predictor would otherwise default to NumPy serialization and the
# dict payload sent by the test cell would be rejected by input_fn.
serverless_predictor = serverless_model.deploy(
    serverless_inference_config=serverless_config,
    endpoint_name=serverless_endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

print(f"\nServerless endpoint deployed successfully:")
print(f"   Name: {serverless_endpoint_name}")
print(f"   Status: InService")

In [None]:
# ============================================================
# Test Serverless Endpoint (Cold Start vs Warm)
# ============================================================

# The first request after deployment is timed separately as the cold start
print("Testing serverless endpoint...")
print("\n1. Cold start (first request):")
start_time = time.time()
result = serverless_predictor.predict(test_data)
cold_start_latency = (time.time() - start_time) * 1000

print(f"   Latency: {cold_start_latency:.2f} ms")
print(f"   Result: {result}")

# Follow-up requests, spaced 100 ms apart, are treated as warm invocations
print("\n2. Warm invocations (subsequent requests):")
warm_latencies = []
for _ in range(5):
    call_started = time.time()
    serverless_predictor.predict(test_data)
    call_ms = (time.time() - call_started) * 1000
    warm_latencies.append(call_ms)
    time.sleep(0.1)  # Small delay between requests

warm_average = np.mean(warm_latencies)
print(f"   Average latency: {warm_average:.2f} ms")
print(f"   Min latency: {np.min(warm_latencies):.2f} ms")
print(f"   Max latency: {np.max(warm_latencies):.2f} ms")

print(f"\nCold start overhead: {cold_start_latency - warm_average:.2f} ms")

### Cost Comparison: Real-Time vs Serverless

Let's estimate costs for different usage patterns:

**Real-Time Endpoint (ml.t2.medium: $0.065/hour)**
- Always-on cost: $0.065/hr × 730 hrs/month ≈ **$47.45/month**
- Best for: >1000 requests/hour

**Serverless Endpoint ($0.00002 per inference second + $0.60 per GB-hour)**
- 100 requests/day, 50ms each: ~$0.10/month
- 1000 requests/day, 50ms each: ~$1.00/month
- 10000 requests/day, 50ms each: ~$10/month
- Best for: <1000 requests/hour

**Rule of Thumb**: Serverless is cheaper for < 1000 requests/hour. Above that, real-time becomes more cost-effective.


---

## Section 5: Asynchronous Endpoints

Asynchronous endpoints handle long-running inference requests (>60 seconds) by queueing requests and returning results via S3.

### Key Characteristics

- **Request Model**: Queue-based (SQS)
- **Response Model**: S3 notification (SNS optional)
- **Timeout**: Up to 1 hour per request
- **Throughput**: High (queue buffers requests)
- **Cost**: Pay per instance hour + S3 storage

### Use Cases

- Large document processing (OCR, PDF parsing)
- Video/audio analysis
- Batch-like workloads with SLA flexibility
- Expensive model inference (LLMs, computer vision)

### Architecture

```
Client Request → API Gateway → SageMaker Async Endpoint
                                    ↓
                               SQS Queue
                                    ↓
                            ML Model Inference
                                    ↓
                          S3 (Result Output)
                                    ↓
                          SNS Notification (optional)
```


In [None]:
# ============================================================
# Deploy Asynchronous Endpoint
# ============================================================

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Configure async inference
async_config = AsyncInferenceConfig(
    output_path=f"s3://{bucket}/lab6-async-results/",
    max_concurrent_invocations_per_instance=4,
    failure_path=f"s3://{bucket}/lab6-async-failures/"
)

# Create model for async endpoint
async_model = SKLearnModel(
    model_data=model_s3_uri,
    role=role,
    framework_version="1.2-1",
    py_version="py3",
    sagemaker_session=session
)

# Deploy async endpoint
async_endpoint_name = f"async-endpoint-{int(time.time())}"

print(f"Deploying async endpoint: {async_endpoint_name}")
print("   Instance type: ml.t2.medium")
print("   Max concurrent invocations: 4")
print(f"   Output path: s3://{bucket}/lab6-async-results/")
print("   Deployment in progress (4-6 minutes)...")

# The serializer determines the content type sent with each async request.
# The packaged input_fn only accepts "application/json", so the predictor
# must use JSON rather than the scikit-learn predictor's NumPy default.
async_predictor = async_model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
    async_inference_config=async_config,
    endpoint_name=async_endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

print(f"\nAsync endpoint deployed successfully:")
print(f"   Name: {async_endpoint_name}")
print(f"   Status: InService")

In [None]:
# ============================================================
# Test Asynchronous Endpoint
# ============================================================

from urllib.parse import urlparse

# Upload input data to S3
input_data_key = "lab6-async-input/test_input.json"
s3_client.put_object(
    Bucket=bucket,
    Key=input_data_key,
    Body=json.dumps(test_data)
)
input_s3_uri = f"s3://{bucket}/{input_data_key}"

print(f"Test input uploaded: {input_s3_uri}")

# Invoke async endpoint
print("\nInvoking async endpoint...")
response = async_predictor.predict_async(input_path=input_s3_uri)

# Get output location
output_location = response.output_path
print(f"Request accepted. Output will be at:")
print(f"   {output_location}")

# Split the s3://bucket/key URI once, up front, so the key is always defined
# and the bucket is taken from the URI itself rather than assumed.
parsed_output = urlparse(output_location)
output_bucket = parsed_output.netloc
output_key = parsed_output.path.lstrip("/")

# Wait for result by polling S3 until the output object appears
print("\nWaiting for inference to complete...")
max_wait = 60       # seconds
poll_interval = 5   # seconds between existence checks
elapsed = 0
result_found = False
last_error = None   # most recent polling error, reported on timeout

while elapsed < max_wait:
    try:
        # head_object raises ClientError while the object does not exist yet
        s3_client.head_object(Bucket=output_bucket, Key=output_key)
        result_found = True
        break
    except s3_client.exceptions.ClientError as err:
        last_error = err
        time.sleep(poll_interval)
        elapsed += poll_interval
        print(f"   Waiting... ({elapsed}s)")

if result_found:
    # Retrieve result
    result_obj = s3_client.get_object(Bucket=output_bucket, Key=output_key)
    result = json.loads(result_obj['Body'].read().decode('utf-8'))

    print(f"\nInference complete!")
    print(f"Prediction result:")
    print(json.dumps(result, indent=2))
else:
    print(f"\nTimeout: Result not available after {max_wait} seconds")
    print("Check S3 output location later for result")
    if last_error is not None:
        # Surface the reason so permission problems are not mistaken for slowness
        print(f"Last S3 error while polling: {last_error}")

---

## Section 6: Multi-Model Endpoints

Multi-Model Endpoints (MME) allow hosting multiple models on a single endpoint, sharing compute resources efficiently.

### Key Characteristics

- **Model Loading**: Dynamic (on-demand)
- **Model Caching**: LRU cache in instance memory
- **Cost Efficiency**: Share instances across models
- **Scalability**: 1000s of models on single endpoint
- **Invocation**: Specify target model in request

### Use Cases

- Personalized models per customer/region
- A/B testing multiple model versions
- Multi-tenant ML applications
- Cost optimization for many small models

### Architecture

```
Client Request (specifies TargetModel)
         ↓
  Endpoint Instance
         ↓
    Model Cache (LRU)
  ┌─────────────────┐
  │ Model A (hot)   │  ← Most recently used
  │ Model B (hot)   │
  │ Model C (warm)  │
  └─────────────────┘
         ↓
   S3 Model Store
  (100s-1000s models)
```


In [None]:
# ============================================================
# Create Multiple Model Variants
# ============================================================

# For demonstration, create 3 slightly different models
print("Creating multiple model variants...")

model_artifacts = []

# Same dependency pins as the single-model artifact
variant_requirements = "scikit-learn==1.3.0\nnumpy==1.24.3\njoblib==1.3.1\n"

for variant_index, model_id in enumerate(["model_A", "model_B", "model_C"]):
    # Each variant is trained on differently-seeded synthetic data
    X_temp, y_temp = make_classification(
        n_samples=1000,
        n_features=20,
        n_classes=2,
        random_state=42 + ord(model_id[-1])
    )
    # Vary the forest size slightly per variant: 50, 60, 70 trees.
    # (Scaling by ord('A') would have produced 700+ trees per model.)
    temp_model = RandomForestClassifier(
        n_estimators=50 + 10 * variant_index,
        random_state=42
    )
    temp_model.fit(X_temp, y_temp)

    temp_dir = f"model_{model_id}"
    tar_name = f"{model_id}.tar.gz"
    try:
        # Create model directory
        os.makedirs(temp_dir, exist_ok=True)
        joblib.dump(temp_model, os.path.join(temp_dir, "model.pkl"))

        # Copy inference code
        os.makedirs(os.path.join(temp_dir, "code"), exist_ok=True)
        with open(os.path.join(temp_dir, "code", "inference.py"), "w") as f:
            f.write(inference_code)
        with open(os.path.join(temp_dir, "code", "requirements.txt"), "w") as f:
            f.write(variant_requirements)

        # Create tar.gz with the directory contents at the archive root
        with tarfile.open(tar_name, "w:gz") as tar:
            tar.add(temp_dir, arcname=".")

        # Upload to S3
        s3_key = f"lab6-mme-models/{tar_name}"
        s3_client.upload_file(tar_name, bucket, s3_key)
        model_uri = f"s3://{bucket}/{s3_key}"
        model_artifacts.append((model_id, model_uri))

        print(f"   {model_id}: {model_uri}")
    finally:
        # Cleanup local files even if packaging or upload failed
        shutil.rmtree(temp_dir, ignore_errors=True)
        if os.path.exists(tar_name):
            os.remove(tar_name)

print(f"\n{len(model_artifacts)} model variants created and uploaded")

In [None]:
# ============================================================
# Deploy Multi-Model Endpoint
# ============================================================

from sagemaker.sklearn import SKLearnModel
from sagemaker.multidatamodel import MultiDataModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Create a base model
base_model = SKLearnModel(
    model_data=model_artifacts[0][1],  # Use first model as base
    role=role,
    framework_version="1.2-1",
    py_version="py3",
    sagemaker_session=session
)

# Create MultiDataModel that serves every archive under the shared S3 prefix
mme_model = MultiDataModel(
    name=f"mme-model-{int(time.time())}",
    model_data_prefix=f"s3://{bucket}/lab6-mme-models/",
    model=base_model,
    sagemaker_session=session
)

# Deploy MME endpoint
mme_endpoint_name = f"mme-endpoint-{int(time.time())}"

print(f"Deploying multi-model endpoint: {mme_endpoint_name}")
print(f"   Model prefix: s3://{bucket}/lab6-mme-models/")
print(f"   Number of models: {len(model_artifacts)}")
print("   Deployment in progress (4-6 minutes)...")

# JSON (de)serializers are required: the packaged handlers only accept
# "application/json", and the test cell indexes the response as a dict.
mme_predictor = mme_model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
    endpoint_name=mme_endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

print(f"\nMulti-model endpoint deployed successfully:")
print(f"   Name: {mme_endpoint_name}")
print(f"   Models available: {len(model_artifacts)}")

In [None]:
# ============================================================
# Test Multi-Model Endpoint
# ============================================================

print("Testing multi-model endpoint with different models...\n")

for model_id, model_uri in model_artifacts:
    print(f"Testing {model_id}:")

    # Route the request to one archive under the endpoint's model prefix
    target_archive = f"{model_id}.tar.gz"

    start_time = time.time()
    result = mme_predictor.predict(test_data, target_model=target_archive)
    latency = (time.time() - start_time) * 1000

    for report_line in (
        f"   Prediction: {result['prediction']}",
        f"   Confidence: {result['confidence']:.4f}",
        f"   Latency: {latency:.2f} ms",
    ):
        print(report_line)
    print()

print("All models invoked successfully!")
print("\nNote: First invocation per model may be slower (model loading)")
print("Subsequent invocations use cached model in memory")

---

## Section 7: Batch Transform

Batch Transform runs inference on large datasets stored in S3 without deploying a persistent endpoint. It's ideal for periodic bulk predictions.

### Key Characteristics

- **Execution Model**: Job-based (not persistent)
- **Input**: S3 dataset (CSV, JSON, etc.)
- **Output**: S3 results directory
- **Parallelism**: Configurable instance count
- **Cost**: Pay only during job execution

### Use Cases

- Periodic batch scoring (daily/weekly)
- One-time predictions on large datasets
- Offline model evaluation
- Data preprocessing pipelines

### Advantages over Endpoints

- No idle instance costs
- Automatic data parallelization
- No endpoint management
- Built-in data filtering and joining


In [None]:
# ============================================================
# Prepare Batch Input Data
# ============================================================

# Build the sample records: one random 20-feature payload per record
batch_size = 100
batch_data = [
    {"features": np.random.randn(20).tolist()}
    for _ in range(batch_size)
]

# Serialize as JSONL (one JSON object per line)
batch_input_file = "batch_input.jsonl"
with open(batch_input_file, "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in batch_data)

# Upload to S3
batch_input_key = "lab6-batch-input/batch_input.jsonl"
batch_input_s3 = f"s3://{bucket}/{batch_input_key}"
s3_client.upload_file(batch_input_file, bucket, batch_input_key)

print(f"Batch input prepared:")
print(f"   Records: {batch_size}")
print(f"   S3 Location: {batch_input_s3}")

# The local copy is no longer needed once the upload has finished
os.remove(batch_input_file)

In [None]:
# ============================================================
# Create and Run Batch Transform Job
# ============================================================

# Create model for batch transform
batch_model = SKLearnModel(
    model_data=model_s3_uri,
    role=role,
    framework_version="1.2-1",
    py_version="py3",
    sagemaker_session=session
)

# Batch Transform does not offer the ml.t2 family used for the hosted
# endpoints in this lab, so the job runs on a small general-purpose instance.
batch_instance_type = "ml.m5.large"

# Create transformer
transformer = batch_model.transformer(
    instance_count=1,
    instance_type=batch_instance_type,
    output_path=f"s3://{bucket}/lab6-batch-output/",
    assemble_with="Line",  # Combine predictions line by line
    accept="application/json"
)

# Start transform job
job_name = f"batch-transform-{int(time.time())}"

print(f"Starting batch transform job: {job_name}")
print(f"   Input: {batch_input_s3}")
print(f"   Output: s3://{bucket}/lab6-batch-output/")
print(f"   Instance: {batch_instance_type}")
print("   Job in progress (3-5 minutes)...")

transformer.transform(
    data=batch_input_s3,
    content_type="application/json",
    split_type="Line",  # Each line is a separate request
    job_name=job_name,
    wait=True  # Wait for job to complete
)

print(f"\nBatch transform job completed:")
print(f"   Job name: {job_name}")
print(f"   Status: Completed")

In [None]:
# ============================================================
# Retrieve and Analyze Batch Results
# ============================================================

# Collect the keys of the ".out" objects under the output prefix
output_prefix = "lab6-batch-output/"
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=output_prefix)

output_files = []
for obj in response.get('Contents', []):
    if obj['Key'].endswith('.out'):
        output_files.append(obj['Key'])

print(f"Batch transform output files: {len(output_files)}")

if not output_files:
    print("No output files found")
else:
    # Only the first output object is downloaded and inspected
    output_key = output_files[0]
    output_file = "batch_output.jsonl"
    s3_client.download_file(bucket, output_key, output_file)

    print(f"\nDownloaded results: {output_key}")

    # Each line of the output is one JSON-encoded prediction
    with open(output_file, "r") as f:
        results = [json.loads(line) for line in f]

    print(f"\nProcessed {len(results)} predictions")
    print("\nFirst 5 predictions:")
    for i, result in enumerate(results[:5], 1):
        print(f"   {i}. Prediction: {result['prediction']}, "
              f"Confidence: {result['confidence']:.4f}")

    # Summary statistics
    predictions = []
    confidences = []
    for r in results:
        predictions.append(r['prediction'])
        confidences.append(r['confidence'])

    print(f"\nSummary Statistics:")
    print(f"   Class 0 predictions: {predictions.count(0)}")
    print(f"   Class 1 predictions: {predictions.count(1)}")
    print(f"   Average confidence: {np.mean(confidences):.4f}")
    print(f"   Min confidence: {np.min(confidences):.4f}")
    print(f"   Max confidence: {np.max(confidences):.4f}")

    # Remove the downloaded copy
    os.remove(output_file)

---

## Section 8: Resource Cleanup

Clean up all deployed resources to avoid unnecessary costs.

**Important:** Endpoints continue to incur charges while running. Always delete endpoints when not in use.


In [None]:
# ============================================================
# Delete All Endpoints
# ============================================================

endpoints_to_delete = [
    realtime_endpoint_name,
    serverless_endpoint_name,
    async_endpoint_name,
    mme_endpoint_name
]

# Names of resources whose cleanup raised, so the final summary is accurate
cleanup_failures = []

# Remove auto-scaling first, while the real-time endpoint it targets still exists
print("Deleting auto-scaling configuration...")
try:
    resource_id = f"endpoint/{realtime_endpoint_name}/variant/AllTraffic"
    autoscaling_client.deregister_scalable_target(
        ServiceNamespace='sagemaker',
        ResourceId=resource_id,
        ScalableDimension='sagemaker:variant:DesiredInstanceCount'
    )
    print("   Auto-scaling policy deleted")
except Exception as e:
    cleanup_failures.append(f"auto-scaling for {realtime_endpoint_name}")
    print(f"   Warning: {str(e)}")

print("\nDeleting endpoints...\n")

for endpoint_name in endpoints_to_delete:
    try:
        sm_client.delete_endpoint(EndpointName=endpoint_name)
        print(f"   Deleted: {endpoint_name}")
    except Exception as e:
        cleanup_failures.append(f"endpoint {endpoint_name}")
        print(f"   Warning: Could not delete {endpoint_name}: {str(e)}")

# Also delete endpoint configurations (looked up under the endpoint's own name)
print("\nDeleting endpoint configurations...")
for endpoint_name in endpoints_to_delete:
    try:
        sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
        print(f"   Deleted config: {endpoint_name}")
    except Exception as e:
        cleanup_failures.append(f"endpoint config {endpoint_name}")
        print(f"   Warning: Could not delete config {endpoint_name}: {str(e)}")

# Only claim success when every step actually succeeded
if cleanup_failures:
    print(f"\nCleanup finished with {len(cleanup_failures)} warning(s):")
    for failed_item in cleanup_failures:
        print(f"   - {failed_item}")
    print("Check the SageMaker console to confirm nothing is still running.")
else:
    print("\nAll resources cleaned up successfully!")

print("\nNote: S3 objects are retained for analysis.")
# f-string so the actual bucket name is shown instead of a literal "{bucket}"
print(f"Delete manually if needed: Console > S3 > {bucket} > lab6-*")

---

## Summary and Key Learnings

### What You Accomplished

1. **Real-Time Endpoints**:
   - Deployed synchronous inference endpoint
   - Configured auto-scaling based on invocation metrics
   - Measured sub-100ms latency performance

2. **Serverless Endpoints**:
   - Deployed pay-per-request serverless endpoint
   - Compared cold start vs warm invocation latency
   - Understood cost trade-offs vs real-time endpoints

3. **Asynchronous Endpoints**:
   - Deployed queue-based async endpoint
   - Handled long-running inference requests
   - Retrieved results from S3

4. **Multi-Model Endpoints**:
   - Hosted multiple models on single endpoint
   - Invoked specific models by name
   - Understood cost efficiency for many models

5. **Batch Transform**:
   - Executed bulk predictions on S3 data
   - Processed 100 predictions without persistent endpoint
   - Analyzed batch results

### Deployment Pattern Decision Tree

```
How often is inference needed?
├─ Continuously → Real-Time Endpoint
│   ├─ High QPS → Real-Time with Auto-Scaling
│   └─ Low/Variable QPS → Serverless Endpoint
│
├─ Occasionally → Batch Transform
│   └─ (For periodic bulk scoring)
│
└─ Request takes >60s → Async Endpoint
    └─ (For long-running jobs)

How many models?
├─ 1 model → Standard Endpoint
└─ 100s-1000s models → Multi-Model Endpoint
```

### Cost Optimization Best Practices

1. **Use Serverless for**:
   - Development and testing
   - Traffic < 1000 req/hour
   - Unpredictable workloads

2. **Use Real-Time for**:
   - Production with consistent traffic
   - Traffic > 1000 req/hour
   - Ultra-low latency requirements

3. **Use Batch Transform for**:
   - Periodic scoring (daily/weekly)
   - One-time bulk predictions
   - No real-time requirements

4. **Use Multi-Model for**:
   - Many small models
   - Per-customer/region models
   - Cost-sensitive multi-tenant apps

### Next Steps

**Lab 7**: SageMaker Pipelines, Experiments, and Model Explainability with Clarify

**Lab 8**: Model Monitor, Blue/Green, Canary, and Shadow Deployments

---

## Reflection Questions

1. **When** would you choose a serverless endpoint over a real-time endpoint?
2. **How** does multi-model endpoint reduce costs compared to individual endpoints?
3. **What** are the trade-offs between async endpoints and batch transform?
4. **Why** is auto-scaling important for production endpoints?

---

## Additional Resources

- [SageMaker Inference Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/deploy-model.html)
- [Serverless Inference Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/serverless-endpoints.html)
- [Multi-Model Endpoints](https://docs.aws.amazon.com/sagemaker/latest/dg/multi-model-endpoints.html)
- [Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform.html)


In [None]:
# Cleanup: Delete BYOC endpoint
# NameError means the BYOC deploy cell never ran in this kernel; any other
# exception is a real deletion failure and is reported instead of being hidden.
try:
    byoc_predictor.delete_endpoint(delete_endpoint_config=True)
    print(f"✅ Deleted BYOC endpoint: {byoc_endpoint_name}")
except NameError:
    print("⚠️  BYOC endpoint not deployed or already deleted")
except Exception as e:
    print(f"⚠️  Could not delete BYOC endpoint: {e}")

In [None]:
# BYOC: Test custom container endpoint with numeric data
# The custom container is sent an {"instances": [[...]]} payload here, unlike
# the {"features": [...]} payload used for the built-in scikit-learn endpoints.
test_data_byoc = {
    "instances": [np.random.randn(20).tolist()]
}

print("🧪 Testing BYOC endpoint...")
print(f"Input: 1 instance with 20 features")

try:
    response = byoc_predictor.predict(test_data_byoc)
    print("\n✅ Prediction successful!")
    print(json.dumps(response, indent=2))

    print("\n🔍 What just happened:")
    print("   1. Your custom Flask server received the request")
    print("   2. inference.py loaded model.pkl from /opt/ml/model/")
    print("   3. Prediction ran with custom preprocessing logic")
    print("   4. Response formatted as JSON")

except Exception as e:
    # Also reached when byoc_predictor is undefined (deploy cell not run yet)
    print(f"⚠️  Error: {e}")
    print("   Tip: Ensure Docker container is built and pushed to ECR")

print("""
💡 Custom Container Benefits:
   ✅ Full control over inference logic
   ✅ Add proprietary preprocessing libraries
   ✅ Optimize for specific hardware (GPU, custom chips)
   ✅ Include custom authentication/logging
   ✅ Support multiple data formats (JSON, CSV, binary)
   
📂 Container Files (docker/sklearn-custom/):
   • Dockerfile - Python 3.9 + Flask + sklearn
   • inference.py - Generic inference (numeric & text)
   • serve - Gunicorn entrypoint
   • requirements.txt - Python dependencies
   
🔧 Customization Ideas:
   • Add caching layer (Redis)
   • Include feature engineering pipelines
   • Support streaming inference
   • Add custom metrics/monitoring
""")

In [None]:
# BYOC: Deploy model using custom Docker container
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# `image_uri` is normally set by the "Build and push" cell. If that cell has
# not run in this kernel, derive the same URI (same repository name and tag)
# here so this cell does not stop with a NameError. The image must still
# exist in ECR for the deployment itself to succeed.
if 'image_uri' not in globals():
    _byoc_account_id = boto3.client('sts').get_caller_identity()['Account']
    image_uri = (
        f"{_byoc_account_id}.dkr.ecr.{region}.amazonaws.com/"
        "sagemaker-sklearn-custom:latest"
    )

# Use the same model we created earlier (RandomForest with 20 features)
print("🐳 Deploying with custom Docker container...")
print(f"   Model artifact: {model_s3_uri}")
print(f"   Container image: {image_uri}")

# Create SageMaker Model with custom container
byoc_model = Model(
    image_uri=image_uri,  # Our custom ECR image
    model_data=model_s3_uri,  # Use Lab 6 model
    role=role,
    sagemaker_session=session,
    name=f"byoc-model-{int(time.time())}"
)

# Deploy to endpoint
byoc_endpoint_name = f"byoc-endpoint-{int(time.time())}"

print("\n⏳ Deploying BYOC endpoint (4-6 minutes)...")
print("   This uses YOUR custom container from ECR")
print("   Inference logic: docker/sklearn-custom/inference.py")

# JSON in / JSON out, matching the payload sent by the BYOC test cell
byoc_predictor = byoc_model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=byoc_endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

print(f"\n✅ BYOC Endpoint deployed: {byoc_endpoint_name}")
print(f"📦 Model: RandomForestClassifier (20 features)")
print(f"🐳 Container: Custom scikit-learn with Flask")

In [None]:
# BYOC: Build and push custom Docker container to ECR
import boto3

# Get AWS account ID
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity()['Account']

# Keep the region chosen by the setup cell so the image lives in the same
# region as the endpoints; only fall back to the default boto3 session when
# no region has been configured yet in this kernel.
region = globals().get('region') or boto3.Session().region_name
if not region:
    # Without a region the image URI below would contain the text "None"
    raise RuntimeError(
        "No AWS region configured. Run the setup cell or set AWS_DEFAULT_REGION."
    )

# ECR repository details
ecr_repository = 'sagemaker-sklearn-custom'
image_tag = 'latest'
image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{ecr_repository}:{image_tag}"

print(f"📦 Custom Container Image URI:")
print(f"   {image_uri}")
print(f"\n🔨 To build and push the container:")
# The commands are only printed for the reader to run in a terminal;
# nothing is executed from this cell.
print(f"""
   cd docker/sklearn-custom
   
   # Build Docker image
   docker build -t {ecr_repository}:{image_tag} .
   
   # Create ECR repository (if doesn't exist)
   aws ecr create-repository --repository-name {ecr_repository} --region {region} || true
   
   # Login to ECR
   aws ecr get-login-password --region {region} | \\
       docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com
   
   # Tag and push
   docker tag {ecr_repository}:{image_tag} {image_uri}
   docker push {image_uri}
""")

---
## 🐳 Bonus: Bring Your Own Container (BYOC)

**Why Custom Containers?**
- Add custom preprocessing libraries
- Control the exact inference environment
- Include proprietary code or models
- Optimize performance for your specific use case

**Our Custom Container:**
- Python 3.9 with scikit-learn
- Flask-based inference server
- Automatic model loading from S3
- Supports text classification with TF-IDF

**Location:** `docker/sklearn-custom/`

Let's deploy using the custom container from ECR!