# SageMaker Endpoint Deployment with Inference Component

This notebook demonstrates how to deploy a model from SageMaker Model Registry to a SageMaker Endpoint using Inference Components.

## 1. Setup and Import Libraries

In [25]:
import boto3
import sagemaker
from sagemaker.model import Model
from sagemaker.pytorch import PyTorchModel
from sagemaker import image_uris
from datetime import datetime
import json
import time
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

## 2. Configure SageMaker Session

In [26]:
# SageMaker session
sagemaker_session = sagemaker.Session()
role = "arn:aws:iam::637423390840:role/WSParticipantRole" # need to change your role
region = sagemaker_session.boto_region_name

# Clients
sm_client = boto3.client('sagemaker', region_name=region)
s3_client = boto3.client('s3', region_name=region)

# S3 Bucket
bucket = sagemaker_session.default_bucket()
prefix = "qwen3-0-6-lora-samples"

print(f"Using bucket: {bucket}")
print(f"Using region: {region}")
print(f"Using role: {role}")

Using bucket: sagemaker-us-east-1-637423390840
Using region: us-east-1
Using role: arn:aws:iam::637423390840:role/WSParticipantRole


## 3. Load Model from Model Registry

In [27]:
# Model package group name (same as used in model registry)
model_package_group_name = "qwen3-0-6b-lora-models"

# List approved models in the model package group
response = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    ModelApprovalStatus='Approved',
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=10
)

if response['ModelPackageSummaryList']:
    # Get the latest approved model
    latest_model_package = response['ModelPackageSummaryList'][0]
    model_package_arn = latest_model_package['ModelPackageArn']
    
    print(f"=== Latest Approved Model ===")
    print(f"Model Package ARN: {model_package_arn}")
    print(f"Created: {latest_model_package['CreationTime']}")
    
    # Get detailed model information
    model_package_details = sm_client.describe_model_package(
        ModelPackageName=model_package_arn
    )
    
    if 'ModelPackageDescription' in model_package_details:
        print(f"Description: {model_package_details['ModelPackageDescription']}")
else:
    print("No approved models found in the registry.")
    print("Please approve a model in the Model Registry first.")
    model_package_arn = None

=== Latest Approved Model ===
Model Package ARN: arn:aws:sagemaker:us-east-1:637423390840:model-package/qwen3-0-6b-lora-models/5
Created: 2025-08-31 12:51:29.757000+00:00
Description: QWEN3-0.6B LoRA fine-tuned model | Training Job: qwen3-0-6b-lora-fine-tuning-lora-2025-08-31-12-28-29 | Eval Loss: 0.8944 | BLEU: 0.0528 | Samples: 15 | Timestamp: 20250831-125129


## 4. Create Model from Model Package

In [28]:
if model_package_arn:
    # Create model name with timestamp
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    model_name = f"qwen3-0-6b-lora-model-{timestamp}"
    
    # Option 1: Create model directly from model package (simpler)
    try:
        # This is the simplest way - let SageMaker handle everything from the model package
        create_model_response = sm_client.create_model_from_model_package(
            ModelName=model_name,
            ModelPackageName=model_package_arn
        )
        print(f"\n=== Model Created from Model Package ===")
        print(f"Model Name: {model_name}")
        print(f"Model ARN: {create_model_response['ModelArn']}")
    except:
        # Option 2: If the above doesn't work, extract container details manually
        # Get container details from model package
        model_package_details = sm_client.describe_model_package(
            ModelPackageName=model_package_arn
        )
        
        # Extract container configuration from inference specification
        container = model_package_details['InferenceSpecification']['Containers'][0]
        
        # Create model from model package
        create_model_response = sm_client.create_model(
            ModelName=model_name,
            ExecutionRoleArn=role,
            PrimaryContainer={
                'Image': container['Image'],
                'ModelDataUrl': container['ModelDataUrl'],
                'Environment': container.get('Environment', {})
            }
        )
        
        print(f"\n=== Model Created ===")
        print(f"Model Name: {model_name}")
        print(f"Model ARN: {create_model_response['ModelArn']}")
        print(f"Model Data: {container['ModelDataUrl']}")
else:
    print("Cannot proceed without a model package ARN")


=== Model Created ===
Model Name: qwen3-0-6b-lora-model-20250831-125155
Model ARN: arn:aws:sagemaker:us-east-1:637423390840:model/qwen3-0-6b-lora-model-20250831-125155
Model Data: s3://sagemaker-us-east-1-637423390840/qwen3-0-6-lora-samples/output/qwen3-0-6b-lora-fine-tuning-lora-2025-08-31-12-28-29/output/model.tar.gz


## 5. Create Endpoint Configuration with Inference Component Support

In [29]:
if model_package_arn:
    # Endpoint configuration name
    endpoint_config_name = f"qwen3-0-6b-lora-config-{timestamp}"
    
    # Create endpoint configuration
    # Using ml.g5.2xlarge for inference (1 GPU, 24GB memory)
    endpoint_config_response = sm_client.create_endpoint_config(
        EndpointConfigName=endpoint_config_name,
        ProductionVariants=[
            {
                'VariantName': 'AllTraffic',
                'ModelName': model_name,
                'InstanceType': 'ml.g5.2xlarge',
                'InitialInstanceCount': 1,
                'InitialVariantWeight': 1,
                'ManagedInstanceScaling': {
                    'Status': 'ENABLED',
                    'MinInstanceCount': 1,
                    'MaxInstanceCount': 1
                },
                'RoutingConfig': {
                    'RoutingStrategy': 'LEAST_OUTSTANDING_REQUESTS'
                }
            }
        ],
        Tags=[
            {'Key': 'Project', 'Value': 'MLOps-Workshop'},
            {'Key': 'Model', 'Value': 'QWEN3-0.6B'},
            {'Key': 'Type', 'Value': 'Inference'}
        ]
    )
    
    print(f"\n=== Endpoint Configuration Created ===")
    print(f"Config Name: {endpoint_config_name}")
    print(f"Config ARN: {endpoint_config_response['EndpointConfigArn']}")
    print(f"Instance Type: ml.g5.2xlarge")
    print(f"Initial Instance Count: 1")
    print(f"Auto-scaling: Enabled (1-2 instances)")


=== Endpoint Configuration Created ===
Config Name: qwen3-0-6b-lora-config-20250831-125155
Config ARN: arn:aws:sagemaker:us-east-1:637423390840:endpoint-config/qwen3-0-6b-lora-config-20250831-125155
Instance Type: ml.g5.2xlarge
Initial Instance Count: 1
Auto-scaling: Enabled (1-2 instances)


## 6. Create SageMaker Endpoint

In [30]:
if model_package_arn:
    # Endpoint name
    endpoint_name = f"qwen3-0-6b-lora-endpoint-{timestamp}"
    
    # Create endpoint
    create_endpoint_response = sm_client.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name,
        Tags=[
            {'Key': 'Project', 'Value': 'MLOps-Workshop'},
            {'Key': 'Model', 'Value': 'QWEN3-0.6B'},
            {'Key': 'DeploymentType', 'Value': 'RealTime'}
        ]
    )
    
    print(f"\n=== Creating Endpoint ===")
    print(f"Endpoint Name: {endpoint_name}")
    print(f"Endpoint ARN: {create_endpoint_response['EndpointArn']}")
    print(f"Status: Creating...")
    print(f"\nThis process will take 5-10 minutes. You can monitor progress in the SageMaker console.")


=== Creating Endpoint ===
Endpoint Name: qwen3-0-6b-lora-endpoint-20250831-125155
Endpoint ARN: arn:aws:sagemaker:us-east-1:637423390840:endpoint/qwen3-0-6b-lora-endpoint-20250831-125155
Status: Creating...

This process will take 5-10 minutes. You can monitor progress in the SageMaker console.


## 7. Wait for Endpoint to be Ready

In [31]:
if model_package_arn:
    # Wait for endpoint to be in service
    print("Waiting for endpoint to be in service...")
    
    waiter = sm_client.get_waiter('endpoint_in_service')
    
    try:
        waiter.wait(
            EndpointName=endpoint_name,
            WaiterConfig={
                'Delay': 30,  # Check every 30 seconds
                'MaxAttempts': 40  # Maximum 20 minutes
            }
        )
        
        # Get endpoint status
        endpoint_response = sm_client.describe_endpoint(EndpointName=endpoint_name)
        
        print(f"\n=== Endpoint Deployment Successful ===")
        print(f"Endpoint Name: {endpoint_name}")
        print(f"Status: {endpoint_response['EndpointStatus']}")
        print(f"Created: {endpoint_response['CreationTime']}")
        
    except Exception as e:
        print(f"Error waiting for endpoint: {e}")
        print("Please check the endpoint status in the SageMaker console.")

Waiting for endpoint to be in service...

=== Endpoint Deployment Successful ===
Endpoint Name: qwen3-0-6b-lora-endpoint-20250831-125155
Status: InService
Created: 2025-08-31 12:51:58.403000+00:00


## 8. Test the Endpoint

In [32]:
if model_package_arn:
    # Create a predictor for the endpoint
    from sagemaker.predictor import Predictor
    
    predictor = Predictor(
        endpoint_name=endpoint_name,
        sagemaker_session=sagemaker_session,
        serializer=JSONSerializer(),
        deserializer=JSONDeserializer()
    )
    
    print("=== Testing Endpoint ===")
    print("\nNote: If you get a timeout error, the model might still be loading.")
    print("Wait a few minutes and try again, or check CloudWatch logs for details.\n")
    
    # Start with a simple test
    simple_test = {
        "inputs": "Hi!",
        "parameters": {
            "max_new_tokens": 10,
            "temperature": 0.1,
            "do_sample": False
        }
    }

    response = predictor.predict(
        data=simple_test,
    )

=== Testing Endpoint ===

Note: If you get a timeout error, the model might still be loading.
Wait a few minutes and try again, or check CloudWatch logs for details.



In [33]:
response

{'generated_text': 'I need help with this problem. The problem is'}

## 9. Monitor Endpoint Metrics

In [34]:
if model_package_arn:
    # Get CloudWatch metrics
    cloudwatch_client = boto3.client('cloudwatch', region_name=region)
    
    print("\n=== Available CloudWatch Metrics ===")
    print(f"Namespace: AWS/SageMaker")
    print(f"Endpoint: {endpoint_name}")
    print("\nKey metrics to monitor:")
    print("- ModelLatency: Time taken for model inference")
    print("- OverheadLatency: Time taken by SageMaker overhead")
    print("- Invocations: Number of InvokeEndpoint requests")
    print("- Invocation4XXErrors: Number of 4XX errors")
    print("- Invocation5XXErrors: Number of 5XX errors")
    print("- ModelSetupTime: Time to load the model")
    print("\nYou can view these metrics in the CloudWatch console.")


=== Available CloudWatch Metrics ===
Namespace: AWS/SageMaker
Endpoint: qwen3-0-6b-lora-endpoint-20250831-125155

Key metrics to monitor:
- ModelLatency: Time taken for model inference
- OverheadLatency: Time taken by SageMaker overhead
- Invocations: Number of InvokeEndpoint requests
- Invocation4XXErrors: Number of 4XX errors
- Invocation5XXErrors: Number of 5XX errors
- ModelSetupTime: Time to load the model

You can view these metrics in the CloudWatch console.


## 10. Endpoint Information Summary

In [35]:
if model_package_arn:
    # Save endpoint information for future use
    endpoint_info = {
        "endpoint_name": endpoint_name,
        "endpoint_config_name": endpoint_config_name,
        "model_name": model_name,
        "model_package_arn": model_package_arn,
        "instance_type": "ml.g5.2xlarge",
        "created_time": timestamp,
        "region": region
    }
    
    # Save to JSON file
    with open('endpoint_info.json', 'w') as f:
        json.dump(endpoint_info, f, indent=2, default=str)
    
    print("\n=== Deployment Summary ===")
    print(f"Endpoint Name: {endpoint_name}")
    print(f"Model Package: {model_package_arn.split('/')[-1]}")
    print(f"Instance Type: ml.g5.2xlarge")
    print(f"Auto-scaling: 1-2 instances")
    print(f"Region: {region}")
    print("\nEndpoint information saved to 'endpoint_info.json'")
    print("\n=== How to Invoke the Endpoint ===")
    print("You can invoke this endpoint using:")
    print("1. SageMaker SDK (as shown above)")
    print("2. AWS SDK (boto3)")
    print("3. AWS CLI")
    print("4. HTTP API with SigV4 authentication")


=== Deployment Summary ===
Endpoint Name: qwen3-0-6b-lora-endpoint-20250831-125155
Model Package: 5
Instance Type: ml.g5.2xlarge
Auto-scaling: 1-2 instances
Region: us-east-1

Endpoint information saved to 'endpoint_info.json'

=== How to Invoke the Endpoint ===
You can invoke this endpoint using:
1. SageMaker SDK (as shown above)
2. AWS SDK (boto3)
3. AWS CLI
4. HTTP API with SigV4 authentication


## 11. Clean Up Resources (Optional)

In [36]:
# IMPORTANT: Only run this cell when you want to delete the endpoint
# This will incur no further charges

def cleanup_resources(endpoint_name, endpoint_config_name, model_name):
    """
    Clean up SageMaker resources to avoid charges.
    """
    try:
        # Delete endpoint
        print(f"Deleting endpoint: {endpoint_name}")
        sm_client.delete_endpoint(EndpointName=endpoint_name)
        print("Endpoint deletion initiated.")
        
        # Wait for endpoint to be deleted
        time.sleep(30)
        
        # Delete endpoint configuration
        print(f"Deleting endpoint configuration: {endpoint_config_name}")
        sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
        print("Endpoint configuration deleted.")
        
        # Delete model
        print(f"Deleting model: {model_name}")
        sm_client.delete_model(ModelName=model_name)
        print("Model deleted.")
        
        print("\n=== Cleanup Complete ===")
        print("All resources have been deleted.")
        
    except Exception as e:
        print(f"Error during cleanup: {e}")
        print("Some resources may need to be manually deleted from the SageMaker console.")

# Uncomment the following lines to delete resources:
# if model_package_arn:
#     cleanup_resources(endpoint_name, endpoint_config_name, model_name)

## Next Steps

Now that you have successfully deployed the model to a SageMaker endpoint:

1. **Integration**: Integrate the endpoint with your applications using AWS SDKs
2. **Monitoring**: Set up CloudWatch alarms for endpoint metrics
3. **A/B Testing**: Deploy multiple model variants for comparison
4. **Pipeline Integration**: Include this deployment in your SageMaker Pipeline
5. **Cost Optimization**: Consider using SageMaker Multi-Model Endpoints or Serverless Inference for cost savings

The endpoint is now ready to serve real-time predictions!