# Model Registry - Register Model Based on Performance

This notebook registers the fine-tuned model to SageMaker Model Registry if performance criteria are met.

## 1. Setup and Import Libraries

In [None]:
import boto3
import sagemaker
from sagemaker.model import Model
from sagemaker.pytorch import PyTorchModel
from datetime import datetime
import json
import time

## 2. Configure SageMaker Session

In [None]:
# Session, region, and execution role shared by the rest of the notebook
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = "arn:aws:iam::637423390840:role/WSParticipantRole"  # replace with your own role ARN

# Default bucket of the session and the key prefix used by this notebook
bucket = sagemaker_session.default_bucket()
prefix = "qwen3-0-6-lora-samples"

# boto3 clients pinned to the session's region
sm_client, s3_client = (
    boto3.client(service_name, region_name=region)
    for service_name in ("sagemaker", "s3")
)

print(f"Using bucket: {bucket}")
print(f"Using region: {region}")

## 3. Load Evaluation Metrics

In [None]:
# Name of the evaluation processing job from the previous notebook
processing_job_name = "qwen3-evaluation-2025-08-31-12-41-53"  # UPDATE THIS

# S3 key of the metrics JSON written by that job
metrics_key = f"{prefix}/evaluation/{processing_job_name}/evaluation_metrics.json"

# On success `evaluation_metrics` is a dict that contains 'eval_loss',
# 'bleu_score' and 'num_samples'; on any failure it is None so later cells
# can skip registration instead of crashing.
try:
    response = s3_client.get_object(Bucket=bucket, Key=metrics_key)
    evaluation_metrics = json.loads(response['Body'].read())

    print("=== Current Model Performance ===")

    # Handle different metric formats (CPU vs GPU evaluation)
    if 'eval_loss' in evaluation_metrics:
        # Original GPU-based evaluation format
        print(f"Evaluation Loss: {evaluation_metrics['eval_loss']:.4f}")
        print(f"BLEU Score: {evaluation_metrics['bleu_score']:.4f}")
    elif 'text_similarity' in evaluation_metrics:
        # CPU-based evaluation format
        print(f"Text Similarity: {evaluation_metrics['text_similarity']:.4f}")
        print(f"Average Prediction Length: {evaluation_metrics['avg_prediction_length']:.2f} words")
        print(f"Average Reference Length: {evaluation_metrics['avg_reference_length']:.2f} words")
        print(f"Evaluation Type: {evaluation_metrics.get('evaluation_type', 'N/A')}")

        # Derive the keys the registry cells expect. These are stand-ins
        # computed from similarity, not a real loss or a real BLEU score.
        evaluation_metrics['eval_loss'] = 1.0 - evaluation_metrics['text_similarity']
        evaluation_metrics['bleu_score'] = evaluation_metrics['text_similarity'] * 0.5

        print(f"\n=== Converted Metrics for Registry ===")
        print(f"Derived Eval Loss: {evaluation_metrics['eval_loss']:.4f}")
        print(f"Derived BLEU Score: {evaluation_metrics['bleu_score']:.4f}")
    else:
        # Neither format matched: the performance check needs 'eval_loss' and
        # 'bleu_score', so treat this as a load failure rather than keeping a
        # dict that would raise KeyError in a later cell.
        raise ValueError(
            "Unknown metrics format: expected 'eval_loss' or 'text_similarity', "
            f"found keys {list(evaluation_metrics)}"
        )

    print(f"Number of test samples: {evaluation_metrics['num_samples']}")

except Exception as e:
    # Broad on purpose: S3 errors, malformed JSON and missing keys all mean
    # "no usable metrics" for the cells that follow.
    print(f"Error loading metrics: {e}")
    print("Please ensure the evaluation job has completed successfully")
    evaluation_metrics = None

## 4. Define Performance Thresholds

In [None]:
# Choose thresholds and baseline by evaluation flavour: metrics that carry
# 'text_similarity' came from the CPU-based evaluation, everything else
# (including "no metrics loaded") uses the GPU-based values.
uses_cpu_metrics = bool(evaluation_metrics) and 'text_similarity' in evaluation_metrics

if not uses_cpu_metrics:
    # GPU-based evaluation: max acceptable loss / min acceptable BLEU
    EVAL_LOSS_THRESHOLD, BLEU_SCORE_THRESHOLD = 10, 0
    # Example baseline, used only for comparison with previous models
    baseline_metrics = dict(eval_loss=11, bleu_score=0.00)
else:
    # CPU-based evaluation: loss is (1 - text_similarity), BLEU is derived
    EVAL_LOSS_THRESHOLD, BLEU_SCORE_THRESHOLD = 10, 0.00
    # Example baseline for the CPU evaluation
    baseline_metrics = dict(eval_loss=10, bleu_score=0.00)

# Echo the active configuration
threshold_report = (
    f"\n=== Performance Thresholds ===",
    f"Max Eval Loss: {EVAL_LOSS_THRESHOLD}",
    f"Min BLEU Score: {BLEU_SCORE_THRESHOLD}",
    f"\nBaseline Eval Loss: {baseline_metrics['eval_loss']}",
    f"Baseline BLEU Score: {baseline_metrics['bleu_score']}",
)
for report_line in threshold_report:
    print(report_line)

## 5. Evaluate Model Performance

In [None]:
# Check if model meets performance criteria
def check_model_performance(metrics, thresholds, baseline=None):
    """
    Decide whether a model's evaluation metrics qualify it for registration.

    Args:
        metrics: Mapping with numeric 'eval_loss' and 'bleu_score' entries.
        thresholds: Mapping with 'max_loss' (highest acceptable eval loss)
            and 'min_bleu' (lowest acceptable BLEU score).
        baseline: Optional mapping with 'eval_loss' and 'bleu_score' of a
            reference model. It only adds informational messages and never
            changes the decision.

    Returns:
        (should_register, reasons): the decision and one human-readable
        message per check performed.

    Raises:
        KeyError: If a required key is missing from one of the mappings.
    """
    import math  # local import keeps this cell self-contained

    eval_loss = metrics['eval_loss']
    bleu_score = metrics['bleu_score']

    reasons = []
    should_register = True

    # Check absolute thresholds.
    # NaN compares False against everything, so a plain `>` / `<` test would
    # let a NaN metric (which json.loads accepts) slip through as "meets
    # threshold". Reject it explicitly.
    if math.isnan(eval_loss):
        should_register = False
        reasons.append("Eval loss is NaN and cannot be compared against the threshold")
    elif eval_loss > thresholds['max_loss']:
        should_register = False
        reasons.append(f"Eval loss {eval_loss:.4f} exceeds threshold {thresholds['max_loss']}")
    else:
        reasons.append(f"✓ Eval loss {eval_loss:.4f} meets threshold")

    if math.isnan(bleu_score):
        should_register = False
        reasons.append("BLEU score is NaN and cannot be compared against the threshold")
    elif bleu_score < thresholds['min_bleu']:
        should_register = False
        reasons.append(f"BLEU score {bleu_score:.4f} below threshold {thresholds['min_bleu']}")
    else:
        reasons.append(f"✓ BLEU score {bleu_score:.4f} meets threshold")

    # Check improvement over baseline (informational only)
    if baseline:
        if eval_loss < baseline['eval_loss']:
            reasons.append(f"✓ Improved loss: {eval_loss:.4f} < {baseline['eval_loss']:.4f}")
        else:
            reasons.append("No improvement in loss over baseline")

        if bleu_score > baseline['bleu_score']:
            reasons.append(f"✓ Improved BLEU: {bleu_score:.4f} > {baseline['bleu_score']:.4f}")
        else:
            reasons.append("No improvement in BLEU over baseline")

    return should_register, reasons

# Run the performance check only when metrics were loaded; otherwise make
# sure `should_register` is defined and False for the registration cell.
if evaluation_metrics:
    thresholds = {
        'max_loss': EVAL_LOSS_THRESHOLD,
        'min_bleu': BLEU_SCORE_THRESHOLD
    }

    should_register, reasons = check_model_performance(
        evaluation_metrics,
        thresholds,
        baseline_metrics
    )

    # A single backslash is required here: "\\n" would print a literal "\n"
    # instead of starting a new line.
    print("\n=== Performance Check Results ===")
    for reason in reasons:
        print(reason)

    print(f"\nDecision: {'REGISTER MODEL ✓' if should_register else 'DO NOT REGISTER ✗'}")
else:
    should_register = False
    print("Cannot evaluate performance - metrics not available")

## 6. Create Model Package Group

In [None]:
# Registry group that collects the versions of this model
model_package_group_name = "qwen3-0-6b-lora-models"

# Look the group up first; create it only when the lookup says it is missing
try:
    response = sm_client.describe_model_package_group(
        ModelPackageGroupName=model_package_group_name
    )
except Exception as e:
    group_is_missing = (
        'does not exist' in str(e)
        or 'ResourceNotFound' in str(e.__class__.__name__)
    )
    if not group_is_missing:
        # Any other failure is unexpected; pass it on unchanged
        raise e

    print(f"Creating model package group '{model_package_group_name}'...")
    group_tags = [
        {'Key': tag_key, 'Value': tag_value}
        for tag_key, tag_value in (
            ('Project', 'MLOps-Workshop'),
            ('Model', 'QWEN3-0.6B'),
            ('Technique', 'LoRA'),
        )
    ]
    response = sm_client.create_model_package_group(
        ModelPackageGroupName=model_package_group_name,
        ModelPackageGroupDescription="Fine-tuned QWEN3-0.6B models with LoRA",
        Tags=group_tags
    )
    print(f"Model package group created: {response['ModelPackageGroupArn']}")
else:
    print(f"Model package group '{model_package_group_name}' already exists")

## 7. Register Model to Model Registry

In [None]:
# Register a new model package version when the performance check passed;
# otherwise record that nothing was registered.
if not (should_register and evaluation_metrics):
    print("\n=== Model Not Registered ===")
    print("Model does not meet performance criteria or metrics unavailable")
    registered_model_package_arn = None
else:
    # Locate the model artifacts produced by the training job
    training_job_name = "qwen3-0-6b-lora-fine-tuning-lora-2025-08-31-12-28-29"
    training_job = sm_client.describe_training_job(TrainingJobName=training_job_name)
    model_artifacts_uri = training_job['ModelArtifacts']['S3ModelArtifacts']

    # Resolve the PyTorch inference container image for this region
    from sagemaker import image_uris
    inference_image = image_uris.retrieve(
        framework="pytorch",
        region=region,
        version="2.6.0",
        py_version="py312",
        instance_type="ml.g5.2xlarge",
        image_scope="inference"
    )

    # Environment handed to the inference container.
    # NOTE(review): model_artifacts_uri is presumably a .../model.tar.gz
    # object URI, so appending "/code" may not point at a real location -
    # confirm what SAGEMAKER_SUBMIT_DIRECTORY should be for this container.
    container_environment = {
        "SAGEMAKER_PROGRAM": "inference.py",
        "SAGEMAKER_SUBMIT_DIRECTORY": f"{model_artifacts_uri}/code",
        "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        "SAGEMAKER_REGION": region,
        "MODEL_NAME": "Qwen/Qwen3-0.6B",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
    }

    # Same instance types are offered for batch transform and realtime hosting
    supported_instance_types = [
        f"ml.g5.{size}" for size in ("xlarge", "2xlarge", "4xlarge")
    ]
    json_mime_types = ["application/json"]

    inference_spec = {
        "Containers": [
            {
                "Image": inference_image,
                "ModelDataUrl": model_artifacts_uri,
                "Environment": container_environment
            }
        ],
        "SupportedTransformInstanceTypes": list(supported_instance_types),
        "SupportedRealtimeInferenceInstanceTypes": list(supported_instance_types),
        "SupportedContentTypes": list(json_mime_types),
        "SupportedResponseMIMETypes": list(json_mime_types)
    }

    # Human-readable description that carries the key metrics of this version
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    model_description = " | ".join([
        "QWEN3-0.6B LoRA fine-tuned model",
        f"Training Job: {training_job_name}",
        f"Eval Loss: {evaluation_metrics['eval_loss']:.4f}",
        f"BLEU: {evaluation_metrics['bleu_score']:.4f}",
        f"Samples: {evaluation_metrics['num_samples']}",
        f"Timestamp: {timestamp}",
    ])

    # Point the registry at the metrics file that was evaluated
    metrics_s3_uri = f"s3://{bucket}/{prefix}/evaluation/{processing_job_name}/evaluation_metrics.json"
    model_quality_metrics = {
        "ModelQuality": {
            "Statistics": {
                "ContentType": "application/json",
                "S3Uri": metrics_s3_uri
            }
        }
    }

    # New versions start as pending so a human has to approve them.
    # No Tags are passed here; tags are set on the model package group.
    initial_approval_status = "PendingManualApproval"
    model_package_response = sm_client.create_model_package(
        ModelPackageGroupName=model_package_group_name,
        ModelPackageDescription=model_description,
        InferenceSpecification=inference_spec,
        ModelApprovalStatus=initial_approval_status,
        ModelMetrics=model_quality_metrics
    )

    model_package_arn = model_package_response['ModelPackageArn']
    print(f"\n=== Model Registration Successful ===")
    print(f"Model Package ARN: {model_package_arn}")
    print(f"Status: {initial_approval_status}")

    # Keep the ARN for the approval cell
    registered_model_package_arn = model_package_arn

## 8. Approve Model (Optional - Manual Step)

In [None]:
# Interactively approve the model package registered above (if any).
# Only an explicit "yes" approves; any other answer leaves the status as is.
if registered_model_package_arn:
    approve_model = input("Do you want to approve this model for deployment? (yes/no): ")

    # Strip surrounding whitespace so an answer like "yes " is not silently
    # treated as a refusal.
    if approve_model.strip().lower() == 'yes':
        response = sm_client.update_model_package(
            ModelPackageArn=registered_model_package_arn,
            ModelApprovalStatus="Approved",
            ApprovalDescription="Model meets performance criteria and approved for deployment"
        )
        # Single backslash: "\\n" would print a literal "\n"
        print("\n✓ Model approved for deployment")
    else:
        print("\nModel remains in PendingManualApproval status")

## 9. List Registered Models

In [None]:
# List the 10 most recently created model packages in the group
response = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=10
)

# Single backslashes below: "\\n" would print a literal "\n"
print(f"\n=== Models in {model_package_group_name} ===")
for idx, model_package in enumerate(response['ModelPackageSummaryList'], 1):
    print(f"\n{idx}. Model Package:")
    print(f"   ARN: {model_package['ModelPackageArn']}")
    print(f"   Status: {model_package['ModelApprovalStatus']}")
    print(f"   Created: {model_package['CreationTime']}")

    # The description is optional extra detail: a failed lookup must not stop
    # the listing, but it is reported instead of being silently dropped, and
    # KeyboardInterrupt is no longer swallowed.
    try:
        details = sm_client.describe_model_package(
            ModelPackageName=model_package['ModelPackageArn']
        )
    except Exception as e:
        print(f"   Description unavailable: {e}")
    else:
        if 'ModelPackageDescription' in details:
            print(f"   Description: {details['ModelPackageDescription'][:100]}...")

## 10. Export Model Package ARN for Deployment

In [None]:
# Export the latest approved model package ARN for deployment.
# Ask the registry for approved packages directly instead of filtering the
# earlier listing: that listing holds only the 10 newest packages of any
# status, so an approved model behind 10 newer pending ones would be missed.
approved_response = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    ModelApprovalStatus='Approved',
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=10
)
approved_models = [
    model_package['ModelPackageArn']
    for model_package in approved_response['ModelPackageSummaryList']
]

if approved_models:
    # Results are sorted newest first, so the first entry is the latest
    latest_approved_model_arn = approved_models[0]
    # Single backslashes below: "\\n" would print a literal "\n"
    print(f"\n=== Latest Approved Model ===")
    print(f"ARN: {latest_approved_model_arn}")
    print("\nThis model is ready for deployment to an endpoint.")

    # Save to file for next notebook
    with open('latest_model_arn.txt', 'w', encoding='utf-8') as f:
        f.write(latest_approved_model_arn)
    print("Model ARN saved to 'latest_model_arn.txt'")
else:
    print("\nNo approved models found. Please approve a model before deployment.")