# Model Monitoring
This notebook takes a trained model as input (in the format of an h5 file), deploys it to a SageMaker endpoint, and sets up model-quality monitoring, infrastructure monitoring, and a CloudWatch dashboard for that endpoint.

In [1]:
import json
import os
import time

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.model_monitor import ModelQualityMonitor
from sagemaker.tensorflow import TensorFlowModel

In [3]:
# Shared AWS handles used by the rest of the notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# One boto3 session supplies both the region name and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto_session.client(service_name="sagemaker", region_name=region)

# S3 prefix (inside the session's default bucket) where model archives are stored.
# This only builds the path string; nothing is created in S3 here.
s3_model_path = f's3://{bucket}/group-5/models'
print(s3_model_path)

s3://sagemaker-us-east-1-399018723364/group-5/models


In [4]:
# Upload the local model archive to the S3 model prefix.
# `$s3_model_path` is IPython shell interpolation of the Python variable defined above.
!aws s3 cp 'fer_best_model.tar.gz' $s3_model_path/

upload: ./fer_best_model.tar.gz to s3://sagemaker-us-east-1-399018723364/group-5/models/fer_best_model.tar.gz


In [7]:
# Deploy the uploaded model archive to a SageMaker endpoint.
#
# NOTE(review): the last recorded run of this cell failed with
# "did not pass the ping health check". Presumably the archive layout or
# inference.py is not what the serving container expects -- check the
# endpoint's CloudWatch logs before re-running.
s3_model_tar = "/".join([s3_model_path, "fer_best_model.tar.gz"])

# Collect the model settings in one place, then build the model from them.
model_settings = dict(
    model_data=s3_model_tar,
    role=role,
    framework_version='2.8',
    entry_point='inference.py',
    source_dir='/home/sagemaker-user/AAI_540-Group-Project/',
)
model = TensorFlowModel(**model_settings)

# Single-instance deployment; later cells read predictor.endpoint_name.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.large')
print('Model deployed at {}'.format(predictor.endpoint_name))


-------------------------------------------*

UnexpectedStatusException: Error hosting endpoint tensorflow-inference-2025-02-11-03-23-17-547: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html

In [None]:
# Set up model quality monitoring for the deployed endpoint.
#
# ModelQualityMonitor scores captured endpoint predictions against ground-truth
# labels, so the schedule needs a problem type and a ground-truth S3 location in
# addition to the endpoint itself.
monitor = ModelQualityMonitor(
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    volume_size_in_gb=10,
    max_runtime_in_seconds=3600,
    sagemaker_session=sess
)

# NOTE(review): placeholder prefix -- point this at wherever ground-truth labels
# for captured inferences are actually uploaded.
ground_truth_s3_uri = 's3://{}/monitoring_ground_truth/'.format(bucket)

# NOTE(review): assumes the FER model is a multi-class classifier -- confirm.
# Valid values are 'Regression', 'BinaryClassification', 'MulticlassClassification'.
problem_type = 'MulticlassClassification'

# NOTE(review): model quality jobs read captured inference data, so the endpoint
# presumably needs data capture enabled at deploy time, and the inference /
# probability attribute may need to be given via an EndpointInput -- verify.
monitor.create_monitoring_schedule(
    endpoint_input=predictor.endpoint_name,
    ground_truth_input=ground_truth_s3_uri,
    problem_type=problem_type,
    output_s3_uri='s3://{}/monitoring_reports/'.format(bucket),
    # The keyword is `schedule_cron_expression`; `schedule_expression` is not
    # accepted by create_monitoring_schedule and raises TypeError.
    schedule_cron_expression="cron(0 * ? * * *)"  # Run every hour
)

print("Model monitoring enabled.")

In [None]:
# Set up infrastructure monitoring for the endpoint.
#
# SageMaker itself publishes instance-level metrics (CPUUtilization, etc.) for
# endpoints under the '/aws/sagemaker/Endpoints' namespace, keyed by
# EndpointName and VariantName. Rather than pushing hand-written sample values
# (a 'Namespace' key inside a MetricData entry is rejected by boto3, and the
# 'AWS/' namespace prefix is reserved for AWS services), attach an alarm to the
# real metric.

cloudwatch = boto3.client('cloudwatch')

# 'AllTraffic' is the production variant name reported for this endpoint.
endpoint_dimensions = [
    {'Name': 'EndpointName', 'Value': predictor.endpoint_name},
    {'Name': 'VariantName', 'Value': 'AllTraffic'},
]

# NOTE(review): threshold and periods are starting values -- tune them.
# SageMaker reports CPUUtilization summed across cores, so the usable range
# depends on the instance's core count; confirm against the SageMaker docs.
cpu_alarm_threshold_percent = 80.0

cloudwatch.put_metric_alarm(
    AlarmName=f'{predictor.endpoint_name}-high-cpu',
    AlarmDescription='Average endpoint CPU utilization is above the threshold.',
    Namespace='/aws/sagemaker/Endpoints',
    MetricName='CPUUtilization',
    Dimensions=endpoint_dimensions,
    Statistic='Average',
    Period=300,               # evaluate 5-minute averages
    EvaluationPeriods=3,      # require 3 consecutive breaching periods
    Threshold=cpu_alarm_threshold_percent,
    ComparisonOperator='GreaterThanThreshold',
    TreatMissingData='notBreaching',  # an idle endpoint should not raise the alarm
    Unit='Percent',
)
print("Infrastructure monitoring enabled.")


In [None]:
# Create a dashboard in cloudwatch
#
# The body is built as a Python dict and serialized with json.dumps so that
# quoting/escaping is always valid JSON (no hand-escaped braces in an f-string).
dashboard_name = "SageMakerMonitoringDashboard"

# Endpoint instance metrics live in '/aws/sagemaker/Endpoints' and are keyed by
# both EndpointName and VariantName; 'AllTraffic' is the production variant
# name reported for this endpoint.
cpu_metric = [
    "/aws/sagemaker/Endpoints", "CPUUtilization",
    "EndpointName", predictor.endpoint_name,
    "VariantName", "AllTraffic",
]

dashboard_body = {
    "widgets": [
        {
            "type": "metric",
            "x": 0,
            "y": 0,
            "width": 6,
            "height": 6,
            "properties": {
                "metrics": [cpu_metric],
                "view": "timeSeries",
                "stacked": False,
                # Use the session's region instead of a hard-coded one.
                "region": region,
                "title": "CPU Utilization",
            },
        }
    ]
}

cloudwatch.put_dashboard(
    DashboardName=dashboard_name,
    DashboardBody=json.dumps(dashboard_body),
)

print(f"CloudWatch dashboard '{dashboard_name}' created.")
