## Setup

In [20]:
import time
import boto3
import pandas as pd
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.model_monitor import CronExpressionGenerator
from sagemaker.model_monitor import EndpointInput

region = sagemaker.Session().boto_region_name
role = get_execution_role()
sm_session = Session()
sm_client = boto3.client("sagemaker", region_name=region)

bucket = "i32419"
prefix = "ai-deployment-monitoring-grupo-5"

endpoint_name = "loan-default-endpoint-2026-01-11-23-28-57"

## Model Card

In [21]:
from sagemaker.model_card import (
    ModelCard,
    ModelCardStatusEnum,
    ModelOverview,
    IntendedUses,
    TrainingDetails,
    ObjectiveFunction,
    Function,
    ObjectiveFunctionEnum,
    FacetEnum,
    RiskRatingEnum,
)

model_card_name = "aidm-grupo-5-loan-default-byoc"

model_overview = ModelOverview(
    model_name="Loan Default Binary Classifier (BYOC)",
    model_description="Binary classifier deployed with a custom BYOC container. Includes monitoring schedules.",
    problem_type="Binary Classification",
    algorithm_type="Custom container (sklearn/xgboost pipeline)",
    model_owner="AIDM Grupo 5",
)

intended_uses = IntendedUses(
    purpose_of_model="Predict probability of default for loan applications.",
    intended_uses="Decision support / risk scoring. Not a standalone decision maker.",
    factors_affecting_model_efficiency="Data drift and feature distribution shifts in applicant population.",
    risk_rating=RiskRatingEnum.MEDIUM,
    explanations_for_risk_rating="Business-impacting decisions; requires monitoring and human oversight.",
)

training_details = TrainingDetails(
    objective_function=ObjectiveFunction(
        function=Function(
            function=ObjectiveFunctionEnum.MAXIMIZE,
            facet=FacetEnum.AUC,
            condition="Validation AUC (primary objective)",
        )
    ),
    training_observations="Training executed in SageMaker Training Jobs; Hyperparameter tuning executed via SageMaker HPO.",
)

card = ModelCard(
    name=model_card_name,
    status=ModelCardStatusEnum.DRAFT,
    model_overview=model_overview,
    intended_uses=intended_uses,
    training_details=training_details,
    sagemaker_session=sm_session,
)

# cria ou atualiza
try:
    card.create()
    print("Created Model Card:", card.arn)
except Exception as e:
    # se já existir, faz update
    print("Create failed (likely exists). Updating instead. Error:", str(e)[:200])
    card.update()
    print("Updated Model Card:", card.arn)


INFO:sagemaker.model_card.model_card:Creating model card with name: aidm-grupo-5-loan-default-byoc


Create failed (likely exists). Updating instead. Error: An error occurred (ConflictException) when calling the CreateModelCard operation: Modelcard arn:aws:sagemaker:eu-west-1:267567228900:model-card/aidm-grupo-5-loan-default-byoc with version 1 already ex


## Garantir DataCapture no endpoint

In [22]:
desc = sm_client.describe_endpoint(EndpointName=endpoint_name)
cfg_name = desc["EndpointConfigName"]
cfg = sm_client.describe_endpoint_config(EndpointConfigName=cfg_name)

if "DataCaptureConfig" not in cfg or not cfg["DataCaptureConfig"].get("EnableCapture", False):
    raise RuntimeError(
        f"Endpoint {endpoint_name} does NOT have DataCapture enabled. "
        "Create a new endpoint with DataCaptureConfig in the deployment notebook."
    )

print("DataCapture enabled. Destination:",
      cfg["DataCaptureConfig"].get("DestinationS3Uri"))

DataCapture enabled. Destination: s3://i32419/ai-deployment-monitoring-grupo-5/datacapture


## Preparar baseline dataset (a partir do CSV local)

In [23]:
train_path = "data/train.csv"
val_path = "data/validation.csv"

train_df = pd.read_csv(train_path)

target_col = "Status"  # ajusta se necessário

features_df = train_df.drop(columns=[target_col])
baseline_local = "baseline_features.csv"

# amostra para baseline para ser mais rápido
features_df.sample(n=min(len(features_df), 2000), random_state=42).to_csv(baseline_local, index=False)

print("baseline local saved:", baseline_local, "shape:", features_df.shape)

baseline_s3_uri = sm_session.upload_data(
    baseline_local,
    bucket=bucket,
    key_prefix=f"{prefix}/monitoring/baseline"
)
print("Baseline S3 URI:", baseline_s3_uri)

baseline local saved: baseline_features.csv shape: (26329, 31)
Baseline S3 URI: s3://i32419/ai-deployment-monitoring-grupo-5/monitoring/baseline/baseline_features.csv


## Criar baseline statistics/constraints + schedule (DefaultModelMonitor)

In [24]:
monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

baseline_output_s3 = f"s3://{bucket}/{prefix}/monitoring/baseline-output"

monitor.suggest_baseline(
    baseline_dataset=baseline_s3_uri,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_output_s3,
    wait=True,
)

print("Baselining job:", monitor.latest_baselining_job.job_name)
print("Baseline output:", baseline_output_s3)

schedule_name = "aidm-grupo-5-loan-default-dataquality"

monitor_output_s3 = f"s3://{bucket}/{prefix}/monitoring/executions"

monitor.create_monitoring_schedule(
    monitor_schedule_name=schedule_name,
    endpoint_input=EndpointInput(
        endpoint_name=endpoint_name,
        destination="/opt/ml/processing/input"
    ),
    output_s3_uri=monitor_output_s3,
    statistics=monitor.baseline_statistics(),
    constraints=monitor.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.hourly(),
)

print("Created monitoring schedule:", schedule_name)
print("Monitoring outputs:", monitor_output_s3)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating processing-job with name baseline-suggestion-job-2026-01-11-23-34-22-491


................[34m2026-01-11 23:37:04.982616: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory[0m
[34m2026-01-11 23:37:04.982651: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.[0m
[34m2026-01-11 23:37:06.430985: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory[0m
[34m2026-01-11 23:37:06.431021: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)[0m
[34m2026-01-11 23:37:06.431046: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-0-143-243.eu-west-1.compute.internal): /proc/driver/nvidia/

INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: aidm-grupo-5-loan-default-dataquality


Created monitoring schedule: aidm-grupo-5-loan-default-dataquality
Monitoring outputs: s3://i32419/ai-deployment-monitoring-grupo-5/monitoring/executions


## Simular drift (gerar tráfego “normal” e “drifted”)

In [25]:
runtime_sm_client = boto3.client(service_name='sagemaker-runtime')

df = pd.read_csv("data/validation.csv")
X_val = df.drop(columns=["Status"])

def invoke_csv_batch(batch_df: pd.DataFrame):
    payload = batch_df.to_csv(index=False)
    resp = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="text/csv",
        Body=payload.encode("utf-8"),
    )
    return resp["Body"].read().decode("utf-8")

# 200 linhas normais (4 batches de 50)
for i in range(0, 200, 50):
    batch = X_val.iloc[i:i+50]
    _ = invoke_csv_batch(batch)
    time.sleep(0.2)

print("Sent 200 normal rows.")

# 200 linhas com drift (altera features numéricas)
drift_df = X_val.iloc[:200].copy()
num_cols = drift_df.select_dtypes(include=["number"]).columns
drift_df[num_cols] = drift_df[num_cols] * 3.0 + 100.0

for i in range(0, 200, 50):
    batch = drift_df.iloc[i:i+50]
    _ = invoke_csv_batch(batch)
    time.sleep(0.2)

print("Sent 200 drifted rows.")

Sent 200 normal rows.
Sent 200 drifted rows.


## Verificar execuções do monitor e violações

In [26]:
executions = monitor.list_executions()
print("Executions:", len(executions))
if executions:
    latest = executions[-1]
    desc = latest.describe()
    print("Latest status:", desc.get("ProcessingJobStatus"))
    print("Exit message:", desc.get("ExitMessage"))
    print("Output S3:", latest.output.destination)

Executions: 1
Latest status: Failed
Exit message: Error: Encoding mismatch: Encoding is CSV for endpointInput, but Encoding is JSON for endpointOutput. We currently only support the same type of input and output encoding at the moment.
Output S3: s3://i32419/ai-deployment-monitoring-grupo-5/monitoring/executions/loan-default-endpoint-2026-01-11-19-51-18/aidm-grupo-5-loan-default-dataquality/2026/01/11/21
