## Setup

In [19]:
import time
import boto3
import pandas as pd
import re
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.model_monitor import CronExpressionGenerator
from sagemaker.model_monitor import EndpointInput
from sagemaker.model_card import (
    ModelCard,
    ModelCardStatusEnum,
    ModelOverview,
    IntendedUses,
    TrainingDetails,
    ObjectiveFunction,
    Function,
    ObjectiveFunctionEnum,
    FacetEnum,
    RiskRatingEnum,
    BusinessDetails,
    EvaluationJob,
    MetricGroup,
    Metric,
    MetricTypeEnum,
)

region = sagemaker.Session().boto_region_name
role = get_execution_role()
sm_session = Session()
sm_client = boto3.client("sagemaker", region_name=region)

bucket = "i32419"
prefix = "ai-deployment-monitoring-grupo-5"

endpoint_name = "loan-default-endpoint-2026-01-12-03-53-20"

## Model Card

In [15]:
model_card_name = "aidm-grupo-5-loan-default-byoc"
model_package_group_name = "aidm-grupo-5-loan-default"

# ModelArn do recurso Model que está por trás do endpoint
model_arn = "arn:aws:sagemaker:eu-west-1:267567228900:model/loan-default-model-2026-01-11-23-57-24"

def get_latest_approved_model_package_arn(sm_client, model_package_group_name: str) -> str:
    resp = sm_client.list_model_packages(
        ModelPackageGroupName=model_package_group_name,
        SortBy="CreationTime",
        SortOrder="Descending",
        MaxResults=100,
    )
    approved = [
        p for p in resp.get("ModelPackageSummaryList", [])
        if p.get("ModelApprovalStatus") == "Approved"
    ]
    if not approved:
        raise RuntimeError(f"Não existe nenhuma versão Approved no Model Package Group '{model_package_group_name}'.")
    return approved[0]["ModelPackageArn"]


def extract_training_job_name_from_model_data_url(model_data_url: str) -> str:
    """
    Espera um padrão do tipo:
      s3://.../<training-job-name>/output/model.tar.gz
    Retorna <training-job-name>
    """
    # robusto para qualquer nome de training job
    m = re.search(r"/([^/]+)/output/model\.tar\.gz$", model_data_url)
    if not m:
        # fallback: tenta capturar o segmento antes de "/output/"
        m2 = re.search(r"/([^/]+)/output/", model_data_url)
        if not m2:
            raise RuntimeError(f"Não consegui inferir TrainingJobName do ModelDataUrl: {model_data_url}")
        return m2.group(1)
    return m.group(1)


def get_training_job_auc(sm_client, training_job_name: str):
    tj = sm_client.describe_training_job(TrainingJobName=training_job_name)
    training_job_arn = tj["TrainingJobArn"]

    final_metrics = tj.get("FinalMetricDataList", []) or []
    metric_map = {m["MetricName"]: m["Value"] for m in final_metrics}

    auc = None
    if "validation_auc" in metric_map:
        auc = float(metric_map["validation_auc"])
    elif "ObjectiveMetric" in metric_map:
        auc = float(metric_map["ObjectiveMetric"])
    else:
        # fallback: procura qualquer métrica com "auc"
        for k, v in metric_map.items():
            if "auc" in (k or "").lower():
                auc = float(v)
                break

    return auc, training_job_arn, metric_map


# Descobrir melhor training job via Model Registry (Approved Model Package)
model_package_arn = get_latest_approved_model_package_arn(sm, model_package_group_name)
mp = sm_client.describe_model_package(ModelPackageName=model_package_arn)

containers = mp["InferenceSpecification"]["Containers"]
model_data_url = containers[0]["ModelDataUrl"]

training_job_name = extract_training_job_name_from_model_data_url(model_data_url)

validation_auc, training_job_arn, metric_map = get_training_job_auc(sm, training_job_name)

print("ModelPackageArn:", model_package_arn)
print("ModelDataUrl:", model_data_url)
print("Inferred TrainingJobName:", training_job_name)
print("validation_auc:", validation_auc)
print("Available metrics:", list(metric_map.keys()))


# Construir Model Card sections
model_overview = ModelOverview(
    model_id=model_arn,  # mantém a ligação ao dashboard
    model_name="Loan Default Binary Classifier (BYOC)",
    model_description=(
        "Binary classifier deployed with a custom BYOC container. Includes monitoring schedules. "
        f"Model Package Group: {model_package_group_name}. "
        f"Approved Model Package: {model_package_arn}."
    ),
    problem_type="Binary Classification",
    algorithm_type="Custom container (BYOC)",
    model_owner="AIDM Grupo 5",
)

intended_uses = IntendedUses(
    purpose_of_model="Predict probability of default for loan applications.",
    intended_uses="Decision support / risk scoring. Not a standalone decision maker.",
    factors_affecting_model_efficiency="Data drift and feature distribution shifts in applicant population.",
    risk_rating=RiskRatingEnum.MEDIUM,
    explanations_for_risk_rating="Business-impacting decisions; requires monitoring and human oversight.",
)

business_details = BusinessDetails(
    business_problem="Estimate probability of loan default at application time to reduce credit risk.",
    business_stakeholders="Credit Risk/Underwriting; Compliance; Data Science; Operations.",
    line_of_business="Retail Banking / Consumer Lending",
)

training_details = TrainingDetails(
    objective_function=ObjectiveFunction(
        function=Function(
            function=ObjectiveFunctionEnum.MAXIMIZE,
            facet=FacetEnum.AUC,
            condition="Validation AUC (primary objective)",
        )
    ),
    training_observations=(
        "Training executed in SageMaker Training Jobs; hyperparameter tuning via SageMaker HPO. "
        f"Best/Approved artifact inferred from Model Registry. TrainingJobName: {training_job_name}."
    ),
)

metric_groups = []
if validation_auc is not None:
    metric_groups = [
        MetricGroup(
            name="Validation metrics",
            metric_data=[
                Metric(
                    name="validation_auc",
                    type=MetricTypeEnum.NUMBER,
                    value=validation_auc,
                    notes=f"Sourced from DescribeTrainingJob.FinalMetricDataList ({training_job_name})",
                )
            ],
        )
    ]

evaluation_details = [
    EvaluationJob(
        name="Offline evaluation (validation split)",
        evaluation_job_arn=training_job_arn,
        evaluation_observation="Validation metric reported during training job execution; linked to approved model package.",
        metric_groups=metric_groups if metric_groups else None,
        metadata={
            "model_package_arn": model_package_arn,
            "model_data_url": model_data_url,
            "training_job_name": training_job_name,
            "metric_source": "training_job_final_metrics_inferred_from_registry",
        },
    )
]

# Create ou update
card = ModelCard(
    name=model_card_name,
    status=ModelCardStatusEnum.DRAFT,
    model_overview=model_overview,
    intended_uses=intended_uses,
    business_details=business_details,
    training_details=training_details,
    evaluation_details=evaluation_details,
    sagemaker_session=sm_session,
)

try:
    card.create()
    print("Created Model Card:", card.arn)
except Exception as e:
    print("Create failed (likely exists). Loading and updating. Error:", str(e)[:200])

    existing = ModelCard.load(name=model_card_name, sagemaker_session=sm_session)
    existing.status = ModelCardStatusEnum.DRAFT
    existing.model_overview = model_overview
    existing.intended_uses = intended_uses
    existing.business_details = business_details
    existing.training_details = training_details
    existing.evaluation_details = evaluation_details

    existing.update()
    print("Updated Model Card:", existing.arn)

ModelPackageArn: arn:aws:sagemaker:eu-west-1:267567228900:model-package/aidm-grupo-5-loan-default/1
ModelDataUrl: s3://i32419/ai-deployment-monitoring-grupo-5/aidm-loan-default/training-output/grupo-5-aidm-loan-de-260111-2226-002-23c01fb8/output/model.tar.gz
Inferred TrainingJobName: grupo-5-aidm-loan-de-260111-2226-002-23c01fb8
validation_auc: 1.0
Available metrics: ['validation_auc', 'ObjectiveMetric']
Create failed (likely exists). Loading and updating. Error: An error occurred (ConflictException) when calling the CreateModelCard operation: Modelcard arn:aws:sagemaker:eu-west-1:267567228900:model-card/aidm-grupo-5-loan-default-byoc with version 1 already ex
Updated Model Card: arn:aws:sagemaker:eu-west-1:267567228900:model-card/aidm-grupo-5-loan-default-byoc


## Garantir DataCapture no endpoint

In [20]:
desc = sm_client.describe_endpoint(EndpointName=endpoint_name)
cfg_name = desc["EndpointConfigName"]
cfg = sm_client.describe_endpoint_config(EndpointConfigName=cfg_name)

if "DataCaptureConfig" not in cfg or not cfg["DataCaptureConfig"].get("EnableCapture", False):
    raise RuntimeError(
        f"Endpoint {endpoint_name} does NOT have DataCapture enabled. "
        "Create a new endpoint with DataCaptureConfig in the deployment notebook."
    )

print("DataCapture enabled. Destination:",
      cfg["DataCaptureConfig"].get("DestinationS3Uri"))

DataCapture enabled. Destination: s3://i32419/ai-deployment-monitoring-grupo-5/datacapture


## Preparar baseline dataset (a partir do CSV local)

In [21]:
train_path = "data/train.csv"
val_path = "data/validation.csv"

train_df = pd.read_csv(train_path)

target_col = "Status"  # ajusta se necessário

features_df = train_df.drop(columns=[target_col])
baseline_local = "baseline_features.csv"

# amostra para baseline para ser mais rápido
features_df.sample(n=min(len(features_df), 2000), random_state=42).to_csv(baseline_local, index=False)

print("baseline local saved:", baseline_local, "shape:", features_df.shape)

baseline_s3_uri = sm_session.upload_data(
    baseline_local,
    bucket=bucket,
    key_prefix=f"{prefix}/monitoring/baseline"
)
print("Baseline S3 URI:", baseline_s3_uri)

baseline local saved: baseline_features.csv shape: (26329, 31)
Baseline S3 URI: s3://i32419/ai-deployment-monitoring-grupo-5/monitoring/baseline/baseline_features.csv


## Criar baseline statistics/constraints + schedule (DefaultModelMonitor)

In [22]:
monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

baseline_output_s3 = f"s3://{bucket}/{prefix}/monitoring/baseline-output"

monitor.suggest_baseline(
    baseline_dataset=baseline_s3_uri,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_output_s3,
    wait=True,
)

print("Baselining job:", monitor.latest_baselining_job.job_name)
print("Baseline output:", baseline_output_s3)

schedule_name = "aidm-grupo-5-loan-default-dataquality"

monitor_output_s3 = f"s3://{bucket}/{prefix}/monitoring/executions"

monitor.create_monitoring_schedule(
    monitor_schedule_name=schedule_name,
    endpoint_input=EndpointInput(
        endpoint_name=endpoint_name,
        destination="/opt/ml/processing/input"
    ),
    output_s3_uri=monitor_output_s3,
    statistics=monitor.baseline_statistics(),
    constraints=monitor.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.hourly(),
)

print("Created monitoring schedule:", schedule_name)
print("Monitoring outputs:", monitor_output_s3)

INFO:sagemaker:Creating processing-job with name baseline-suggestion-job-2026-01-12-03-58-17-776


.................[34m2026-01-12 04:01:06.087609: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory[0m
[34m2026-01-12 04:01:06.087644: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.[0m
[34m2026-01-12 04:01:07.510923: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory[0m
[34m2026-01-12 04:01:07.510958: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)[0m
[34m2026-01-12 04:01:07.510980: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-0-151-130.eu-west-1.compute.internal): /proc/driver/nvidia

INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: aidm-grupo-5-loan-default-dataquality


Created monitoring schedule: aidm-grupo-5-loan-default-dataquality
Monitoring outputs: s3://i32419/ai-deployment-monitoring-grupo-5/monitoring/executions


## Simular drift (gerar tráfego “normal” e “drifted”)

In [23]:
runtime_sm_client = boto3.client(service_name='sagemaker-runtime')

df = pd.read_csv("data/validation.csv")
X_val = df.drop(columns=["Status"])

def invoke_csv_batch(batch_df: pd.DataFrame):
    payload = batch_df.to_csv(index=False)
    resp = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="text/csv",
        Body=payload.encode("utf-8"),
    )
    return resp["Body"].read().decode("utf-8")

# 200 linhas normais (4 batches de 50)
for i in range(0, 200, 50):
    batch = X_val.iloc[i:i+50]
    _ = invoke_csv_batch(batch)
    time.sleep(0.2)

print("Sent 200 normal rows.")

# 200 linhas com drift (altera features numéricas)
drift_df = X_val.iloc[:200].copy()
num_cols = drift_df.select_dtypes(include=["number"]).columns
drift_df[num_cols] = drift_df[num_cols] * 3.0 + 100.0

for i in range(0, 200, 50):
    batch = drift_df.iloc[i:i+50]
    _ = invoke_csv_batch(batch)
    time.sleep(0.2)

print("Sent 200 drifted rows.")

Sent 200 normal rows.
Sent 200 drifted rows.


## Verificar execuções do monitor e violações

In [16]:
executions = monitor.list_executions()
print("Executions:", len(executions))
if executions:
    latest = executions[-1]
    desc = latest.describe()
    print("Latest status:", desc.get("ProcessingJobStatus"))
    print("Exit message:", desc.get("ExitMessage"))
    print("Output S3:", latest.output.destination)