# Feature / Data Drift Monitoring

In [1]:
%store

Stored variables and their in-db values:
account_id                                -> '607916531205'
baseline_model_logistic_path              -> 'baseline_model_logistic.pkl'
baseline_model_path                       -> 'baseline_model.pkl'
create_base_csv_athena_db                 -> True
create_base_csv_athena_table              -> True
database_name                             -> 'db_airline_delay_cause'
dev_feature_group_name                    -> 'airline_delay_features_dev'
dev_feature_store_table                   -> 'airline_delay_features_dev_1740273029'
dev_s3_path                               -> 's3://sagemaker-us-east-1-607916531205/data/develo
dev_s3_uri                                -> 's3://sagemaker-us-east-1-607916531205/feature-sto
dev_table_name                            -> 'development_data'
endpoint_name_batch_transform             -> 'flight-delay-xgboost-endpoint-with-batch-transfor
endpoint_name_single_request              -> 'flight-delay-xgboost-endpoint-s

### Data Capture Validation

In [2]:
import boto3
import sagemaker

# ✅ Retrieve stored variables
%store -r endpoint_name_single_request
%store -r s3_staging_dir
%store -r role  

# ✅ Initialize AWS session
session = boto3.session.Session()
sagemaker_session = sagemaker.Session()

# ✅ Reinitialize `bucket`
bucket = sagemaker_session.default_bucket()

# ✅ Initialize S3 client
s3_client = boto3.client("s3")

# ✅ Ensure `endpoint_name_single_request` exists before using it
if 'endpoint_name_single_request' not in locals() or not endpoint_name_single_request:
    print("⚠️ `endpoint_name_single_request` is not set. Make sure 05-train-and-deploy.ipynb was run successfully.")
else:
    # ✅ Define S3 prefix for captured data dynamically
    data_capture_prefix = f"data_capture/{endpoint_name_single_request}/AllTraffic/"

    # ✅ List objects in the S3 data capture folder
    capture_files = s3_client.list_objects_v2(Bucket=bucket, Prefix=data_capture_prefix)

    if "Contents" in capture_files:
        print("✅ JSON files available in data capture S3 path:")
        for obj in capture_files["Contents"]:
            print(f"- {obj['Key']}")
    else:
        print("⚠️ No JSON files found. Ensure the endpoint is receiving inference requests.")


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
✅ JSON files available in data capture S3 path:
- data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/2025/02/23/01/55-07-422-b3e164c6-4691-462b-ab06-b3aa78bfaf9c.jsonl


## Obtain Samples from Dev Feature Store

In [3]:
import pandas as pd
import boto3
from pyathena import connect

# ✅ Retrieve stored variables
%store -r dev_feature_store_table
%store -r s3_staging_dir
%store -r region

# ✅ Define Athena query to fetch validation data from the feature store
query = f"""
SELECT * FROM "sagemaker_featurestore"."{dev_feature_store_table}"
LIMIT 1000  -- Adjust the limit if needed
"""

# ✅ Connect to Athena and execute query
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)
validation_data = pd.read_sql(query, conn)

# ✅ Display the first few rows to confirm correct loading
print(validation_data.head())
print(f"Validation data shape: {validation_data.shape}")


  validation_data = pd.read_sql(query, conn)


     event_time  year  month carrier airport  arr_flights  arr_del15  \
0  1.740273e+09  2015      5      OO     SMX           63         22   
1  1.740273e+09  2013      4      DL     MYR           29          5   
2  1.740273e+09  2015      5      OO     FWA          341         56   
3  1.740273e+09  2008      6      OO     CHS           10          6   
4  1.740273e+09  2008      6      OO     CPR          167         21   

   carrier_ct  weather_ct  nas_ct  ...  weather_delay  nas_delay  \
0           2           0       1  ...              0         32   
1           3           0       0  ...             26         57   
2          20           2       8  ...            195        336   
3           1           0       2  ...            114        142   
4           6           0       6  ...              0        282   

   security_delay  late_aircraft_delay  delay_rate  on_time  record_id  \
0               0                  998          34        0     123259   
1         

In [6]:
from sklearn.preprocessing import LabelEncoder

# ✅ Define subset size
subset_size = 50  # Number of rows to send to the endpoint

# ✅ Retrieve categorical columns that need encoding
categorical_columns = ["carrier", "airport"]

# ✅ Create label encoders and fit on the validation data
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    validation_data[col] = le.fit_transform(validation_data[col])  # Apply encoding
    label_encoders[col] = le

# ✅ Define columns to exclude (target + metadata)
columns_to_exclude = [
    "delay_rate", "on_time",  # Target variables
    "record_id", "write_time", "api_invocation_time", "is_deleted"  # Metadata columns
]

# ✅ Select only valid feature columns
feature_columns = [col for col in validation_data.columns if col not in columns_to_exclude]

# ✅ Select subset of validation data with correct features
subset_data = validation_data[feature_columns].iloc[:subset_size]

# ✅ Convert to CSV-like strings for SageMaker endpoint
subset_data_str_list = subset_data.apply(lambda row: ",".join(row.astype(str)), axis=1).tolist()

print(f"✅ Prepared {len(subset_data_str_list)} rows with label-encoded categorical features for inference.")


✅ Prepared 50 rows with label-encoded categorical features for inference.


## Create predictor endpoint and send data to endpoint

In [7]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# ✅ Retrieve stored endpoint name
%store -r endpoint_name_single_request

# ✅ Check if endpoint_name exists before proceeding
if not endpoint_name_single_request:
    print("⚠️ Endpoint name is missing. Ensure 05-train-and-deploy.ipynb was run successfully.")
else:
    # ✅ Initialize the endpoint predictor dynamically
    predictor = Predictor(
        endpoint_name=endpoint_name_single_request,  # Now using stored variable
        sagemaker_session=sagemaker.Session(),
        serializer=CSVSerializer(),
        deserializer=JSONDeserializer(),
    )

    # ✅ Send each row to the endpoint (with error handling)
    responses = []

    if not subset_data_str_list:
        print("⚠️ No data available to send for inference. Ensure validation data was properly loaded.")
    else:
        for i, row_str in enumerate(subset_data_str_list):
            try:
                response = predictor.predict(row_str)  # Send to the endpoint
                responses.append(response)
                
                # ✅ Print only the first 5 rows to avoid flooding logs
                if i < 5:
                    print(f"🟢 Input: {row_str}")
                    print(f"🔵 Prediction: {response}")
            except Exception as e:
                print(f"❌ Error making prediction for row {i}: {str(e)}")

        print(f"✅ Finished sending {len(responses)} rows to endpoint.")


🟢 Input: 1740273202.9792993,2015.0,5.0,6.0,217.0,63.0,22.0,2.0,0.0,1.0,0.0,18.0,0.0,0.0,1269.0,239.0,0.0,32.0,0.0,998.0
🔵 Prediction: {'predictions': [{'score': 1.0346633644076064e-05}]}
🟢 Input: 1740273202.9792993,2013.0,4.0,3.0,163.0,29.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,269.0,168.0,26.0,57.0,0.0,18.0
🔵 Prediction: {'predictions': [{'score': 1.1478582564450335e-05}]}
🟢 Input: 1740273202.9792993,2015.0,5.0,6.0,87.0,341.0,56.0,20.0,2.0,8.0,0.0,24.0,3.0,0.0,3001.0,1207.0,195.0,336.0,0.0,1263.0
🔵 Prediction: {'predictions': [{'score': 4.337779500929173e-06}]}
🟢 Input: 1740273202.9792993,2008.0,6.0,6.0,46.0,10.0,6.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,617.0,40.0,114.0,142.0,0.0,321.0
🔵 Prediction: {'predictions': [{'score': 7.515838206018088e-06}]}
🟢 Input: 1740273202.9792993,2008.0,6.0,6.0,55.0,167.0,21.0,6.0,0.0,6.0,0.0,8.0,1.0,0.0,1686.0,433.0,0.0,282.0,0.0,971.0
🔵 Prediction: {'predictions': [{'score': 4.337779500929173e-06}]}
✅ Finished sending 50 rows to endpoint.


## Validate that JSONL files were captured

In [8]:
# ✅ Use already initialized session, bucket, and S3 client
data_capture_prefix = f"data_capture/{endpoint_name_single_request}/AllTraffic/"

# ✅ List files in S3 data capture path
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=data_capture_prefix)

# ✅ Check if "Contents" exists in response before accessing it
if "Contents" in response and response["Contents"]:
    print(f"✅ Files found in S3 data capture path ({data_capture_prefix}):")

    # ✅ Print only the first 5 files to avoid excessive output
    for obj in response["Contents"][:5]:  
        print(f"📄 {obj['Key']}")

    print(f"🟢 ...and {len(response['Contents']) - 5} more files" if len(response["Contents"]) > 5 else "✅ All files listed.")
else:
    print(f"⚠️ No files found in S3 data capture path: {data_capture_prefix}")


✅ Files found in S3 data capture path (data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/):
📄 data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/2025/02/23/01/55-07-422-b3e164c6-4691-462b-ab06-b3aa78bfaf9c.jsonl
📄 data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/2025/02/23/04/58-47-981-b165b78d-2f81-4bd3-b5ae-23a79697e023.jsonl
✅ All files listed.


#### You should now be able to see your data in cloudwatch
- Go to Cloudwatch -> Logs -> Log Insights
- search `/aws/sagemaker/Endpoints/flight-delay-xgboost-endpoint-single-request`

## Initialize Baseline Monitoring Setup

In [10]:
from sagemaker.model_monitor import DefaultModelMonitor, DatasetFormat

# ✅ Define Required Variables
baseline_results_uri = f"s3://{bucket}/baseline_results/"  # Standardized path
baseline_dataset_path = f"s3://{bucket}/data_capture/{endpoint_name_single_request}/AllTraffic/"

print(f"📂 Baseline dataset path: {baseline_dataset_path}")
print(f"📂 Baseline results URI: {baseline_results_uri}")

# ✅ Store paths for later use
%store baseline_results_uri
%store baseline_dataset_path
print("✅ Stored baseline paths in %store.")

# ✅ Initialize the model monitor (but do NOT store it)
model_monitor = DefaultModelMonitor(
    role=role,  # ✅ Fixed: Use correct role variable from %store
    instance_type="ml.m5.xlarge",
    instance_count=1,
    max_runtime_in_seconds=3600,
    sagemaker_session=sagemaker_session
)
print("✅ Model monitor initialized. (Not stored in %store, will be re-created when needed)")


📂 Baseline dataset path: s3://sagemaker-us-east-1-607916531205/data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/
📂 Baseline results URI: s3://sagemaker-us-east-1-607916531205/baseline_results/
Stored 'baseline_results_uri' (str)
Stored 'baseline_dataset_path' (str)
✅ Stored baseline paths in %store.
✅ Model monitor initialized. (Not stored in %store, will be re-created when needed)


## Run baseline suggestion job

In [20]:
# ✅ Send another 50 inference requests
responses = []
for row_str in subset_data_str_list:
    response = predictor.predict(row_str)
    responses.append(response)

print("✅ Finished sending 50 additional rows. Wait 2-3 minutes before checking data capture in S3.")


✅ Finished sending 50 additional rows. Wait 2-3 minutes before checking data capture in S3.


In [24]:
response = s3_client.list_objects_v2(Bucket=bucket, Prefix="data_capture/")
if "Contents" in response:
    print(f"✅ Found {len(response['Contents'])} JSONL files in S3:")
    for obj in response["Contents"][:5]:  # Show first 5 files
        print(f"📄 {obj['Key']}")
else:
    print("⚠️ No new JSONL files found yet. Wait a little longer or resend inferences.")


✅ Found 4 JSONL files in S3:
📄 data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/2025/02/23/01/55-07-422-b3e164c6-4691-462b-ab06-b3aa78bfaf9c.jsonl
📄 data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/2025/02/23/04/58-47-981-b165b78d-2f81-4bd3-b5ae-23a79697e023.jsonl
📄 data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/2025/02/23/05/17-56-062-97d8dfac-98ad-422f-b0b8-e698c22ff525.jsonl
📄 data_capture/flight-delay-xgboost-endpoint-with-batch-transform/AllTraffic/2025/02/23/02/06-28-282-7ba30264-befc-49d0-80af-7061a2521244.jsonl


In [28]:
import json
import pandas as pd
import boto3

# ✅ Find the latest JSONL file dynamically
response = s3_client.list_objects_v2(Bucket=bucket, Prefix="data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/")

if "Contents" not in response or not response["Contents"]:
    raise FileNotFoundError("⚠️ No JSONL files found. Ensure inference data is being captured.")

# ✅ Get the latest file by sorting timestamps
latest_file = sorted(response["Contents"], key=lambda x: x["LastModified"], reverse=True)[0]["Key"]

print(f"✅ Using latest JSONL file: {latest_file}")

# ✅ Download the latest JSONL file
response = s3_client.get_object(Bucket=bucket, Key=latest_file)
file_content = response["Body"].read().decode("utf-8").splitlines()

# ✅ Extract only the input feature values from each entry
feature_data = []
for line in file_content:
    entry = json.loads(line)  # Convert JSON string to dict
    if "captureData" in entry and "endpointInput" in entry["captureData"]:
        raw_features = entry["captureData"]["endpointInput"]["data"]
        feature_data.append(raw_features.split(","))  # Convert CSV-like string to list

# ✅ Convert extracted data to a DataFrame
df = pd.DataFrame(feature_data)

# ✅ Save locally as CSV
fixed_csv_path = "fixed_baseline.csv"
df.to_csv(fixed_csv_path, index=False, header=False)

print(f"✅ Processed {len(df)} records and saved as CSV.")

# ✅ Upload the processed CSV to S3
fixed_s3_path = f"{baseline_dataset_path}fixed_baseline.csv"
s3_client.upload_file(fixed_csv_path, bucket, fixed_s3_path.replace(f"s3://{bucket}/", ""))

print(f"✅ Uploaded fixed CSV file to {fixed_s3_path}.")



✅ Using latest JSONL file: data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/2025/02/23/05/17-56-062-97d8dfac-98ad-422f-b0b8-e698c22ff525.jsonl
✅ Processed 50 records and saved as CSV.
✅ Uploaded fixed CSV file to s3://sagemaker-us-east-1-607916531205/data_capture/flight-delay-xgboost-endpoint-single-request/AllTraffic/fixed_baseline.csv.


In [29]:
import boto3

try:
    print("🔍 Checking if processed baseline CSV exists in S3...")

    # ✅ Define the fixed CSV file path in S3
    fixed_s3_path = f"{baseline_dataset_path}fixed_baseline.csv"

    # ✅ Check if the CSV file exists in S3
    response = s3_client.list_objects_v2(Bucket=bucket, Prefix=fixed_s3_path.replace(f"s3://{bucket}/", ""))

    if "Contents" not in response or not response["Contents"]:
        raise FileNotFoundError(f"⚠️ No processed CSV found at {fixed_s3_path}. Ensure JSONL was converted and uploaded.")

    print("✅ Baseline CSV found. Running baseline suggestion job...")

    # ✅ Run baseline generation using CSV format
    model_monitor.suggest_baseline(
        baseline_dataset=fixed_s3_path,  
        dataset_format=DatasetFormat.csv(header=False), 
        output_s3_uri=baseline_results_uri,  # Where baseline results will be stored
        wait=True  # Wait for job completion
    )

    print(f"✅ Baseline job completed. Results saved at: {baseline_results_uri}")

except FileNotFoundError as fe:
    print(str(fe))  # Print specific missing file error
except Exception as e:
    print(f"❌ Error running baseline suggestion job: {e}")
    raise  # Re-raise the error for debugging


INFO:sagemaker:Creating processing-job with name baseline-suggestion-job-2025-02-23-05-25-05-194


🔍 Checking if processed baseline CSV exists in S3...
✅ Baseline CSV found. Running baseline suggestion job...
............[34m2025-02-23 05:27:04.405246: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory[0m
[34m2025-02-23 05:27:04.405277: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.[0m
[34m2025-02-23 05:27:05.980301: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory[0m
[34m2025-02-23 05:27:05.980330: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)[0m
[34m2025-02-23 05:27:05.980349: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver doe

### Verify Baseline Results

In [33]:
# ✅ Check the size of statistics.json and constraints.json in S3
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=baseline_results_uri.replace(f"s3://{bucket}/", ""))

if "Contents" in response:
    for obj in response["Contents"]:
        file_name = obj["Key"]
        file_size = obj["Size"]  # Size in bytes
        print(f"📂 {file_name} - {file_size} bytes")

        if file_size == 0:
            print(f"⚠️ WARNING: {file_name} is empty! The baseline job may not have written data correctly.")


📂 baseline_results/constraints.json - 3519 bytes
📂 baseline_results/statistics.json - 41918 bytes


In [39]:
import boto3

# ✅ Define S3 path
baseline_statistics_path = f"{baseline_results_uri}statistics.json"
print("Baseline statistics stored at:", baseline_statistics_path)

# ✅ Try downloading `statistics.json` using `get_object()` instead
try:
    response = s3_client.get_object(Bucket=bucket, Key=baseline_statistics_path.replace(f"s3://{bucket}/", ""))
    with open("statistics.json", "wb") as f:
        f.write(response["Body"].read())
    print("✅ statistics.json successfully retrieved using `get_object()`.")
except Exception as e:
    print(f"❌ Failed to retrieve statistics.json: {e}")


INFO:botocore.httpchecksum:Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


Baseline statistics stored at: s3://sagemaker-us-east-1-607916531205/baseline_results/statistics.json
✅ statistics.json successfully retrieved using `get_object()`.


In [40]:
import boto3

# ✅ Define S3 path
baseline_constraints_path = f"{baseline_results_uri}constraints.json"
print("Baseline constraints stored at:", baseline_constraints_path)

# ✅ Try downloading `constraints.json` using `get_object()`
try:
    response = s3_client.get_object(Bucket=bucket, Key=baseline_constraints_path.replace(f"s3://{bucket}/", ""))
    with open("constraints.json", "wb") as f:
        f.write(response["Body"].read())
    print("✅ constraints.json successfully retrieved using `get_object()`.")
except Exception as e:
    print(f"❌ Failed to retrieve constraints.json: {e}")


INFO:botocore.httpchecksum:Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].


Baseline constraints stored at: s3://sagemaker-us-east-1-607916531205/baseline_results/constraints.json
✅ constraints.json successfully retrieved using `get_object()`.


In [42]:
import json

# ✅ Load baseline statistics
with open("statistics.json", "r") as stats_file:
    baseline_statistics = json.load(stats_file)

# ✅ Extract drift thresholds (mean + 2 * std_dev)
drift_thresholds = {}
for feature in baseline_statistics["features"]:
    feature_name = feature["name"]
    mean = feature["numerical_statistics"]["mean"]
    std_dev = feature["numerical_statistics"]["std_dev"]
    drift_thresholds[feature_name] = mean + (2 * std_dev)  # ✅ Mean + 2*std_dev threshold

# ✅ Print only a sample of drift thresholds (first 5)
print("✅ Sample Drift Thresholds:", dict(list(drift_thresholds.items())[:5]))
print(f"✅ Total features with drift thresholds: {len(drift_thresholds)}")

# ✅ Load baseline constraints
with open("constraints.json", "r") as constraints_file:
    baseline_constraints = json.load(constraints_file)

# ✅ Print summary of constraints instead of full JSON
print(f"✅ Constraints contain {len(baseline_constraints['features'])} feature checks.")

# ✅ Store the extracted thresholds for later use
%store drift_thresholds
print("✅ Stored drift thresholds in %store for CloudWatch Alarm.")


✅ Sample Drift Thresholds: {'_c0': 1740273202.9792986, '_c1': 2017.4840923430359, '_c2': 8.937354648979131, '_c3': 10.647831917092148, '_c4': 264.0408222704953}
✅ Total features with drift thresholds: 20
✅ Constraints contain 20 feature checks.
Stored 'drift_thresholds' (dict)
✅ Stored drift thresholds in %store for CloudWatch Alarm.


## Make CloudWatch Alarm

In [97]:
import numpy as np

# ✅ Retrieve stored drift thresholds
%store -r drift_thresholds

# ✅ Remove '_c0' and any timestamp-like values
filtered_thresholds = {k: v for k, v in drift_thresholds.items() if k != "_c0" and v < 1e6}

# ✅ Compute the 90th percentile threshold (if at least one valid feature remains)
if filtered_thresholds:
    threshold_value = np.percentile(list(filtered_thresholds.values()), 90)
    # ✅ Identify which feature corresponds to this threshold
    selected_feature = max(filtered_thresholds, key=lambda k: abs(filtered_thresholds[k] - threshold_value))
else:
    threshold_value = 1.0  # Default fallback value
    selected_feature = "No valid feature found"

# ✅ Initialize CloudWatch client
cw_client = boto3.client("cloudwatch")

# ✅ Define CloudWatch Alarm
model_drift_alarm_name = "FlightDelayModelDriftAlarm"

response = cw_client.put_metric_alarm(
    AlarmName=model_drift_alarm_name,
    AlarmDescription="Triggers when model drift is detected in the monitoring schedule.",
    ActionsEnabled=True,
    MetricName="FeatureAttributeDrift",
    Namespace="aws/sagemaker/Endpoints/model-metrics",
    Statistic="Average",
    Dimensions=[
        {"Name": "Endpoint", "Value": "flight-delay-xgboost-endpoint-single-request"},
        {"Name": "MonitoringSchedule", "Value": "FlightDelayMonitor"},
    ],
    Period=3600,  # Every hour
    EvaluationPeriods=1,
    DatapointsToAlarm=1,
    Threshold=threshold_value,  # ✅ Now using a corrected 90th percentile threshold
    ComparisonOperator="GreaterThanThreshold",
    TreatMissingData="breaching",
)

print(f"✅ CloudWatch Alarm '{model_drift_alarm_name}' created successfully with threshold {threshold_value}.")
print(f"🚀 The feature '{selected_feature}' has the 90th percentile drift threshold: {threshold_value}")

%store model_drift_alarm_name
print("✅ Stored model_drift_alarm_name in %store.")


✅ CloudWatch Alarm 'FlightDelayModelDriftAlarm' created successfully with threshold 13947.141033916032.
🚀 The feature '_c14' has the 90th percentile drift threshold: 13947.141033916032
Stored 'model_drift_alarm_name' (str)
✅ Stored model_drift_alarm_name in %store.


In [51]:
# ✅ Define feature names manually (from the feature store)
feature_names = [
    "year", "month", "carrier", "airport", "arr_flights", "arr_del15", "carrier_ct", "weather_ct",
    "nas_ct", "security_ct", "late_aircraft_ct", "arr_cancelled", "arr_diverted", "arr_delay",
    "carrier_delay", "weather_delay", "nas_delay", "security_delay", "late_aircraft_delay",
    "delay_rate", "on_time", "event_time", "record_id"
]

# ✅ Ensure drift_thresholds uses proper feature names
%store -r drift_thresholds

# ✅ Map `_c14` to actual feature name
mapped_feature_name = feature_names[14]  # Index 14 in the actual dataset
print(f"🚀 `_c14` corresponds to feature: {mapped_feature_name}")


🚀 `_c14` corresponds to feature: carrier_delay


## Setup Sage Model Monitor

In [73]:
import boto3
import sagemaker
from sagemaker.model_monitor import DefaultModelMonitor, CronExpressionGenerator, EndpointInput

# ✅ Define S3 paths
monitoring_results_uri = f"s3://{bucket}/monitoring_results/"

print(f"Baseline Results Path: {baseline_results_uri}")
print(f"Monitoring Results Path: {monitoring_results_uri}")

# ✅ Configure the model monitor
model_monitor = DefaultModelMonitor(
    role=role,  # ✅ Use role from %store
    instance_type="ml.m5.xlarge",
    instance_count=1,
    max_runtime_in_seconds=3600,
    sagemaker_session=sagemaker_session,
)

# ✅ Define Endpoint Input (without dataset_format)
endpoint_input = EndpointInput(
    endpoint_name="flight-delay-xgboost-endpoint-single-request",
    destination="/opt/ml/processing/input"
)

# ✅ Monitoring Schedule Name
flight_delay_drift_monitor_schedule_name = "FlightDelayMonitor"

# ✅ Check if the schedule already exists (to avoid errors)
sm_client = boto3.client("sagemaker")

existing_schedules = sm_client.list_monitoring_schedules()["MonitoringScheduleSummaries"]
schedule_names = [schedule["MonitoringScheduleName"] for schedule in existing_schedules]

if flight_delay_drift_monitor_schedule_name in schedule_names:
    print(f"⚠️ Monitoring schedule '{flight_delay_drift_monitor_schedule_name}' already exists. Skipping creation.")
else:
    # ✅ Schedule Data Monitoring (Every Hour)
    model_monitor.create_monitoring_schedule(
        monitor_schedule_name=flight_delay_drift_monitor_schedule_name,
        endpoint_input=endpoint_input,  # ✅ Fixed: Properly pass EndpointInput
        output_s3_uri=monitoring_results_uri,
        schedule_cron_expression=CronExpressionGenerator.hourly(),
        enable_cloudwatch_metrics=True,
    )
    print(f"✅ Monitoring schedule created for endpoint: flight-delay-xgboost-endpoint-single-request")

# ✅ Store the monitoring schedule names for later use
%store flight_delay_drift_monitor_schedule_name
print("✅ Stored Drift monitor schedule name in %store.")


INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: .
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Baseline Results Path: s3://sagemaker-us-east-1-607916531205/baseline_results/
Monitoring Results Path: s3://sagemaker-us-east-1-607916531205/monitoring_results/
⚠️ Monitoring schedule 'FlightDelayMonitor' already exists. Skipping creation.
Stored 'flight_delay_drift_monitor_schedule_name' (str)
✅ Stored Drift monitor schedule name in %store.


## Send Fake Data To Trigger Alarm

In [55]:
cw_client.set_alarm_state(
    AlarmName=alarm_name,
    StateValue="OK",
    StateReason="Resetting alarm to OK state manually."
)
print(f"✅ CloudWatch Alarm '{alarm_name}' reset to OK state.")


✅ CloudWatch Alarm 'FlightDelayModelDriftAlarm' reset to OK state.


In [58]:
import pandas as pd
import random
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sklearn.preprocessing import LabelEncoder

# ✅ Feature columns (ensure these match training data)
columns = [
    "year", "month", "day", "carrier", "airport", 
    "arr_flights", "arr_del15", "carrier_ct", "weather_ct", 
    "nas_ct", "security_ct", "late_aircraft_ct", "arr_cancelled", 
    "arr_diverted", "arr_delay", "carrier_delay", "weather_delay", 
    "nas_delay", "security_delay", "late_aircraft_delay", "delay_rate"
]

# ✅ Generate random fake data
def generate_fake_data(num_rows=500):
    fake_data = []
    for _ in range(num_rows):
        fake_row = [
            random.randint(2000, 2023),  # year
            random.randint(1, 12),  # month
            random.randint(1, 28),  # day
            random.choice(["AA", "DL", "UA", "SW", "AS", "EV"]),  # carrier
            random.choice(["JFK", "LAX", "ORD", "ATL", "DFW", "DEN"]),  # airport
            random.randint(50, 1000),  # arr_flights
            random.randint(0, 200),  # arr_del15
            random.randint(0, 50),  # carrier_ct
            random.randint(0, 50),  # weather_ct
            random.randint(0, 50),  # nas_ct
            random.randint(0, 50),  # security_ct
            random.randint(0, 50),  # late_aircraft_ct
            random.randint(0, 10),  # arr_cancelled
            random.randint(0, 10),  # arr_diverted
            random.uniform(0, 300),  # arr_delay
            random.uniform(0, 100),  # carrier_delay
            random.uniform(0, 100),  # weather_delay
            random.uniform(0, 100),  # nas_delay
            random.uniform(0, 100),  # security_delay
            random.uniform(0, 100),  # late_aircraft_delay
            random.uniform(1.0, 2.0),  # delay_rate
        ]
        fake_data.append(fake_row)
    return pd.DataFrame(fake_data, columns=columns)

# ✅ Write fake data to CSV
fake_data = generate_fake_data(500)  # Generate 500 records
fake_data_file = "fake_flight_data.csv"
fake_data.to_csv(fake_data_file, index=False, header=False)
print(f"✅ Fake data generated and saved to {fake_data_file}")

# ✅ Load fake data for inference
fake_data = pd.read_csv(fake_data_file, header=None)

# ✅ Apply Label Encoding to `carrier` and `airport`
label_encoders = {}
for col_index, col_name in enumerate(["carrier", "airport"]):  
    le = LabelEncoder()
    fake_data[col_index + 3] = le.fit_transform(fake_data[col_index + 3])  # Encoding categorical columns

# ✅ Convert to CSV format for inference
fake_data_str_list = fake_data.apply(lambda row: ",".join(row.astype(str)), axis=1).tolist()

# ✅ Initialize predictor
predictor = Predictor(
    endpoint_name="flight-delay-xgboost-endpoint-single-request",
    sagemaker_session=sagemaker.Session(),
    serializer=CSVSerializer(),
)

# ✅ Send Fake Data for Inference (Print Only First 5)
responses = []
for i, row_str in enumerate(fake_data_str_list):
    response = predictor.predict(row_str)
    responses.append(response)

    # ✅ Print only the first 5 for visibility
    if i < 5:
        print(f"🟢 Input: {row_str}")
        print(f"🔵 Prediction: {response}")

print(f"✅ Sent {len(fake_data_str_list)} rows to the endpoint. Check CloudWatch for drift alerts.")


✅ Fake data generated and saved to fake_flight_data.csv
🟢 Input: 2007.0,11.0,2.0,1.0,3.0,413.0,83.0,3.0,11.0,6.0,27.0,0.0,0.0,3.0,19.38304602789831,53.842418191692886,10.762279422655618,46.16500556744988,30.895848156704897,22.469515756986723,1.186644095262146
🔵 Prediction: b'1.0346633644076064e-05\n'
🟢 Input: 2022.0,6.0,1.0,0.0,5.0,525.0,135.0,39.0,28.0,38.0,30.0,12.0,8.0,7.0,44.23849678625918,33.84863654775507,43.051928988872525,69.22475392469244,16.92049677902927,88.13869527464779,1.702360247056779
🔵 Prediction: b'4.337779500929173e-06\n'
🟢 Input: 2017.0,6.0,20.0,1.0,1.0,283.0,97.0,2.0,45.0,14.0,18.0,30.0,2.0,7.0,165.48589029039456,17.098792401646524,70.19742395014109,0.6440027865475373,40.66715806131285,78.28079551855637,1.9780693393943585
🔵 Prediction: b'4.337779500929173e-06\n'
🟢 Input: 2003.0,3.0,15.0,2.0,4.0,571.0,30.0,12.0,22.0,31.0,45.0,32.0,0.0,6.0,17.40086510779658,55.07227099860814,94.09834394912916,2.4375308366564874,2.7927741138372157,45.03554364538015,1.690335063434221
🔵

In [59]:
import boto3

# ✅ Initialize SageMaker client
sm_client = boto3.client("sagemaker")

# ✅ Retrieve monitoring schedule details
schedule_name = "FlightDelayMonitor"
response = sm_client.describe_monitoring_schedule(MonitoringScheduleName=schedule_name)

# ✅ Print status
print(f"🟢 Monitoring Schedule: {schedule_name}")
print(f"   - Status: {response['MonitoringScheduleStatus']}")
print(f"   - Last Run: {response.get('LastMonitoringExecutionSummary', {}).get('CreationTime', 'No runs yet')}")
print(f"   - Next Run: Scheduled every hour")


🟢 Monitoring Schedule: FlightDelayMonitor
   - Status: Scheduled
   - Last Run: No runs yet
   - Next Run: Scheduled every hour


# End of Feature Drift Monitor

# Data Quality Monitoring

In [86]:
import boto3
import sagemaker
from sagemaker.model_monitor import DefaultModelMonitor, CronExpressionGenerator, EndpointInput

# ✅ Define S3 paths
monitoring_results_uri = f"s3://{bucket}/monitoring_results/"

print(f"Baseline Results Path: {baseline_results_uri}")
print(f"Monitoring Results Path: {monitoring_results_uri}")

# ✅ Configure the model monitor (Data Quality)
model_monitor = DefaultModelMonitor(
    role=role,  # ✅ Use role from %store
    instance_type="ml.m5.xlarge",
    instance_count=1,
    max_runtime_in_seconds=3600,
    sagemaker_session=sagemaker_session,
)

# ✅ Define Endpoint Input Correctly (without dataset_format)
endpoint_input = EndpointInput(
    endpoint_name="flight-delay-xgboost-endpoint-single-request",
    destination="/opt/ml/processing/input"
)

# ✅ Monitoring Schedule Name
flight_delay_data_quality_monitor_schedule_name = "FlightDelayDataQualityMonitor"

# ✅ Check if the schedule already exists (to avoid errors)
sm_client = boto3.client("sagemaker")

existing_schedules = sm_client.list_monitoring_schedules()["MonitoringScheduleSummaries"]
schedule_names = [schedule["MonitoringScheduleName"] for schedule in existing_schedules]

if flight_delay_data_quality_monitor_schedule_name in schedule_names:
    print(f"⚠️ Monitoring schedule '{flight_delay_data_quality_monitor_schedule_name}' already exists. Skipping creation.")
else:
    # ✅ Schedule Data Monitoring (Every Hour)
    model_monitor.create_monitoring_schedule(
        monitor_schedule_name=flight_delay_data_quality_monitor_schedule_name,
        endpoint_input=endpoint_input,  # ✅ Properly pass EndpointInput
        output_s3_uri=monitoring_results_uri,
        schedule_cron_expression=CronExpressionGenerator.hourly(),
        enable_cloudwatch_metrics=True,
    )
    print(f"✅ Monitoring schedule created for endpoint: flight-delay-xgboost-endpoint-single-request")

# ✅ Store the second monitor schedule name in %store
%store flight_delay_data_quality_monitor_schedule_name
print("✅ Stored Data Quality monitor schedule name in %store.")


INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: .
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Baseline Results Path: s3://sagemaker-us-east-1-607916531205/baseline_results/
Monitoring Results Path: s3://sagemaker-us-east-1-607916531205/monitoring_results/


INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: FlightDelayDataQualityMonitor


✅ Monitoring schedule created for endpoint: flight-delay-xgboost-endpoint-single-request
Stored 'flight_delay_data_quality_monitor_schedule_name' (str)
✅ Stored Data Quality monitor schedule name in %store.


## Send Fake Data To Trigger Alarm

In [92]:
def generate_extreme_fake_data(num_rows=100):
    fake_data = []
    for _ in range(num_rows):
        fake_row = [
            random.randint(2000, 2023),  # year
            random.randint(1, 12),  # month
            random.randint(1, 28),  # day
            random.choice(["AA", "DL", "UA", "SW", "AS", "EV"]),  # carrier
            random.choice(["JFK", "LAX", "ORD", "ATL", "DFW", "DEN"]),  # airport
            random.randint(50, 1000),  # arr_flights
            random.choice([None, random.randint(0, 200)]),  # arr_del15 (introduce None)
            random.choice([None, random.randint(0, 50)]),  # carrier_ct (introduce None)
            random.choice([None, random.randint(0, 50)]),  # weather_ct (introduce None)
            random.choice([None, random.randint(0, 50)]),  # nas_ct (introduce None)
            random.choice([None, random.randint(0, 50)]),  # security_ct (introduce None)
            random.choice([None, random.randint(0, 50)]),  # late_aircraft_ct (introduce None)
            random.choice([None, random.randint(0, 10)]),  # arr_cancelled (introduce None)
            random.choice([None, random.randint(0, 10)]),  # arr_diverted (introduce None)
            random.uniform(0, 300),  # arr_delay
            random.uniform(0, 100),  # carrier_delay
            random.uniform(0, 100),  # weather_delay
            random.uniform(0, 100),  # nas_delay
            random.uniform(0, 100),  # security_delay
            random.uniform(0, 100),  # late_aircraft_delay
            random.uniform(1.0, 2.0),  # delay_rate
        ]
        
        # Introduce extreme outliers (more extreme than before)
        if random.random() > 0.9:
            fake_row[15] = 10000  # Extreme value for carrier_delay (outlier)
        
        fake_data.append(fake_row)
    return pd.DataFrame(fake_data, columns=columns)

# Generate extreme fake data with issues
fake_data_with_issues = generate_extreme_fake_data(100)  # Generate 100 rows with extreme issues
fake_data_file_with_issues = "fake_flight_data_with_issues.csv"
fake_data_with_issues.to_csv(fake_data_file_with_issues, index=False, header=False)
print(f"✅ Fake data with issues generated and saved to {fake_data_file_with_issues}")

# ✅ Load the generated data
fake_data_with_issues = pd.read_csv(fake_data_file_with_issues, header=None)

# ✅ Apply label encoding
label_encoders = {}
for col_index, col_name in enumerate(["carrier", "airport"]):  
    le = LabelEncoder()
    fake_data_with_issues[col_index + 3] = le.fit_transform(fake_data_with_issues[col_index + 3])  # Encoding categorical columns

# ✅ Convert to CSV format for inference
fake_data_str_list = fake_data_with_issues.apply(lambda row: ",".join(row.astype(str)), axis=1).tolist()

# ✅ Send the data to the endpoint
predictor = Predictor(
    endpoint_name="flight-delay-xgboost-endpoint-single-request",
    sagemaker_session=sagemaker.Session(),
    serializer=CSVSerializer(),
)

responses = []
for i, row_str in enumerate(fake_data_str_list):
    response = predictor.predict(row_str)
    responses.append(response)
    if i < 5:
        print(f"🟢 Input: {row_str}")
        print(f"🔵 Prediction: {response}")

print(f"✅ Sent {len(fake_data_str_list)} rows to the endpoint. Check CloudWatch for drift alerts.")


✅ Fake data with issues generated and saved to fake_flight_data_with_issues.csv
🟢 Input: 2013.0,8.0,15.0,2.0,3.0,730.0,nan,49.0,nan,44.0,27.0,nan,7.0,3.0,214.91671084684117,10000.0,21.33190081813193,63.97851559474853,71.42439276207578,12.967931724957072,1.652889545147565
🔵 Prediction: b'0.0022950877901166677\n'
🟢 Input: 2009.0,10.0,2.0,5.0,3.0,924.0,nan,42.0,8.0,27.0,38.0,nan,3.0,nan,157.8906414584232,14.401232785611771,2.009893107436067,96.31735185992108,48.21721503635166,6.516452910099446,1.5685100601887108
🔵 Prediction: b'0.0022950877901166677\n'
🟢 Input: 2013.0,8.0,16.0,5.0,2.0,57.0,54.0,nan,50.0,11.0,nan,6.0,nan,nan,278.2355559052447,80.65517873108026,80.59374894485404,95.87863404295616,79.5469161478308,16.17224497405172,1.7182541234414552
🔵 Prediction: b'1.0346633644076064e-05\n'
🟢 Input: 2009.0,4.0,12.0,1.0,2.0,942.0,8.0,nan,nan,nan,34.0,44.0,7.0,nan,206.40601430319973,33.28253542549483,68.47700015995088,36.32202356467392,94.61250398637748,92.75321425830704,1.9417928078546056
🔵 

# End of Data Quality Monitor

# Infrastructure Monitor

In [77]:
import boto3

# ✅ Initialize CloudWatch client
cw_client = boto3.client("cloudwatch")

# ✅ Define alarm names and thresholds
alarm_name_latency = "FlightDelayEndpointLatencyAlarm"
alarm_name_failure = "FlightDelayEndpointFailureAlarm"
latency_threshold = 2.0  # 2 seconds for latency alarm
failure_threshold = 0.05  # 5% failure rate

# ✅ Create Alarm for High Latency (e.g., if response time exceeds 2 seconds)
response_latency = cw_client.put_metric_alarm(
    AlarmName=alarm_name_latency,
    AlarmDescription="Triggers when endpoint latency exceeds 2 seconds.",
    ActionsEnabled=True,
    MetricName="ModelLatency",
    Namespace="aws/sagemaker/Endpoints",
    Statistic="Average",
    Dimensions=[
        {"Name": "Endpoint", "Value": "flight-delay-xgboost-endpoint-single-request"}
    ],
    Period=60,  # Check every minute
    EvaluationPeriods=1,
    DatapointsToAlarm=1,
    Threshold=latency_threshold,  # 2 seconds
    ComparisonOperator="GreaterThanThreshold",
    TreatMissingData="notBreaching"
)

# ✅ Create Alarm for Invocation Failures (e.g., if failures exceed 5%)
response_failure = cw_client.put_metric_alarm(
    AlarmName=alarm_name_failure,
    AlarmDescription="Triggers when endpoint invocation failures exceed 5%.",
    ActionsEnabled=True,
    MetricName="InvocationFailures",
    Namespace="aws/sagemaker/Endpoints",
    Statistic="Sum",
    Dimensions=[
        {"Name": "Endpoint", "Value": "flight-delay-xgboost-endpoint-single-request"}
    ],
    Period=60,  # Check every minute
    EvaluationPeriods=1,
    DatapointsToAlarm=1,
    Threshold=failure_threshold,  # 5% failure rate
    ComparisonOperator="GreaterThanThreshold",
    TreatMissingData="notBreaching"
)

print("✅ Infrastructure monitoring alarms for latency and failures created successfully.")

# ✅ Store infrastructure monitoring data in %store
%store alarm_name_latency
%store alarm_name_failure
%store latency_threshold
%store failure_threshold

print("✅ Stored infrastructure monitoring data in %store.")


✅ Infrastructure monitoring alarms for latency and failures created successfully.
Stored 'alarm_name_latency' (str)
Stored 'alarm_name_failure' (str)
Stored 'latency_threshold' (float)
Stored 'failure_threshold' (float)
✅ Stored infrastructure monitoring data in %store.


# Clean up script

In [17]:
import boto3
from time import sleep

# Initialize SageMaker client
sagemaker_client = boto3.client("sagemaker")

# Define resources to clean up
endpoint_name = "flight-delay-xgboost-endpoint-single-request"
monitoring_schedule_name = "ModelMonitorForXGBoost"  # Adjust if the schedule name is different
endpoint_config_name = f"{endpoint_name}-config"

# Step 1: Delete Monitoring Schedules
try:
    # List all monitoring schedules associated with the endpoint
    print(f"Listing monitoring schedules for endpoint: {endpoint_name}")
    monitoring_schedules = sagemaker_client.list_monitoring_schedules()['MonitoringScheduleSummaries']
    for schedule in monitoring_schedules:
        if schedule['EndpointName'] == endpoint_name:
            print(f"Deleting monitoring schedule: {schedule['MonitoringScheduleName']}")
            sagemaker_client.delete_monitoring_schedule(MonitoringScheduleName=schedule['MonitoringScheduleName'])
            sleep(10)  # Allow time for the monitoring schedule deletion process
    print("All monitoring schedules deleted.")
except sagemaker_client.exceptions.ResourceNotFound as e:
    print("Monitoring schedule not found. Skipping deletion.")
except Exception as e:
    print(f"Error while deleting monitoring schedules: {e}")
    raise

# Step 2: Delete Endpoint
try:
    print(f"Deleting endpoint: {endpoint_name}")
    sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
    print(f"Endpoint '{endpoint_name}' deleted.")
except sagemaker_client.exceptions.ResourceNotFound as e:
    print(f"Endpoint '{endpoint_name}' not found. Skipping deletion.")
except Exception as e:
    print(f"Error while deleting endpoint: {e}")
    raise

# Step 3: Delete Endpoint Configuration
try:
    print(f"Deleting endpoint configuration: {endpoint_config_name}")
    sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
    print(f"Endpoint configuration '{endpoint_config_name}' deleted.")
except sagemaker_client.exceptions.ResourceNotFound as e:
    print(f"Endpoint configuration '{endpoint_config_name}' not found. Skipping deletion.")
except Exception as e:
    print(f"Error while deleting endpoint configuration: {e}")
    raise

Listing monitoring schedules for endpoint: flight-delay-xgboost-endpoint-single-request
Deleting monitoring schedule: FlightDelayMonitor
All monitoring schedules deleted.
Deleting endpoint: flight-delay-xgboost-endpoint-single-request
Endpoint 'flight-delay-xgboost-endpoint-single-request' deleted.
Deleting endpoint configuration: flight-delay-xgboost-endpoint-single-request-config
Error while deleting endpoint configuration: An error occurred (ValidationException) when calling the DeleteEndpointConfig operation: Could not find endpoint configuration "flight-delay-xgboost-endpoint-single-request-config".


ClientError: An error occurred (ValidationException) when calling the DeleteEndpointConfig operation: Could not find endpoint configuration "flight-delay-xgboost-endpoint-single-request-config".