## Data Drift Monitoring with NannyML
Estimate model performance and detect univariate data drift for the alcoholism prediction model using NannyML.

In [None]:
# Enable auto-reloading of modules so that edits to imported project code are
# picked up without restarting the kernel.
# These are IPython magics: they only work inside a notebook/IPython session.
%load_ext autoreload
%autoreload 2

In [None]:
# Import required modules
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably where the DSML package lives — confirm against the notebook's location).
sys.path.append(os.path.abspath(".."))

from DSML.config import RAW_DATA_DIR, categorical, target
from DSML.preproc import get_raw_data

In [None]:
# Load original dataset
df = get_raw_data()

# Split target and features.
# `pop` removes the column from `df` in place, so `X` (an alias of `df`, not a
# copy) holds only the remaining columns afterwards.
# NOTE(review): the column name is hard-coded here while later cells use
# `target` from DSML.config — confirm both refer to the same column.
y = df.pop('Alcoholic')
X = df

In [None]:
# Fit the classifier with the hyper-parameters produced by the project's search
import joblib
from catboost import CatBoostClassifier
from DSML.train import run_hyperopt

# Positional indices of the configured categorical columns that exist in X
categorical_indices = [
    X.columns.get_loc(name)
    for name in categorical
    if name in X.columns
]

# The search returns a path that joblib can load the chosen parameters from
best_params_path = run_hyperopt(X, y, categorical_indices)
params = joblib.load(best_params_path)

model = CatBoostClassifier(verbose=True, **params)
model.fit(X, y, cat_features=categorical_indices)

In [None]:
# Prepare reference data for NannyML
import pandas as pd
import numpy as np

# NOTE(review): X is the same data the model was fitted on. NannyML's
# documentation recommends a reference set the model has not been trained on
# (e.g. a held-out test set) — confirm that reusing the training data is intended.
reference_df = X.copy()
reference_df["prediction"] = model.predict(X)
# Column 1 of predict_proba holds the probability of the second class
# (the positive class for a binary target).
reference_df["predicted_probability"] = model.predict_proba(X)[:, 1]
reference_df[target] = y

# Get column names for drift calculation.
# Converted to a plain list because NannyML documents `column_names` as a list
# of strings rather than a pandas Index.
feature_columns = X.columns.tolist()

In [None]:
# Set up the NannyML drift calculator and performance estimator
import nannyml as nml

# Number of observations per chunk, shared by both NannyML objects
chunk_size = 50

# Univariate drift calculator, fitted on the feature frame only
udc = nml.UnivariateDriftCalculator(column_names=feature_columns, chunk_size=chunk_size)
udc.fit(X)

# CBPE performance estimator, fitted on the reference frame
# (features + prediction + predicted probability + target)
estimator = nml.CBPE(
    chunk_size=chunk_size,
    metrics=["roc_auc"],
    problem_type="classification_binary",
    y_true=target,
    y_pred="prediction",
    y_pred_proba="predicted_probability",
).fit(reference_df)

In [None]:
# Define analysis function
def analyze_drift(estimator, udc, analysis_df, feature_cols):
    """Estimate performance and compute univariate drift for an analysis set.

    Works on a copy of ``analysis_df``, so the caller's frame is not modified.
    Predictions come from the notebook-level ``model`` object, which must
    already be fitted when this function is called. Plots and the alert
    ranking are rendered as a side effect.

    Args:
        estimator: Fitted performance estimator exposing ``estimate``.
        udc: Fitted drift calculator exposing ``calculate``.
        analysis_df: DataFrame containing at least ``feature_cols``.
        feature_cols: Names of the feature columns used for both prediction
            and drift calculation.

    Returns:
        Tuple ``(univariate_drift, estimated_performance)`` with the drift
        calculator result and the performance estimation result.
    """
    analysis_df = analysis_df.copy()

    # Slice the features once; the same frame feeds the model and the drift calculator
    features = analysis_df[feature_cols]

    # Get predictions for analysis set
    analysis_df["prediction"] = model.predict(features)
    # Column 1 of predict_proba holds the probability of the second class
    analysis_df["predicted_probability"] = model.predict_proba(features)[:, 1]

    # Estimate performance
    estimated_performance = estimator.estimate(analysis_df)
    performance_plot = estimated_performance.plot()
    performance_plot.show()

    # Calculate drift
    univariate_drift = udc.calculate(features)

    # Rank features by alert count
    alert_ranker = nml.AlertCountRanker()
    alert_ranking = alert_ranker.rank(univariate_drift)
    display(alert_ranking)

    # Plot drift for each feature. Plotting is best-effort: a failure for one
    # column is reported (with its cause) and the remaining columns still run.
    for col in feature_cols:
        try:
            univariate_drift.filter(column_names=[col]).plot().show()
            univariate_drift.filter(period="analysis", column_names=[col]).plot(kind='distribution').show()
        except Exception as exc:  # not a bare except: let KeyboardInterrupt/SystemExit through
            print(f"Failed to plot drift analysis for {col}: {exc!r}")

    return univariate_drift, estimated_performance

In [None]:
# Generate synthetic data with drift for testing
import numpy as np

n_samples = 100
# Slice first, then copy: yields an independent frame of at most n_samples rows
# without copying all of X or assigning into a slice of another frame.
synthetic_data = X.iloc[:n_samples].copy()

# Seeded generator so the injected drift is reproducible between runs
rng = np.random.default_rng(42)

# Introduce drift in numeric columns by scaling each value by a random factor
# in [0.8, 1.2). Columns configured as categorical are skipped: scaling would
# turn them into floats, which CatBoost does not accept for cat_features.
numeric_cols = [
    col
    for col in X.select_dtypes(include=[np.number]).columns
    if col not in categorical
]
for col in numeric_cols:
    # Use the actual row count: X may hold fewer than n_samples rows
    synthetic_data[col] = synthetic_data[col] * rng.uniform(0.8, 1.2, size=len(synthetic_data))

# Analyze drift in synthetic data
drift_results, perf_results = analyze_drift(estimator, udc, synthetic_data, feature_columns)

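The above analysis helps identify:
1. Which features are experiencing drift
2. How the estimated model performance (CBPE ROC AUC) changes across the analysis chunks
3. Which features raise the most drift alerts, making them the first candidates to investigate when performance degrades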