In [1]:
import pandas as pd
import numpy as np
import json
from joblib import load
from sklearn.metrics import mean_absolute_error, mean_squared_error
from evidently import Dataset
from evidently import DataDefinition
from evidently import Report
from evidently.presets import DataDriftPreset
import os

In [2]:
# Step 1: Load Reference and Current Data
reference_data = pd.read_csv("../data/processed/cleaned_data.csv")
#current_data = pd.read_csv("../data/processed/new_data.csv")   # Simulate new data batch

# If new data isn't available, simulate drift for demo purposes
current_data = reference_data.sample(frac=1).copy()
current_data['sales'] = current_data['sales'] * np.random.uniform(0.7, 1.3, len(current_data))

# Define schema for Evidently
schema = DataDefinition(
    categorical_columns=["store", "item"]
)

In [3]:
# Step 2: Create Evidently Datasets
eval_data_ref = Dataset.from_pandas(
    pd.DataFrame(reference_data), data_definition=schema
)

eval_data_curr = Dataset.from_pandas(
    pd.DataFrame(current_data), data_definition=schema
)

In [4]:
# Step 3: Run Data Drift Report
report = Report([DataDriftPreset()])
data_drift_report = report.run(eval_data_curr, eval_data_ref)

# Save Data Drift Report (HTML)
data_drift_report.save_html("../reports/data_drift_report.html")

# Displaying the result for the user
print("Data Drift Report saved to '../reports/data_drift_report.html'.")

Data Drift Report saved to '../reports/data_drift_report.html'.


In [11]:

model = load("../models/model.pkl")

# Step 1: Process current data (just like the training data)
current_data['date'] = pd.to_datetime(current_data['date'])  # Ensure the date column is in datetime format
current_data['year'] = current_data['date'].dt.year
current_data['month'] = current_data['date'].dt.month
current_data['day'] = current_data['date'].dt.day
current_data['dayofweek'] = current_data['date'].dt.dayofweek
current_data['is_weekend'] = current_data['dayofweek'].isin([5,6]).astype(int)

# Sort and group by store-item
current_data = current_data.sort_values(['store', 'item', 'date'])
current_data['lag_1'] = current_data.groupby(['store','item'])['sales'].shift(1)
current_data['lag_7'] = current_data.groupby(['store','item'])['sales'].shift(7)
current_data['rolling_mean_7'] = current_data.groupby(['store','item'])['sales'].shift(1).rolling(7).mean()
current_data['rolling_mean_30'] = current_data.groupby(['store','item'])['sales'].shift(1).rolling(30).mean()

# Drop NaN values after creating features
current_data = current_data.dropna()

# Prepare current data for prediction
X_curr = current_data.drop(['sales', 'date'], axis=1)  # Make sure sales and date are dropped
y_curr = current_data['sales']

# Step 2: Model Prediction
y_pred = model.predict(X_curr)

# Step 3: Calculate performance metrics (MAE, RMSE, MAPE)
mae = mean_absolute_error(y_curr, y_pred)
rmse = np.sqrt(mean_squared_error(y_curr, y_pred))
mape = np.mean(np.abs((y_curr - y_pred) / y_curr)) * 100

# Load baseline metrics
with open("../models/metrics.json") as f:
    base_metrics = json.load(f)

# Step 4: Compare with baseline MAPE to detect concept drift
concept_drift_detected = mape > base_metrics["mape"] * 1.2

# Step 5: Print Concept Drift Status
if concept_drift_detected:
    print(f"⚠️ Concept Drift Detected! Current MAPE: {mape:.2f} (baseline: {base_metrics['mape']:.2f})")
else:
    print(f"✅ Model performance is stable. MAPE: {mape:.2f} (baseline: {base_metrics['mape']:.2f})")


⚠️ Concept Drift Detected! Current MAPE: inf (baseline: 12.64)


In [13]:
# print(dir(data_drift_report))


In [15]:
# Inspect the _metrics and metric_results attributes
#print("Metrics:", data_drift_report._metrics)
#print("Metric Results:", data_drift_report.metric_results)


In [17]:
# Access dataset drift from _metrics (or metric_results, depending on structure)
dataset_drift = data_drift_report._metrics.get('dataset_drift', None)


In [19]:
# Convert numpy.bool_ to Python bool
def convert_to_native_bool(val):
    if isinstance(val, np.bool_):  # Check if it's a numpy boolean
        return bool(val)  # Convert to native Python boolean
    return val

# Step 5: Save Drift Metrics to Log
try:
    # Convert all drift metrics to native Python types
    drift_results = {
        "data_drift_detected": convert_to_native_bool(dataset_drift),
        "concept_drift_detected": convert_to_native_bool(concept_drift_detected),
        "current_mape": mape,
        "baseline_mape": base_metrics["mape"]
    }

    # Append drift results to log file
    with open("../logs/drift_logs.log", "a") as f:
        f.write(json.dumps(drift_results) + "\n")

    # Save summary of drift results as JSON
    with open("../reports/drift_summary.json", "w") as f:
        json.dump(drift_results, f, indent=2)

    # Display success message
    print("Drift metrics logged and saved to 'drift_logs.log' and 'drift_summary.json'.")

except Exception as e:
    print(f"Error while saving drift metrics: {e}")


Drift metrics logged and saved to 'drift_logs.log' and 'drift_summary.json'.
