In [3]:
# !pip install lime

In [4]:
import os
import joblib
import numpy as np
import pandas as pd
import shap
import lime
from lime.lime_tabular import LimeTabularExplainer
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
np.random.seed(42)

# folder tags
folder_tag = "_lag"   # change to "" for no lag version
model_plot_path = f"artifacts/model_plots{folder_tag}/"
model_path = f"artifacts/models{folder_tag}/"
model_results_path = f"artifacts/model_results{folder_tag}/"
INPUT_CSV = "artifacts/data/clean_data_with_lag_roll.csv"
TARGET = "Average_Price"

os.makedirs(model_plot_path, exist_ok=True)
os.makedirs(model_path, exist_ok=True)
os.makedirs(model_results_path, exist_ok=True)

df = pd.read_csv(INPUT_CSV)
y = df[TARGET]
X = df.drop(columns=[TARGET])

RUN_MODELS = ["Linear", "RandomForest", "XGBoost", "LightGBM"]


In [6]:
models = {}
for name in RUN_MODELS:
    path = os.path.join(model_path, f"{name}_best.joblib")
    if os.path.exists(path):
        models[name] = joblib.load(path)
        print(f"Loaded: {name}")
    else:
        print(f"Model not found: {path}")

Loaded: Linear
Loaded: RandomForest
Loaded: XGBoost
Loaded: LightGBM


In [7]:
from sklearn.metrics import mean_squared_error

perm_results = {}

for name, model in models.items():
    result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
    perm_df = pd.DataFrame({
        "Feature": X.columns,
        "Importance": result.importances_mean
    }).sort_values("Importance", ascending=False)
    perm_results[name] = perm_df

    plt.figure(figsize=(8,6))
    sns.barplot(data=perm_df.head(15), x="Importance", y="Feature")
    plt.title(f"Permutation Importance — {name}")
    plt.tight_layout()
    plt.savefig(os.path.join(model_plot_path, f"{name}_permutation_importance.png"), dpi=200)
    plt.close()


ValueError: SimpleImputer does not support data with dtype bool. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.

In [8]:
for name, model in models.items():
    print(f"SHAP Analysis — {name}")
    sample = X.sample(300, random_state=42)
    if "Forest" in name or "Boost" in name or "LightGBM" in name:
        explainer = shap.TreeExplainer(model.named_steps['est'])
    else:
        explainer = shap.KernelExplainer(model.predict, shap.sample(X, 100))

    shap_values = explainer.shap_values(sample)
    shap.summary_plot(shap_values, sample, show=False)
    plt.title(f"SHAP Summary — {name}")
    plt.tight_layout()
    plt.savefig(os.path.join(model_plot_path, f"{name}_shap_summary.png"), dpi=200)
    plt.close()


SHAP Analysis — Linear


AttributeError: property 'feature_names_in_' of 'Pipeline' object has no setter

In [9]:
for name, model in models.items():
    preds = model.predict(X)
    abs_errors = np.abs(preds - y)
    top_idx = np.argsort(abs_errors)[-5:]
    explainer = LimeTabularExplainer(
        X.values,
        feature_names=X.columns,
        mode='regression',
        discretize_continuous=True
    )

    for i, idx in enumerate(top_idx):
        exp = explainer.explain_instance(X.iloc[idx].values, model.predict)
        fig = exp.as_pyplot_figure()
        plt.title(f"LIME — {name} | Instance {idx}")
        plt.tight_layout()
        plt.savefig(os.path.join(model_plot_path, f"{name}_lime_top{i+1}.png"), dpi=200)
        plt.close()


ValueError: SimpleImputer does not support data with dtype bool. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.

In [10]:
groups = {
    "climate": ['Temperature','Precipitation','Wind_Speed','Air_Pressure','Rainfall_MM'],
    "economic": ['USD_TO_NPR','Diesel','Inflation'],
    "market": ['Supply_Volume','imported_tomato_price'],
    "temporal": ['day_of_week_sin','day_of_week_cos','month_sin','month_cos','is_festival']
}


In [11]:
from sklearn.metrics import mean_absolute_error

ablation_results = []
for name, model in models.items():
    base_pred = model.predict(X)
    base_mae = mean_absolute_error(y, base_pred)

    for grp, cols in groups.items():
        X_drop = X.drop(columns=cols, errors='ignore')
        pred = model.predict(X_drop)
        mae = mean_absolute_error(y, pred)
        ablation_results.append({
            "Model": name,
            "Group": grp,
            "MAE_Change": mae - base_mae
        })

ablation_df = pd.DataFrame(ablation_results)
plt.figure(figsize=(8,5))
sns.barplot(data=ablation_df, x="Group", y="MAE_Change", hue="Model")
plt.title("Feature Group Ablation (MAE Change)")
plt.tight_layout()
plt.savefig(os.path.join(model_plot_path, "group_ablation.png"), dpi=200)
plt.close()


ValueError: SimpleImputer does not support data with dtype bool. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.

In [12]:
shap.plots.waterfall(shap.Explanation(values=shap_values[0], base_values=explainer.expected_value, data=sample.iloc[0]), show=False)
plt.savefig(os.path.join(model_plot_path, f"{name}_waterfall_sample.png"), dpi=200)
plt.close()


NameError: name 'shap_values' is not defined