<a href="https://colab.research.google.com/github/jagadeesh01032005/explainable-AI/blob/main/assignment_4_2290.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
import shap
from lime import lime_tabular

In [None]:
# Output directory for every artifact this notebook writes (plots, HTML, CSV, text).
# `os` is imported here because none of the import cells above bring it in; without
# it this cell and every later os.path.join(...) call raise NameError on a fresh kernel.
import os

OUTDIR = "outputs"
os.makedirs(OUTDIR, exist_ok=True)  # exist_ok: re-running the cell is harmless

In [None]:
# ===============
# 1) Load the ENB2012 workbook into `df`
# ===============
UCI_XLSX_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"

# NOTE(review): this "alternate" is character-for-character the same as the primary
# URL, so the second attempt is only a plain retry, not a mirror. Replace it with a
# genuinely different location once one has been confirmed.
ALT_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"

df = None
load_errors = []  # one exception per failed attempt, kept so the cause is never lost
for url in (UCI_XLSX_URL, ALT_URL):
    try:
        df = pd.read_excel(url, engine="openpyxl")
        break
    except Exception as e:
        # Best-effort download: remember why it failed and report it, instead of
        # silently discarding the exception, then move on to the next candidate.
        load_errors.append(e)
        print(f"Download attempt {len(load_errors)} failed ({type(e).__name__}: {e})")

if df is None:
    # Every candidate failed: stop here with the last underlying error attached,
    # rather than letting later cells fail on a missing `df`.
    raise RuntimeError(
        f"Could not read ENB2012_data.xlsx after {len(load_errors)} attempt(s)"
    ) from load_errors[-1]

In [None]:
# Normalise the header row: cast to str, trim surrounding whitespace, and
# replace inner spaces with underscores.
df.columns = [str(header).strip().replace(" ", "_") for header in df.columns]

# Short codes -> descriptive names. X1..X8 are the eight features,
# Y1 / Y2 the heating and cooling targets.
rename_map = {
    "X1": "Relative_Compactness",
    "X2": "Surface_Area",
    "X3": "Wall_Area",
    "X4": "Roof_Area",
    "X5": "Overall_Height",
    "X6": "Orientation",
    "X7": "Glazing_Area",
    "X8": "Glazing_Area_Distribution",
    "Y1": "Heating_Load",
    "Y2": "Cooling_Load",
}
# rename() leaves labels that are absent from the frame untouched, so codes that
# are not present are simply skipped.
df = df.rename(columns=rename_map)

# Drop columns that contain nothing but missing values (e.g. a stray unnamed column).
na_all_cols = [col for col in df.columns if df[col].isna().all()]
df = df.drop(columns=na_all_cols)

# Restrict the frame to whichever of the ten expected columns are present,
# in this canonical order.
expected_cols = [
    "Relative_Compactness",
    "Surface_Area",
    "Wall_Area",
    "Roof_Area",
    "Overall_Height",
    "Orientation",
    "Glazing_Area",
    "Glazing_Area_Distribution",
    "Heating_Load",
    "Cooling_Load",
]
available_cols = [col for col in expected_cols if col in df.columns]
df = df[available_cols]

# ===============
# 2) Features / target and hold-out split (this assignment models Heating_Load, Y1)
# ===============
target_col = "Heating_Load"  # set to "Cooling_Load" to model Y2 instead

feature_cols = [
    "Relative_Compactness",
    "Surface_Area",
    "Wall_Area",
    "Roof_Area",
    "Overall_Height",
    "Orientation",
    "Glazing_Area",
    "Glazing_Area_Distribution",
]

# Independent copies, so later edits to X / y cannot touch `df`.
X = df.loc[:, feature_cols].copy()
y = df.loc[:, target_col].copy()

# 80 / 20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# ===============
# 3) Train the RandomForest regressor and evaluate it on the hold-out set
# ===============
rfr = RandomForestRegressor(
    n_estimators=600,
    max_depth=None,
    min_samples_leaf=1,
    random_state=42,  # fixed seed: same forest on every run
    n_jobs=-1,
)
rfr.fit(X_train, y_train)

# Hold-out predictions. Later cells read `preds` (median-instance selection for the
# SHAP force plot, low/high instance selection for LIME); it was previously never
# assigned, so those cells failed with NameError on a fresh kernel.
preds = rfr.predict(X_test)

# Report fit quality with the metric functions the notebook already imports.
r2 = r2_score(y_test, preds)
mae = mean_absolute_error(y_test, preds)
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
print(f"Test R^2: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rmse:.4f}")

In [None]:
# ===============
# 4) Permutation Importance (PI)
# ===============
# Shuffle each feature 20 times on the test split and record the score drop.
pi = permutation_importance(
    rfr,
    X_test,
    y_test,
    n_repeats=20,
    random_state=42,
    n_jobs=-1,
)
pi_means = pd.Series(pi.importances_mean, index=feature_cols).sort_values(ascending=False)

# Bar chart of the mean importances, written to disk rather than shown inline.
pi_path = os.path.join(OUTDIR, "pi_bar_heating.png")
fig, ax = plt.subplots(figsize=(8, 5))
pi_means.plot.bar(ax=ax)
ax.set_title("Permutation Importance – Heating Load (RandomForest)")
ax.set_ylabel("Mean decrease in R^2 on shuffle")
fig.tight_layout()
fig.savefig(pi_path, dpi=180)
plt.close(fig)
print(f"Saved PI bar chart to: {pi_path}")

Saved PI bar chart to: outputs/pi_bar_heating.png


In [None]:
# ===============
# 5) SHAP – global (summary) & local (force) explanations
# ===============
explainer = shap.TreeExplainer(rfr)
shap_values = explainer.shap_values(X_train)


def _save_current_figure(path):
    """Tighten the layout of the active matplotlib figure, write it to `path`, close it."""
    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()


# Global view 1: beeswarm summary over the training rows.
shap_beeswarm_path = os.path.join(OUTDIR, "shap_summary_beeswarm.png")
plt.figure()
shap.summary_plot(shap_values, X_train, show=False)  # show=False: keep the figure open for saving
_save_current_figure(shap_beeswarm_path)
print(f"Saved SHAP beeswarm to: {shap_beeswarm_path}")

# Global view 2: bar summary of the same SHAP values.
shap_bar_path = os.path.join(OUTDIR, "shap_summary_bar.png")
plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)
_save_current_figure(shap_bar_path)
print(f"Saved SHAP bar summary to: {shap_bar_path}")

# Local explanation: a SHAP force plot for one "typical" test building, chosen as
# the test row whose prediction lies closest to the median prediction.
median_pred = np.median(preds)
abs_gap_to_median = np.abs(preds - median_pred)
idx_median = int(np.argsort(abs_gap_to_median)[0])  # position of the smallest gap

# The chosen row (as a one-row frame) with its true and predicted target.
x_instance = X_test.iloc[[idx_median]]
y_true_inst = y_test.iloc[idx_median]
y_pred_inst = preds[idx_median]

# SHAP values for the test split; the force plot is given the base value plus the
# single SHAP row and feature row belonging to the chosen instance.
shap_values_test = explainer.shap_values(X_test)
force = shap.force_plot(
    explainer.expected_value,
    shap_values_test[idx_median],
    X_test.iloc[idx_median],
    matplotlib=False,
)

# Save the plot as a standalone HTML file.
force_path = os.path.join(OUTDIR, "shap_force_local.html")
shap.save_html(force_path, force)
print(f"Saved SHAP local force plot to: {force_path}")

Saved SHAP beeswarm to: outputs/shap_summary_beeswarm.png
Saved SHAP bar summary to: outputs/shap_summary_bar.png
Saved SHAP local force plot to: outputs/shap_force_local.html


In [None]:
# LIME tabular explainer in regression mode, built from the raw training matrix
# with a fixed seed.
explainer_lime = lime_tabular.LimeTabularExplainer(
    X_train.values,
    mode="regression",
    feature_names=feature_cols,
    discretize_continuous=True,
    random_state=42,
    verbose=False,
)

In [None]:
# Test-set positions of the lowest and highest predicted heating load.
pred_order = np.argsort(preds)  # ascending: first = lowest, last = highest
low_idx, high_idx = int(pred_order[0]), int(pred_order[-1])

In [None]:
def _predict_with_feature_names(rows):
    """Predict heating load for an array of feature rows.

    LIME calls the prediction function with a bare array, but the forest was fitted
    on a DataFrame with named columns; passing the array straight to `rfr.predict`
    makes scikit-learn emit a feature-name UserWarning on every call. Re-wrapping
    the rows with the training column names avoids that.
    """
    return rfr.predict(pd.DataFrame(rows, columns=feature_cols))


# One LIME explanation each for the lowest- and highest-demand test buildings,
# saved as standalone HTML files.
for label, idx in [("low", low_idx), ("high", high_idx)]:
    exp = explainer_lime.explain_instance(
        data_row=X_test.iloc[idx].values,
        predict_fn=_predict_with_feature_names,
        num_features=8,  # all eight input features
    )
    html_path = os.path.join(OUTDIR, f"lime_{label}_heating.html")
    exp.save_to_file(html_path)
    print(f"Saved LIME explanation to: {html_path} (instance index {idx})")

Saved LIME explanation to: outputs/lime_low_heating.html (instance index 42)
Saved LIME explanation to: outputs/lime_high_heating.html (instance index 3)


In [None]:
# Global SHAP importance: mean absolute SHAP value per feature, largest first.
abs_shap = np.abs(shap_values)
mean_abs_shap = pd.Series(abs_shap.mean(axis=0), index=feature_cols).sort_values(ascending=False)

In [None]:
# Side-by-side dense ranks (1 = most important) from both methods, ordered by PI rank.
pi_rank = pi_means.rank(ascending=False, method='dense')
shap_rank = mean_abs_shap.rank(ascending=False, method='dense')
comp_df = pd.DataFrame({'PI_rank': pi_rank, 'mean|SHAP|_rank': shap_rank})
comp_df = comp_df.sort_values('PI_rank')

In [None]:
# Narrative findings that a later cell writes out as a numbered list.
# NOTE(review): these are fixed strings, not derived from the computed results;
# check them against the plots and tables before relying on them.
insights = [
    "Permutation Importance and SHAP broadly agree on the dominant geometric/area features (e.g., Surface_Area, Wall_Area, Roof_Area, Relative_Compactness).",
    "SHAP reveals directionality: higher Relative_Compactness and greater Overall_Height tend to reduce heating load, while larger Surface/Wall/Roof areas tend to increase it (given other factors constant).",
    "Orientation and Glazing characteristics show moderate effects globally, but can be decisive locally depending on the specific building.",
    "LIME explanations for the low-demand building highlight compactness/height and smaller envelope areas as key to lower load; for the high-demand building, large envelope areas and glazing often push predictions upward.",
    "Where PI and SHAP diverge slightly, it is often due to correlated features; SHAP distributes credit more consistently across correlated groups.",
    "Local SHAP force plot shows how a handful of features push the prediction above/below the base value, clarifying case-by-case tradeoffs.",
    "Across methods: PI is simple and model-agnostic for global ranks; SHAP provides both global ranking and signed, instance-level attributions; LIME offers sparse, human-readable rules for specific buildings.",
]

In [None]:
# Write the insights as a numbered list under a short header.
insights_path = os.path.join(OUTDIR, "insights.txt")
numbered_lines = [f"{number}. {text}\n" for number, text in enumerate(insights, 1)]
with open(insights_path, "w", encoding="utf-8") as f:
    f.write("Insights (5–10 bullets)\n-------------------------\n")
    f.writelines(numbered_lines)

In [None]:
# Save the PI-vs-SHAP rank comparison as CSV, with the index written as the first column.
comp_path = os.path.join(OUTDIR, "pi_vs_shap_ranks.csv")
comp_df.to_csv(path_or_buf=comp_path)

In [None]:
# Summary of every artifact path produced above, one line per deliverable.
lime_low_path = os.path.join(OUTDIR, 'lime_low_heating.html')
lime_high_path = os.path.join(OUTDIR, 'lime_high_heating.html')

deliverable_lines = [
    f"PI bar chart: {pi_path}",
    f"SHAP summary (beeswarm): {shap_beeswarm_path}",
    f"SHAP summary (bar): {shap_bar_path}",
    f"SHAP local force (HTML): {force_path}",
    f"LIME low/high (HTML): {lime_low_path} | {lime_high_path}",
    f"Insights: {insights_path}",
    f"PI vs SHAP ranks CSV: {comp_path}",
]

print("\n=== Deliverables written ===")
for line in deliverable_lines:
    print(line)


=== Deliverables written ===
PI bar chart: outputs/pi_bar_heating.png
SHAP summary (beeswarm): outputs/shap_summary_beeswarm.png
SHAP summary (bar): outputs/shap_summary_bar.png
SHAP local force (HTML): outputs/shap_force_local.html
LIME low/high (HTML): outputs/lime_low_heating.html | outputs/lime_high_heating.html
Insights: outputs/insights.txt
PI vs SHAP ranks CSV: outputs/pi_vs_shap_ranks.csv


In [None]:
# Final pointer to the generated artifacts.
closing_message = "\nDone. Open the HTML files for interactive local explanations and the PNGs for plots."
print(closing_message)


Done. Open the HTML files for interactive local explanations and the PNGs for plots.
