In [None]:
# 0) Setup
import os, sys, json, pickle, math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Dict, Any, Optional

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss,
    classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve,
    brier_score_loss
)
from sklearn.calibration import CalibrationDisplay, calibration_curve

import joblib

# Try optional imports
try:
    import shap  # type: ignore
except Exception:
    shap = None

try:
    import xgboost as xgb  # type: ignore
except Exception:
    xgb = None

try:
    import lightgbm as lgb  # type: ignore
except Exception:
    lgb = None

# Config (EDIT HERE)
# Location of the training artifacts and the kind of report to build.
ARTIFACT_DIR = Path("artifacts")
TASK_TYPE = "classification"  # or "regression"

# Models to evaluate; MODEL_NAMES[i] is the display name used for MODEL_FILES[i].
MODEL_FILES = [ARTIFACT_DIR / "best_model.joblib"]
MODEL_NAMES = ["BestModel"]

# If using test.csv
TEST_CSV = None  # e.g., ARTIFACT_DIR / "test.csv"
TARGET_NAME = "diabetes"
CLASS_NAMES = ["0", "1"]

# If using npy files
X_TEST_NPY = None  # e.g., ARTIFACT_DIR / "X_test.npy"
Y_TEST_NPY = None  # e.g., ARTIFACT_DIR / "y_test.npy"

# Output paths: plots and tables live side by side under OUT_DIR.
OUT_DIR = Path("reports", "report_outputs")
PLOTS_DIR = OUT_DIR.joinpath("plots")
TABLES_DIR = OUT_DIR.joinpath("tables")

RANDOM_STATE = 42

# Utils
# Create the output folders up front so the save helpers can write immediately.
for _output_dir in (PLOTS_DIR, TABLES_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)


def ensure_dir(p: Path) -> None:
    """Create directory ``p`` together with any missing parents.

    Calling it on a directory that already exists is a no-op.
    """
    os.makedirs(p, exist_ok=True)


def save_df(name: str, df: pd.DataFrame) -> None:
    """Write ``df`` to ``TABLES_DIR/<name>.csv`` without the index column."""
    df.to_csv(TABLES_DIR / f"{name}.csv", index=False)


def save_fig(name: str) -> None:
    """Save the current matplotlib figure as ``PLOTS_DIR/<name>.png`` and close it.

    The figure is tightened with ``tight_layout`` before being written at 160 dpi.
    """
    current = plt.gcf()
    current.tight_layout()
    current.savefig(PLOTS_DIR / f"{name}.png", dpi=160)
    plt.close(current)

# Signal that imports, configuration and the save helpers above are in place.
print("Setup complete.")


In [None]:
# 1) Data loading
from pathlib import Path
from typing import Tuple

# Allow overriding TEST_CSV via config; default to project dataset if present.
_default_csv = Path('diabetes_prediction_dataset.csv')
if TEST_CSV is None and _default_csv.exists():
    TEST_CSV = _default_csv
    # Make the fallback visible instead of silently switching data sources.
    # NOTE(review): if this file is the full dataset the model was trained on,
    # the metrics below are not a held-out estimate -- confirm before reporting.
    print(f"Note: TEST_CSV not set; falling back to '{_default_csv}'. "
          "Metrics are only a held-out estimate if this file excludes the training rows.")


def load_test_data() -> Tuple[pd.DataFrame, pd.Series]:
    """Load the evaluation features and labels.

    Sources are tried in this order:

    1. ``X_TEST_NPY`` / ``Y_TEST_NPY`` when both are configured and both files exist.
    2. ``TEST_CSV`` when configured and the file exists; ``TARGET_NAME`` is split
       off as the label column.

    Returns:
        ``(X_df, y_ser)``: features as a DataFrame and labels as a Series.

    Raises:
        ValueError: if the feature array is not 2D, the label array is not
            one-dimensional (a single-column 2D array is flattened), the two
            arrays disagree on the number of rows, or ``TARGET_NAME`` is missing
            from the CSV.
        FileNotFoundError: if no configured data source exists.
    """
    if X_TEST_NPY and Y_TEST_NPY and Path(X_TEST_NPY).exists() and Path(Y_TEST_NPY).exists():
        X = np.load(X_TEST_NPY, allow_pickle=False)
        y = np.load(Y_TEST_NPY, allow_pickle=False)
        if not (isinstance(X, np.ndarray) and X.ndim == 2):
            raise ValueError("X_test.npy must be a 2D array")
        y = np.asarray(y)
        # Labels saved as a column vector (n, 1) would make pd.Series fail.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()
        if y.ndim != 1:
            raise ValueError(f"y_test.npy must be a 1D array (got shape {y.shape})")
        # Catch misaligned files here rather than deep inside the metric code.
        if len(y) != len(X):
            raise ValueError(
                f"X_test.npy has {len(X)} rows but y_test.npy has {len(y)} labels"
            )
        return pd.DataFrame(X), pd.Series(y)
    if TEST_CSV and Path(TEST_CSV).exists():
        df = pd.read_csv(TEST_CSV)
        if TARGET_NAME not in df.columns:
            raise ValueError(f"TARGET_NAME '{TARGET_NAME}' not in columns of {TEST_CSV}")
        X_df = df.drop(columns=[TARGET_NAME])
        y_ser = df[TARGET_NAME]
        return X_df, y_ser
    raise FileNotFoundError("No test data found. Provide X_test.npy/y_test.npy or test.csv (TEST_CSV).")

# Load once; X_test / y_test are the names the following cells rely on.
X_test, y_test = load_test_data()

# Quick sanity summary of what was loaded.
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
_missing_cells = X_test.isna().sum().sum()
print("Nulls in X_test:", int(_missing_cells))
if TASK_TYPE == 'classification':
    _class_share = y_test.value_counts(normalize=True)
    print("Class balance:")
    print(_class_share)


In [None]:
# 2) Model loading
from typing import Any


def try_load_model(path: Path) -> Any:
    """Load a serialized model, trying several formats in turn.

    Loaders are attempted in this order and the first one that succeeds wins:
    joblib, pickle, xgboost ``Booster`` (only if xgboost is importable),
    lightgbm ``Booster`` (only if lightgbm is importable).

    Args:
        path: Location of the model file.

    Returns:
        Whatever object the first successful loader produced.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        RuntimeError: if every loader failed; the message lists the error
            reported by each loader so the real cause is not lost.
    """
    if not path.exists():
        raise FileNotFoundError(f"Model file not found: {path}")

    # NOTE(security): joblib and pickle execute arbitrary code while
    # deserializing. Only point this at model files you trust.
    def _load_joblib(p: Path) -> Any:
        return joblib.load(p)

    def _load_pickle(p: Path) -> Any:
        with open(p, 'rb') as f:
            return pickle.load(f)

    def _load_xgboost(p: Path) -> Any:
        booster = xgb.Booster()
        booster.load_model(str(p))
        return booster

    def _load_lightgbm(p: Path) -> Any:
        return lgb.Booster(model_file=str(p))

    failures = []

    def _attempt(label: str, loader: Any) -> Any:
        """Run one loader; record its error and return None on failure."""
        try:
            return loader(path)
        except Exception as exc:
            failures.append(f"{label}: {type(exc).__name__}: {exc}")
            return None

    for label, loader in (("joblib", _load_joblib), ("pickle", _load_pickle)):
        loaded = _attempt(label, loader)
        if loaded is not None:
            return loaded
    if xgb is not None:
        loaded = _attempt("xgboost", _load_xgboost)
        if loaded is not None:
            return loaded
    if lgb is not None:
        loaded = _attempt("lightgbm", _load_lightgbm)
        if loaded is not None:
            return loaded

    raise RuntimeError(f"Unable to load model: {path} (tried " + "; ".join(failures) + ")")


# Load every configured model; a file that cannot be loaded is reported and skipped.
models = []
for position, model_file in enumerate(MODEL_FILES, start=1):
    try:
        loaded = try_load_model(Path(model_file))
    except Exception as load_error:
        print(f"Warning: skipping {model_file}: {load_error}")
        continue
    # Fall back to a positional name when MODEL_NAMES is shorter than MODEL_FILES.
    if position <= len(MODEL_NAMES):
        label = MODEL_NAMES[position - 1]
    else:
        label = f"Model_{position}"
    models.append({"name": label, "model": loaded})

if len(models) == 0:
    raise RuntimeError("No models loaded. Check MODEL_FILES and paths.")

print("Loaded models:", [loaded_entry["name"] for loaded_entry in models])


In [None]:
# 3) Predictions
from sklearn.preprocessing import LabelBinarizer

def _booster_raw_predict(model, X):
    """Return raw predictions if ``model`` is a native xgboost/lightgbm Booster, else ``None``."""
    if xgb is not None and isinstance(model, xgb.Booster):
        # Native xgboost boosters only accept a DMatrix.
        return np.asarray(model.predict(xgb.DMatrix(X)))
    if lgb is not None and isinstance(model, lgb.Booster):
        return np.asarray(model.predict(X))
    return None


def _scores_to_pseudo_proba(scores):
    """Turn ``decision_function`` scores into pseudo-probabilities (not calibrated).

    1D scores are min-max scaled into [0, 1] and returned as two columns
    ``[1 - s, s]``. 2D scores get a row-wise softmax so that every row sums to
    one, which multiclass ``roc_auc_score`` requires.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.ndim == 1:
        s = (scores - scores.min()) / (scores.max() - scores.min() + 1e-12)
        return np.vstack([1 - s, s]).T
    shifted = scores - scores.max(axis=1, keepdims=True)  # numerical stability
    exp_scores = np.exp(shifted)
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)


def _predict_one(name, model, X):
    """Return ``(y_pred, y_proba)`` for one model; ``y_proba`` is ``None`` when unavailable."""
    # Native boosters must be detected first: a LightGBM Booster's predict()
    # accepts a DataFrame and returns scores, which would otherwise be stored
    # as if they were class labels.
    raw = _booster_raw_predict(model, X)
    if raw is not None:
        if TASK_TYPE != 'classification':
            return raw, None
        # Assumes the booster outputs probabilities (e.g. a logistic/softprob
        # objective) and that classes are encoded 0..k-1 -- TODO confirm.
        proba = raw if raw.ndim > 1 else np.vstack([1 - raw, raw]).T
        return np.argmax(proba, axis=1), proba

    # scikit-like API
    y_pred = model.predict(X)
    y_proba = None
    if TASK_TYPE == 'classification':
        try:
            if hasattr(model, 'predict_proba'):
                y_proba = model.predict_proba(X)
            elif hasattr(model, 'decision_function'):
                y_proba = _scores_to_pseudo_proba(model.decision_function(X))
        except Exception as e:
            # Keep the hard predictions even if probabilities cannot be computed.
            print(f"Probability estimates unavailable for {name}: {e}")
            y_proba = None
    return y_pred, y_proba


preds = []
for entry in models:
    name, model = entry["name"], entry["model"]
    try:
        y_pred, y_proba = _predict_one(name, model, X_test)
    except Exception as e:
        # Never append an entry without predictions; later cells assume y_pred exists.
        print(f"Prediction failed for {name}: {e}")
        continue

    preds.append({"name": name, "y_pred": y_pred, "y_proba": y_proba})

print("Predictions computed for:", [p["name"] for p in preds])


In [None]:
# 4) Metrics & plots

if TASK_TYPE == 'classification':
    summary_rows = []
    # Combined ROC figure across models. Explicit handles are kept because the
    # loop below opens and closes other figures, so "current figure" is not a
    # reliable way to get back to this one.
    roc_fig, roc_ax = plt.subplots(figsize=(6,5))
    any_proba = False
    roc_curves_drawn = 0
    for pr in preds:
        name = pr['name']
        y_pred = pr['y_pred']
        y_proba = pr['y_proba']

        if y_pred is None:
            print(f"Skipping metrics for {name}: no predictions available")
            continue

        acc = accuracy_score(y_test, y_pred)
        prec_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
        f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        rocauc = np.nan
        logloss_val = np.nan
        brier = np.nan
        pos = None  # positive-class scores; only set for binary problems
        if y_proba is not None:
            any_proba = True
            y_proba = np.asarray(y_proba)
            # binary or multiclass
            if y_proba.ndim == 1 or y_proba.shape[1] == 2:
                pos = y_proba[:, -1] if y_proba.ndim > 1 else y_proba
            try:
                if pos is not None:
                    rocauc = roc_auc_score(y_test, pos)
                    fpr, tpr, _ = roc_curve(y_test, pos)
                    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={rocauc:.3f})")
                    roc_curves_drawn += 1
                else:
                    # Multiclass: only the macro one-vs-rest AUC is reported;
                    # per-class curves are not drawn.
                    rocauc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='macro')
            except Exception as e:
                print(f"ROC AUC failed for {name}: {e}")
            if pos is not None:
                try:
                    logloss_val = log_loss(y_test, np.vstack([1-pos, pos]).T)
                except Exception as e:
                    print(f"log_loss failed for {name}: {e}")
                try:
                    brier = brier_score_loss(y_test, pos)
                    print(f"Brier score {name}: {brier:.4f}")
                except Exception as e:
                    print(f"Brier score failed for {name}: {e}")

        summary_rows.append({
            'model': name,
            'accuracy': acc,
            'precision_macro': prec_macro,
            'recall_macro': rec_macro,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'roc_auc': rocauc,
            'log_loss': logloss_val,
            'brier': brier,
        })

        # Per-class metrics table
        try:
            rpt = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
            df_rpt = pd.DataFrame(rpt).T.reset_index().rename(columns={'index':'class'})
            save_df(f"per_class_metrics_{name}", df_rpt)
        except Exception as e:
            print(f"classification_report failed for {name}: {e}")

        # Confusion matrices (raw counts, then row-normalized)
        cm_fig = None
        try:
            cm = confusion_matrix(y_test, y_pred)
            # The epsilon avoids division by zero for a row without samples.
            cmn = cm.astype('float') / (cm.sum(axis=1, keepdims=True) + 1e-12)
            cm_variants = (
                ("counts", cm, {}),
                ("normalized", cmn, {"vmin": 0, "vmax": 1}),
            )
            for cm_kind, cm_matrix, imshow_kwargs in cm_variants:
                cm_fig, ax = plt.subplots(figsize=(5,4))
                im = ax.imshow(cm_matrix, cmap='Blues', **imshow_kwargs)
                ax.set_title(f"Confusion ({cm_kind}) - {name}")
                ax.set_xlabel('Predicted')
                ax.set_ylabel('True')
                cm_fig.colorbar(im, ax=ax)
                save_fig(f"confusion_matrix_{cm_kind}_{name}")
        except Exception as e:
            print(f"Confusion matrix plotting failed for {name}: {e}")
            if cm_fig is not None:
                plt.close(cm_fig)  # no-op if save_fig already closed it

        # Precision-Recall curve (binary only)
        if pos is not None:
            pr_fig = None
            try:
                prec, rec, _ = precision_recall_curve(y_test, pos)
                pr_fig, pr_ax = plt.subplots(figsize=(6,5))
                pr_ax.plot(rec, prec, label=name)
                pr_ax.set_xlabel('Recall')
                pr_ax.set_ylabel('Precision')
                pr_ax.set_title('Precision-Recall Curve')
                pr_ax.legend()
                save_fig(f"pr_curve_{name}")
            except Exception as e:
                print(f"PR curve failed for {name}: {e}")
                if pr_fig is not None:
                    plt.close(pr_fig)

        # Calibration curve (binary only)
        if pos is not None:
            cal_fig = None
            try:
                prob_true, prob_pred = calibration_curve(y_test, pos, n_bins=10)
                cal_fig, ax = plt.subplots(figsize=(5,4))
                CalibrationDisplay(prob_true, prob_pred, pos).plot(ax=ax)
                ax.set_title(f"Calibration - {name}")
                save_fig(f"calibration_{name}")
            except Exception as e:
                print(f"Calibration failed for {name}: {e}")
                if cal_fig is not None:
                    plt.close(cal_fig)

    # Save summary table
    df_summary = pd.DataFrame(summary_rows)
    save_df("classification_summary", df_summary)

    # Save combined ROC if any probs; otherwise discard the empty figure.
    if any_proba:
        roc_ax.plot([0,1],[0,1],'--', color='gray')
        roc_ax.set_xlabel('False Positive Rate')
        roc_ax.set_ylabel('True Positive Rate')
        roc_ax.set_title('ROC Curves (All Models)')
        if roc_curves_drawn:
            roc_ax.legend()
        # save_fig writes the *current* figure, so select the ROC figure first.
        plt.figure(roc_fig.number)
        save_fig('roc_curves_all_models')
    else:
        plt.close(roc_fig)

else:
    # Regression branch (metrics & plots)
    rows = []
    # Plain arrays avoid pandas index alignment between y_test and predictions.
    y_true = np.asarray(y_test, dtype=float).ravel()
    for pr in preds:
        name = pr['name']
        if pr['y_pred'] is None:
            print(f"Skipping metrics for {name}: no predictions available")
            continue
        y_pred = np.asarray(pr['y_pred'], dtype=float).ravel()
        if y_pred.shape != y_true.shape:
            print(f"Skipping metrics for {name}: got {y_pred.shape[0]} predictions for {y_true.shape[0]} targets")
            continue

        resid = y_true - y_pred
        rmse = float(np.sqrt(np.mean(resid**2)))
        mae = float(np.mean(np.abs(resid)))
        # MAPE is undefined where the target is zero; those rows are excluded.
        mape_mask = y_true != 0
        mape = float(np.mean(np.abs(resid[mape_mask]) / np.abs(y_true[mape_mask]))) if mape_mask.any() else np.nan
        # The epsilon keeps R^2 finite when the target is constant.
        r2 = float(1 - (np.sum(resid**2) / (np.sum((y_true - np.mean(y_true))**2) + 1e-12)))
        medae = float(np.median(np.abs(resid)))
        rows.append({'model': name, 'rmse': rmse, 'mae': mae, 'mape': mape, 'r2': r2, 'medae': medae})

        # Residuals vs fitted
        fig, ax = plt.subplots(figsize=(5,4))
        ax.scatter(y_pred, resid, s=8, alpha=0.6)
        ax.axhline(0, color='gray', ls='--')
        ax.set_xlabel('Fitted')
        ax.set_ylabel('Residuals')
        ax.set_title(f'Residuals vs Fitted - {name}')
        save_fig(f"residuals_vs_fitted_{name}")

        # Pred vs Actual, with the identity line spanning both axes
        fig, ax = plt.subplots(figsize=(5,4))
        ax.scatter(y_true, y_pred, s=8, alpha=0.6)
        lims = [min(ax.get_xlim()[0], ax.get_ylim()[0]), max(ax.get_xlim()[1], ax.get_ylim()[1])]
        ax.plot(lims, lims, '--', color='gray')
        ax.set_xlabel('Actual')
        ax.set_ylabel('Predicted')
        ax.set_title(f'Predicted vs Actual - {name}')
        save_fig(f"pred_vs_actual_{name}")

    df_reg = pd.DataFrame(rows)
    save_df('regression_summary', df_reg)

print('Metrics & plots done.')


In [None]:
# 5) Feature importance and SHAP (optional)

FEATURE_NAMES = list(X_test.columns) if hasattr(X_test, 'columns') else [f'f{i}' for i in range(X_test.shape[1])]

# Iterate over the models directly. Pairing them with `preds` by position is
# unsafe because a model whose prediction failed has no entry in `preds`, and
# nothing below needs the cached predictions.
for entry in models:
    name, model = entry['name'], entry['model']
    # Feature importances: tree-style `feature_importances_` or |coef_| of linear models.
    imp_fig = None
    try:
        imp = None
        if hasattr(model, 'feature_importances_'):
            imp = np.asarray(model.feature_importances_)
        elif hasattr(model, 'coef_'):
            coef = np.asarray(model.coef_)
            if coef.ndim == 1:
                imp = np.abs(coef)
            else:
                # One coefficient row per class/output: average the magnitudes.
                imp = np.abs(coef).mean(axis=0)
        # Only plot when the importances line up with the test columns.
        if imp is not None and imp.shape[0] == len(FEATURE_NAMES):
            df_imp = pd.DataFrame({'feature': FEATURE_NAMES, 'importance': imp})\
                     .sort_values('importance', ascending=False).head(20)
            imp_fig = plt.figure(figsize=(6,5))
            # Reversed so the most important feature ends up at the top of the chart.
            plt.barh(df_imp['feature'][::-1], df_imp['importance'][::-1])
            plt.title(f'Feature Importance - {name}')
            save_fig(f'feature_importance_{name}')
            save_df(f'feature_importance_{name}', df_imp)
    except Exception as e:
        print(f"Feature importance failed for {name}: {e}")
        if imp_fig is not None:
            plt.close(imp_fig)  # no-op if save_fig already closed it

    # SHAP
    if shap is not None:
        shap_fig = None
        try:
            # Take a small sample for performance
            rng = np.random.RandomState(RANDOM_STATE)
            idx = rng.choice(np.arange(len(X_test)), size=min(1000, len(X_test)), replace=False)
            Xs = X_test.iloc[idx] if hasattr(X_test, 'iloc') else X_test[idx]
            # Rows the SHAP values refer to; the Kernel fallback explains fewer rows.
            X_explained = Xs

            if xgb is not None and isinstance(model, xgb.Booster):
                # Native xgboost boosters take a DMatrix.
                explainer = shap.TreeExplainer(model)
                shap_values = explainer.shap_values(xgb.DMatrix(Xs))
            else:
                # Tree or linear detection is rough; try TreeExplainer, fall back to Kernel.
                try:
                    explainer = shap.TreeExplainer(model)
                    shap_values = explainer.shap_values(Xs)
                except Exception:
                    # KernelExplainer needs a callable mapping inputs to outputs.
                    predict_fn = model.predict_proba if hasattr(model, 'predict_proba') else model.predict
                    X_explained = Xs[:200]
                    explainer = shap.KernelExplainer(predict_fn, Xs[:100])
                    shap_values = explainer.shap_values(X_explained)

            # Normalize per-class outputs before plotting. Depending on the SHAP
            # release, multi-output values may come back as a list of 2D arrays
            # or as one 3D array with the outputs on the last axis (NOTE(review):
            # confirm against the installed SHAP version).
            if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
                n_outputs = shap_values.shape[-1]
                if n_outputs == 2:
                    shap_values = shap_values[..., -1]  # positive class
                else:
                    shap_values = [shap_values[..., k] for k in range(n_outputs)]
            elif isinstance(shap_values, list) and len(shap_values) == 2:
                shap_values = shap_values[-1]  # positive class

            shap_fig = plt.figure(figsize=(6,5))
            shap.summary_plot(shap_values, X_explained, show=False)
            save_fig(f'shap_summary_{name}')
        except Exception as e:
            print(f"SHAP skipped for {name}: {e}")
            if shap_fig is not None:
                plt.close(shap_fig)


In [None]:
# 5b) Compare models from artifacts/metrics.csv (optional)
from pathlib import Path
# Read the training-time leaderboard from the configured artifact directory.
metrics_csv = ARTIFACT_DIR / 'metrics.csv'
if metrics_csv.exists():
    dfm = pd.read_csv(metrics_csv)
    # keep common columns if present
    cols = [c for c in ['model','recall','specificity','accuracy','f1','val_auc'] if c in dfm.columns]
    metric_cols = [c for c in cols if c != 'model']
    if 'model' not in cols:
        print('metrics.csv missing model column; skipping plots')
    elif not metric_cols:
        # Without this guard, indexing the first metric raises IndexError.
        print('metrics.csv has none of the known metric columns; skipping plots')
    else:
        # Rank by the first metric that is present in the file.
        dfm_sorted = dfm.sort_values(by=metric_cols[0], ascending=False)
        save_df('leaderboard_from_training', dfm_sorted)
        # One plot per metric
        for metric in metric_cols:
            plt.figure(figsize=(7,4))
            plt.bar(dfm_sorted['model'], dfm_sorted[metric], color='#2563eb')
            plt.ylabel(metric)
            plt.title(f'Model comparison ({metric}) from training metrics')
            plt.xticks(rotation=30, ha='right')
            save_fig(f'compare_{metric}_from_training')
else:
    print(f'{metrics_csv} not found; skipping training metrics comparison plots')


In [None]:
# 6) Model comparison & index

# Comparison bars (classification: F1 macro; regression: RMSE)
try:
    # Choose the summary table and the metric to chart for the configured task.
    if TASK_TYPE == 'classification':
        summary_file = 'classification_summary.csv'
        keep_cols = ['model','f1_macro','accuracy','roc_auc']
        bar_metric, bar_label, bar_title = 'f1_macro', 'F1 (macro)', 'Model Comparison - F1 macro'
    else:
        summary_file = 'regression_summary.csv'
        keep_cols = ['model','rmse','r2']
        bar_metric, bar_label, bar_title = 'rmse', 'RMSE', 'Model Comparison - RMSE'

    df = pd.read_csv(TABLES_DIR / summary_file)
    df_plot = df[keep_cols].set_index('model')
    plt.figure(figsize=(7,4))
    df_plot[bar_metric].plot(kind='bar', color='#2563eb')
    plt.ylabel(bar_label)
    plt.title(bar_title)
    save_fig('metrics_bar_compare')
except Exception as e:
    print('Comparison plot skipped:', e)

# Generate index markdown: one bullet per saved table and per saved plot.
index_lines = ['# Model Validation Report Index', '', '## Tables']
index_lines.extend(f"- {table_path.name}" for table_path in sorted(TABLES_DIR.glob('*.csv')))
index_lines += ['', '## Plots']
index_lines.extend(f"- {plot_path.name}" for plot_path in sorted(PLOTS_DIR.glob('*.png')))

ensure_dir(OUT_DIR)
readme_path = OUT_DIR / 'README.md'
index_text = '\n'.join(index_lines)
readme_path.write_text(index_text)

print('Index written to', readme_path)
print(index_text)
