In [7]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
import yaml
from sklearn.model_selection import train_test_split


def load_yaml(path: Path) -> dict:
    with open(path, "r") as f:
        return yaml.safe_load(f)


def rebuild_splits(joined_csv: Path, test_size: float, val_size: float, random_state: int):

    df = pd.read_csv(joined_csv, low_memory=False)
    if "loanId" not in df.columns or "isBadDebt" not in df.columns:
        raise ValueError("joined_df.csv must contain loanId and isBadDebt columns.")
    y = df["isBadDebt"].to_numpy(np.int64)
    n = len(y)
    all_idx = np.arange(n)

    train_idx, temp_idx = train_test_split(
        all_idx,
        test_size=test_size + val_size,
        stratify=y,
        random_state=random_state,
    )
    rel_test = test_size / (test_size + val_size)
    val_idx, test_idx = train_test_split(
        temp_idx,
        test_size=rel_test,
        stratify=y[temp_idx],
        random_state=random_state,
    )

    return df["loanId"].astype(str).tolist(), y, train_idx, val_idx, test_idx


def main():
    repo_root = Path(".").resolve()

    cfg = load_yaml(repo_root / "yamls" / "config.yaml")
    prm = load_yaml(repo_root / "yamls" / "params.yaml")

    joined_csv = Path(cfg["data_transformation"]["joined_local"])
    embed_dir = Path(cfg["data_embedding"]["root_dir"])
    model_dir = Path(cfg["model_training"]["root_dir"])
    raw_loan_csv = Path(cfg["data_ingestion"]["local_download_dir"]) / "loan.csv"

    test_size = float(prm["data_preprocessing"]["test_size"])
    val_size = float(prm["data_preprocessing"]["val_size"])
    random_state = int(prm["data_preprocessing"]["random_state"])

    loan_ids_all, y_all, train_idx, val_idx, test_idx = rebuild_splits(
        joined_csv, test_size, val_size, random_state
    )

    X_train = np.load(embed_dir / "X_train.npy")
    y_train = np.load(embed_dir / "y_train.npy").astype(np.int64)

    X_val = np.load(embed_dir / "X_val.npy")
    y_val = np.load(embed_dir / "y_val.npy").astype(np.int64)

    X_test = np.load(embed_dir / "X_test.npy")
    y_test = np.load(embed_dir / "y_test.npy").astype(np.int64)

    assert len(train_idx) == len(X_train) == len(y_train), "Train split length mismatch."
    assert len(val_idx) == len(X_val) == len(y_val), "Val split length mismatch."
    assert len(test_idx) == len(X_test) == len(y_test), "Test split length mismatch."

    booster = xgb.Booster()
    booster.load_model(model_dir / "xgb_model.json")

    with open(model_dir / "metrics.json", "r") as f:
        metrics = json.load(f)
    threshold = float(metrics.get("decision_threshold", 0.5))

    def predict_block(X: np.ndarray) -> np.ndarray:
        dmat = xgb.DMatrix(X)
        return booster.predict(dmat)

    p_train = predict_block(X_train)
    p_val = predict_block(X_val)
    p_test = predict_block(X_test)

    def build_df(split_name: str, idx: np.ndarray, probs: np.ndarray, y_slice: np.ndarray) -> pd.DataFrame:
        loan_ids = [loan_ids_all[i] for i in idx.tolist()]
        y_true = y_slice.astype(int)
        y_pred_cls = (probs >= threshold).astype(int)
        return pd.DataFrame(
            {
                "loanId": loan_ids,
                "split": split_name,
                "isBadDebt": y_true,
                "predicted_prob": probs,
                "predicted_class": y_pred_cls,
            }
        )

    df_train = build_df("train", train_idx, p_train, y_train)
    df_val = build_df("val", val_idx, p_val, y_val)
    df_test = build_df("test", test_idx, p_test, y_test)
    preds_df = pd.concat([df_train, df_val, df_test], ignore_index=True)

    raw_cols = ["loanId", "loanAmount", "originallyScheduledPaymentAmount", "isFunded"]
    loan_raw = pd.read_csv(raw_loan_csv, low_memory=False, usecols=lambda c: c in raw_cols)
    loan_raw["loanId"] = loan_raw["loanId"].astype(str)
    if "isFunded" in loan_raw.columns:
        loan_raw["isFunded"] = loan_raw["isFunded"].astype(int)
    loan_raw = loan_raw.drop_duplicates(subset=["loanId"], keep="last")

    preds_df = preds_df.merge(loan_raw, on="loanId", how="left")

    cols = [
        "loanId",
        "split",
        "isBadDebt",
        "predicted_prob",
        "predicted_class",
        "loanAmount",
        "originallyScheduledPaymentAmount",
        "isFunded",
    ]
    preds_df = preds_df[cols]

    out_path = repo_root / 'artifacts' / 'analysis' / "predictions.csv"
    preds_df.to_csv(out_path, index=False)
    print(f"Wrote {len(preds_df):,} rows")

    print(
        "Summary:",
        preds_df.groupby("split")["predicted_class"].value_counts(normalize=True).unstack(fill_value=0.0),
        sep="\n",
    )


if __name__ == "__main__":
    main()


Wrote 114,588 rows
Summary:
predicted_class         0         1
split                              
test             0.098261  0.901739
train            0.097867  0.902133
val              0.099139  0.900861


In [11]:
#!/usr/bin/env python3
from pathlib import Path
import json
import shutil
import pandas as pd
import numpy as np

def _pct(numer, denom):
    numer = float(numer)
    denom = float(denom)
    return (numer / denom * 100.0) if denom > 0 else 0.0

def main():
    repo = Path(".").resolve()
    analysis_dir = repo / "artifacts" / "analysis"
    model_dir = repo / "artifacts" / "model_training"
    analysis_dir.mkdir(parents=True, exist_ok=True)

    preds_path = analysis_dir / "predictions.csv"
    metrics_path = model_dir / "metrics.json"
    cm_path = model_dir / "confusion_matrix.json"

    if not preds_path.exists():
        raise FileNotFoundError(str(preds_path))
    if not metrics_path.exists():
        raise FileNotFoundError(str(metrics_path))
    if not cm_path.exists():
        raise FileNotFoundError(str(cm_path))

    df = pd.read_csv(preds_path)

    for col in ["isBadDebt", "predicted_class", "isFunded"]:
        df[col] = pd.to_numeric(df.get(col, 0), errors="coerce").fillna(0).astype(int)
    for col in ["loanAmount", "originallyScheduledPaymentAmount", "predicted_prob"]:
        df[col] = pd.to_numeric(df.get(col, 0.0), errors="coerce").fillna(0.0).astype(float)

    total_principal_all_loans = float(df["loanAmount"].sum())
    funded_df = df[df["isFunded"] == 1].copy()
    total_principal_funded_baseline = float(funded_df["loanAmount"].sum())
    pct_funded_value_baseline = _pct(total_principal_funded_baseline, total_principal_all_loans)

    funded_bad_df = funded_df[funded_df["isBadDebt"] == 1]
    funded_good_df = funded_df[funded_df["isBadDebt"] == 0]

    total_principal_funded_bad_baseline = float(funded_bad_df["loanAmount"].sum())
    total_principal_funded_good_baseline = float(funded_good_df["loanAmount"].sum())

    pct_bad_debt_share_of_funded_value_baseline = _pct(
        total_principal_funded_bad_baseline, total_principal_funded_baseline
    )
    pct_good_debt_share_of_funded_value_baseline = _pct(
        total_principal_funded_good_baseline, total_principal_funded_baseline
    )

    funded_bad_tp_df = funded_bad_df[funded_bad_df["predicted_class"] == 1]
    total_principal_funded_bad_correctly_flagged = float(funded_bad_tp_df["loanAmount"].sum())
    pct_of_funded_bad_value_correctly_flagged = _pct(
        total_principal_funded_bad_correctly_flagged, total_principal_funded_bad_baseline
    )

    total_principal_funded_bad_after_model = (
        total_principal_funded_bad_baseline - total_principal_funded_bad_correctly_flagged
    )
    pct_bad_debt_share_of_funded_value_after_model = _pct(
        total_principal_funded_bad_after_model, total_principal_funded_baseline
    )
    pct_point_change_bad_debt_share_of_funded_value = (
        pct_bad_debt_share_of_funded_value_after_model - pct_bad_debt_share_of_funded_value_baseline
    )

    funded_good_fp_df = funded_good_df[funded_good_df["predicted_class"] == 1]
    total_principal_funded_good_incorrectly_flagged = float(funded_good_fp_df["loanAmount"].sum())
    pct_of_funded_good_value_incorrectly_flagged = _pct(
        total_principal_funded_good_incorrectly_flagged, total_principal_funded_baseline
    )

    not_funded_df = df[df["isFunded"] == 0]
    not_funded_pred_good_df = not_funded_df[not_funded_df["predicted_class"] == 0]
    total_principal_not_funded_predicted_good = float(
        not_funded_pred_good_df["loanAmount"].sum()
    )
    pct_of_total_value_not_funded_but_predicted_good = _pct(
        total_principal_not_funded_predicted_good, total_principal_all_loans
    )

    total_principal_funded_after_model = (
        total_principal_funded_baseline
        - total_principal_funded_good_incorrectly_flagged
        + total_principal_not_funded_predicted_good
    )
    pct_funded_value_after_model = _pct(
        total_principal_funded_after_model, total_principal_all_loans
    )
    pct_point_change_funded_value = (
        pct_funded_value_after_model - pct_funded_value_baseline
    )

    total_revenue_from_funded_good_baseline = float(
        funded_good_df["originallyScheduledPaymentAmount"].sum()
    )
    total_profit_baseline_amount = (
        total_revenue_from_funded_good_baseline - total_principal_funded_baseline
    )
    pct_profit_margin_baseline = _pct(
        total_profit_baseline_amount, total_principal_funded_baseline
    )

    total_principal_retained_after_filter = float(
        funded_df[funded_df["predicted_class"] == 0]["loanAmount"].sum()
    )
    total_profit_after_model_amount = (
        total_revenue_from_funded_good_baseline - total_principal_retained_after_filter
    )
    pct_profit_margin_after_model = _pct(
        total_profit_after_model_amount, total_principal_funded_baseline
    )
    pct_point_uplift_in_profit_margin = (
        pct_profit_margin_after_model - pct_profit_margin_baseline
    )

    with open(metrics_path, "r") as f:
        model_metrics = json.load(f)
    with open(cm_path, "r") as f:
        confusion = json.load(f)

    impact = {
        "pct_funded_value_baseline": pct_funded_value_baseline,
        "pct_funded_value_after_model": pct_funded_value_after_model,
        "pct_point_change_funded_value": pct_point_change_funded_value,
        "pct_bad_debt_share_of_funded_value_baseline": pct_bad_debt_share_of_funded_value_baseline,
        "pct_bad_debt_share_of_funded_value_after_model": pct_bad_debt_share_of_funded_value_after_model,
        "pct_point_change_bad_debt_share_of_funded_value": pct_point_change_bad_debt_share_of_funded_value,
        "pct_good_debt_share_of_funded_value_baseline": pct_good_debt_share_of_funded_value_baseline,
        "pct_of_funded_bad_value_correctly_flagged": pct_of_funded_bad_value_correctly_flagged,
        "pct_of_funded_good_value_incorrectly_flagged": pct_of_funded_good_value_incorrectly_flagged,
        "pct_of_total_value_not_funded_but_predicted_good": pct_of_total_value_not_funded_but_predicted_good,
        "pct_profit_margin_baseline": pct_profit_margin_baseline,
        "pct_profit_margin_after_model": pct_profit_margin_after_model,
        "pct_point_uplift_in_profit_margin": pct_point_uplift_in_profit_margin,
        "total_principal_all_loans": total_principal_all_loans,
        "total_principal_funded_baseline": total_principal_funded_baseline,
        "total_principal_funded_after_model": total_principal_funded_after_model,
        "total_principal_funded_bad_baseline": total_principal_funded_bad_baseline,
        "total_principal_funded_bad_correctly_flagged": total_principal_funded_bad_correctly_flagged,
        "total_principal_funded_good_incorrectly_flagged": total_principal_funded_good_incorrectly_flagged,
        "total_principal_not_funded_predicted_good": total_principal_not_funded_predicted_good,
        "total_revenue_from_funded_good_baseline": total_revenue_from_funded_good_baseline,
        "total_profit_baseline_amount": total_profit_baseline_amount,
        "total_profit_after_model_amount": total_profit_after_model_amount,
    }

    out_json = analysis_dir / "impact_metrics.json"
    with open(out_json, "w") as f:
        json.dump(
            {
                "impact": impact,
                "model_metrics": model_metrics,
                "confusion_matrix": confusion,
            },
            f,
            indent=2,
        )

    rows = [
        ("pct_funded_value_baseline", impact["pct_funded_value_baseline"], "Percent of total loan value funded (baseline)"),
        ("pct_funded_value_after_model", impact["pct_funded_value_after_model"], "Percent of total loan value funded (after model)"),
        ("pct_point_change_funded_value", impact["pct_point_change_funded_value"], "Percentage point change in funded value"),
        ("pct_bad_debt_share_of_funded_value_baseline", impact["pct_bad_debt_share_of_funded_value_baseline"], "Percent of funded value that is bad debt (baseline)"),
        ("pct_bad_debt_share_of_funded_value_after_model", impact["pct_bad_debt_share_of_funded_value_after_model"], "Percent of funded value that is bad debt (after model)"),
        ("pct_point_change_bad_debt_share_of_funded_value", impact["pct_point_change_bad_debt_share_of_funded_value"], "Percentage point change in bad-debt share of funded value"),
        ("pct_good_debt_share_of_funded_value_baseline", impact["pct_good_debt_share_of_funded_value_baseline"], "Percent of funded value that is good debt (baseline)"),
        ("pct_of_funded_bad_value_correctly_flagged", impact["pct_of_funded_bad_value_correctly_flagged"], "Percent of funded bad value correctly flagged by model"),
        ("pct_of_funded_good_value_incorrectly_flagged", impact["pct_of_funded_good_value_incorrectly_flagged"], "Percent of funded good value incorrectly flagged by model"),
        ("pct_of_total_value_not_funded_but_predicted_good", impact["pct_of_total_value_not_funded_but_predicted_good"], "Percent of total loan value not funded but predicted good"),
        ("pct_profit_margin_baseline", impact["pct_profit_margin_baseline"], "Profit margin (baseline)"),
        ("pct_profit_margin_after_model", impact["pct_profit_margin_after_model"], "Profit margin (after model)"),
        ("pct_point_uplift_in_profit_margin", impact["pct_point_uplift_in_profit_margin"], "Percentage point uplift in profit margin"),
    ]
    pd.DataFrame(rows, columns=["Metric", "Value", "Description"]).to_csv(analysis_dir / "impact_table.csv", index=False)

    shutil.copy2(metrics_path, analysis_dir / "metrics.json")
    shutil.copy2(cm_path, analysis_dir / "confusion_matrix.json")


    print("=== Impact Summary ===")
    print(f"Funded value baseline: {impact['pct_funded_value_baseline']:.2f}% of total")
    print(f"Funded value after model: {impact['pct_funded_value_after_model']:.2f}% ({impact['pct_point_change_funded_value']:+.2f} pp)")
    print(f"Bad-debt share of funded (baseline): {impact['pct_bad_debt_share_of_funded_value_baseline']:.2f}%")
    print(f"Bad-debt share of funded (after): {impact['pct_bad_debt_share_of_funded_value_after_model']:.2f}% ({impact['pct_point_change_bad_debt_share_of_funded_value']:+.2f} pp)")
    print(f"Bad funded correctly flagged: {impact['pct_of_funded_bad_value_correctly_flagged']:.2f}% of funded bad value")
    print(f"Good funded incorrectly flagged: {impact['pct_of_funded_good_value_incorrectly_flagged']:.2f}% of funded value")
    print(f"Not funded but predicted good: {impact['pct_of_total_value_not_funded_but_predicted_good']:.2f}% of total value")
    print(f"Profit margin baseline: {impact['pct_profit_margin_baseline']:.2f}%")
    print(f"Profit margin after model: {impact['pct_profit_margin_after_model']:.2f}% (uplift {impact['pct_point_uplift_in_profit_margin']:+.2f} pp)")
    print(f"Wrote {out_json}")
    print(f"Wrote {analysis_dir / 'impact_table.csv'}")
    print(f"Copied {analysis_dir / 'metrics.json'} and {analysis_dir / 'confusion_matrix.json'}")

if __name__ == "__main__":
    main()


=== Impact Summary ===
Funded value baseline: 30.11% of total
Funded value after model: 25.86% (-4.25 pp)
Bad-debt share of funded (baseline): 58.31%
Bad-debt share of funded (after): 12.68% (-45.62 pp)
Bad funded correctly flagged: 78.25% of funded bad value
Good funded incorrectly flagged: 14.11% of funded value
Not funded but predicted good: 0.00% of total value
Profit margin baseline: 10.19%
Profit margin after model: 69.93% (uplift +59.74 pp)
Wrote /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/impact_metrics.json
Wrote /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/impact_table.csv
Copied /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/metrics.json and /Users/johnteng/Library/CloudStorage/Google

In [13]:
#!/usr/bin/env python3
import json
from pathlib import Path

import numpy as np

try:
    import seaborn as sns
    _HAS_SNS = True
except Exception:
    _HAS_SNS = False

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt


def _ensure_dir(path: Path):
    path.mkdir(parents=True, exist_ok=True)


def _load_json(path: Path):
    with open(path, "r") as f:
        return json.load(f)


def _cm_heatmap(cm: np.ndarray, out_path: Path):
    plt.figure(figsize=(7, 6), dpi=150)
    if _HAS_SNS:
        sns.set_theme(style="white")
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            cbar=False,
            xticklabels=["Pred Good", "Pred Bad"],
            yticklabels=["Actual Good", "Actual Bad"],
            linewidths=0.5,
            linecolor="white",
            annot_kws={"fontsize": 11, "fontweight": "bold"},
        )
    else:
        ax = plt.gca()
        im = ax.imshow(cm, cmap="Blues")
        ax.set_xticks([0, 1])
        ax.set_xticklabels(["Pred Good", "Pred Bad"])
        ax.set_yticks([0, 1])
        ax.set_yticklabels(["Actual Good", "Actual Bad"])
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, f"{cm[i, j]:d}", ha="center", va="center", color="black", fontsize=11)
        plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.title("Confusion Matrix", fontsize=14, fontweight="bold")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def _label_containers(ax, fmt="%.2f"):
    for container in ax.containers:
        try:
            ax.bar_label(container, fmt=fmt, padding=3, fontsize=10)
        except Exception:
            pass


def _bar_two_series(title, labels, values, colors, ylabel, out_path: Path):
    plt.figure(figsize=(7.5, 5.5), dpi=150)
    if _HAS_SNS:
        sns.set_theme(style="whitegrid")
        ax = sns.barplot(x=labels, y=values, palette=colors)
    else:
        ax = plt.gca()
        ax.bar(labels, values, color=colors)
    ax.set_title(title, fontsize=14, fontweight="bold")
    ax.set_ylabel(ylabel)
    _label_containers(ax, fmt="%.2f%%")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def _bar_changes(title, metric_labels, pp_changes, colors, out_path: Path):
    plt.figure(figsize=(8.5, 5.5), dpi=150)
    if _HAS_SNS:
        sns.set_theme(style="whitegrid")
        ax = sns.barplot(x=metric_labels, y=pp_changes, palette=colors)
    else:
        ax = plt.gca()
        ax.bar(metric_labels, pp_changes, color=colors)
    ax.set_title(title, fontsize=14, fontweight="bold")
    ax.set_ylabel("Change (percentage points)")
    plt.axhline(0, color="#666", linewidth=1)
    _label_containers(ax, fmt="%.2f pp")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def _two_pies(title_left, title_right, shares_left, shares_right, labels, colors, out_path: Path):
    fig, axs = plt.subplots(1, 2, figsize=(10, 5.5), dpi=150)
    axs[0].pie(
        shares_left, labels=labels, autopct="%1.2f%%", colors=colors, startangle=90, textprops={"fontsize": 10}
    )
    axs[0].set_title(title_left, fontsize=12, fontweight="bold")
    axs[1].pie(
        shares_right, labels=labels, autopct="%1.2f%%", colors=colors, startangle=90, textprops={"fontsize": 10}
    )
    axs[1].set_title(title_right, fontsize=12, fontweight="bold")
    plt.suptitle("Composition of Funded Value", fontsize=14, fontweight="bold")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def main():
    repo = Path(".").resolve()
    analysis_dir = repo / "artifacts" / "analysis"
    _ensure_dir(analysis_dir)

    impact_json = analysis_dir / "impact_metrics.json"
    cm_json = analysis_dir / "confusion_matrix.json"
    if not impact_json.exists():
        raise FileNotFoundError(str(impact_json))
    if not cm_json.exists():
        raise FileNotFoundError(str(cm_json))

    impact_all = _load_json(impact_json)
    impact = impact_all.get("impact", {})
    cm_info = _load_json(cm_json)
    cm = np.array(cm_info["matrix"], dtype=int)

    cm_path = analysis_dir / "cm_heatmap.png"
    _cm_heatmap(cm, cm_path)

    baseline_profit = float(impact.get("pct_profit_margin_baseline", 0.0))
    after_profit = float(impact.get("pct_profit_margin_after_model", 0.0))
    uplift_pp = float(impact.get("pct_point_uplift_in_profit_margin", 0.0))
    profit_path = analysis_dir / "profit_impact_bar.png"
    _bar_two_series(
        title="Profitability Impact",
        labels=["Baseline", "After Model"],
        values=[baseline_profit, after_profit],
        colors=["#7cb342", "#1e88e5"],
        ylabel="Profit Margin (%)",
        out_path=profit_path,
    )

    funded_baseline = float(impact.get("pct_funded_value_baseline", 0.0))
    funded_after = float(impact.get("pct_funded_value_after_model", 0.0))
    funded_change_pp = float(impact.get("pct_point_change_funded_value", 0.0))
    funded_path = analysis_dir / "funded_value_change_bar.png"
    _bar_two_series(
        title="Funded Value (as % of Total)",
        labels=["Baseline", "After Model"],
        values=[funded_baseline, funded_after],
        colors=["#6d4c41", "#26a69a"],
        ylabel="Percent of Total Value (%)",
        out_path=funded_path,
    )

    bad_share_baseline = float(impact.get("pct_bad_debt_share_of_funded_value_baseline", 0.0))
    bad_share_after = float(impact.get("pct_bad_debt_share_of_funded_value_after_model", 0.0))
    bad_change_pp = float(impact.get("pct_point_change_bad_debt_share_of_funded_value", 0.0))
    good_share_baseline = float(impact.get("pct_good_debt_share_of_funded_value_baseline", max(0.0, 100.0 - bad_share_baseline)))
    good_share_after = max(0.0, 100.0 - bad_share_after)
    pies_path = analysis_dir / "bad_good_share_pies.png"
    _two_pies(
        title_left="Baseline",
        title_right="After Model",
        shares_left=[bad_share_baseline, good_share_baseline],
        shares_right=[bad_share_after, good_share_after],
        labels=["Bad Debt", "Good Debt"],
        colors=["#e53935", "#43a047"],
        out_path=pies_path,
    )

    changes_path = analysis_dir / "impact_changes_bar.png"
    _bar_changes(
        title="Key Impact Changes (percentage points)",
        metric_labels=["Funded Value", "Bad-Share of Funded", "Profit Margin"],
        pp_changes=[funded_change_pp, bad_change_pp, uplift_pp],
        colors=["#8e24aa", "#fb8c00", "#3949ab"],
        out_path=changes_path,
    )

    print("Charts generated:")
    print(f"- {cm_path}")
    print(f"- {profit_path}")
    print(f"- {funded_path}")
    print(f"- {pies_path}")
    print(f"- {changes_path}")


if __name__ == "__main__":
    main()


Charts generated:
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/cm_heatmap.png
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/profit_impact_bar.png
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/funded_value_change_bar.png
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/bad_good_share_pies.png
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/impact_changes_bar.png


In [4]:
#!/usr/bin/env python3
from pathlib import Path
import yaml

try:
    from graphviz import Source
    _HAS_GRAPHVIZ = True
except Exception:
    _HAS_GRAPHVIZ = False


def load_yaml(p: Path):
    with open(p, "r") as f:
        return yaml.safe_load(f)


def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)


def write_text(p: Path, s: str):
    p.write_text(s, encoding="utf-8")


def build_architecture_dot(cfg: dict) -> str:
    bucket = cfg["gcs_artifact_storage"]["bucket_name"]
    base_prefix = cfg["gcs_artifact_storage"]["base_prefix"]
    return f"""
digraph ARCH {{
  rankdir=LR;
  graph [fontname="Helvetica", fontsize=10];
  node  [fontname="Helvetica", fontsize=10, shape=box, style=rounded];
  edge  [fontname="Helvetica", fontsize=9];

  subgraph cluster_orch {{
    label="Orchestration (Composer / Airflow)";
    color="#90CAF9";
    DAG [label="DAG: moneylion_training_pipeline"];
    RUN [label="run_full_pipeline\\n(KubernetesPodOperator: python main.py)"];
    ROLL [label="rollout_cloud_run\\n(gcloud run services update)"];
    DAG -> RUN -> ROLL;
  }}

  subgraph cluster_raw {{
    label="Raw Data (GCS)";
    color="#A5D6A7";
    RAW [label="gs://{bucket}/raw/\\nloan.csv, payment.csv, clarity_*.csv", shape=folder];
  }}

  subgraph cluster_train {{
    label="Training Pod (single run of main.py)";
    color="#FFE082";
    S1 [label="1. Data Ingestion\\n→ artifacts/data_ingestion/raw"];
    S2 [label="2. Transformation\\n→ joined_df.csv + stats/dummies"];
    S3 [label="3. Preprocessing\\n→ splits npy + vocabs"];
    S4 [label="4. Embedding & Export\\n→ embed_matrices + X_*.npy"];
    S5 [label="5. Model Training (XGB)\\n→ xgb_model.json + metrics + CM"];
    ANA [label="Analysis Scripts\\n→ predictions.csv + impact_metrics.json + charts/*.png"];
    S1 -> S2 -> S3 -> S4 -> S5 -> ANA;
  }}

  subgraph cluster_art {{
    label="Artifacts (GCS)";
    color="#CE93D8";
    ART [label="gs://{bucket}/{base_prefix}/**", shape=folder];
  }}

  subgraph cluster_srv {{
    label="Serving (Cloud Run)";
    color="#EF9A9A";
    CR [label="Service: loan-default-api"];
    INIT [label="Startup: ensure_artifacts()\\nload model + schema"];
    API [label="API: /health, /predict"];
    CR -> INIT -> API;
  }}

  RAW -> S1 [label="download raw"];
  S1 -> ART [label="upload ingestion"];
  S2 -> ART [label="upload transformation"];
  S3 -> ART [label="upload preprocessing"];
  S4 -> ART [label="upload embedding"];
  S5 -> ART [label="upload model"];
  ANA -> ART [label="upload analysis", style=dashed, color="#777"];

  ROLL -> CR [label="update env → new revision"];
  ART -> INIT [label="download on startup"];
}}
""".strip()


def build_dataflow_dot(cfg: dict) -> str:
    bucket = cfg["gcs_artifact_storage"]["bucket_name"]
    base_prefix = cfg["gcs_artifact_storage"]["base_prefix"]
    return f"""
digraph FLOW {{
  rankdir=TB;
  node [shape=box, style=rounded, fontname="Helvetica", fontsize=10];
  edge [fontname="Helvetica", fontsize=9];

  RAW_L [label="gs://{bucket}/raw/loan.csv", shape=folder];
  RAW_P [label="gs://{bucket}/raw/payment.csv", shape=folder];
  RAW_C [label="gs://{bucket}/raw/clarity_underwriting_variables.csv", shape=folder];

  ING [label="artifacts/data_ingestion/raw/*.csv"];
  TRN [label="artifacts/data_transformation/\\njoined_df.csv + stats/dummies"];
  PRE [label="artifacts/data_preprocessing/\\ntrain|val|test {{num,cat,y}}.npy + vocabs.json"];
  EMB [label="artifacts/data_embedding/\\nembed_matrices/*.npy + X_*.npy + embed_schema.json"];
  MOD [label="artifacts/model_training/\\nxgb_model.json + metrics.json + confusion_matrix.json"];
  ANA [label="artifacts/analysis/\\npredictions.csv + impact_metrics.json + charts/*.png"];

  RAW_L -> ING; RAW_P -> ING; RAW_C -> ING;
  ING -> TRN -> PRE -> EMB -> MOD -> ANA;

  GCS [label="GCS mirror\\n(gs://{bucket}/{base_prefix}/**)", shape=folder];
  ING -> GCS; TRN -> GCS; PRE -> GCS; EMB -> GCS; MOD -> GCS; ANA -> GCS;

  SRV [label="Cloud Run (loan-default-api)"];
  SRV -> MOD [label="startup download", dir=back, style=dashed, color="#777"];
}}
""".strip()


def build_mermaid_architecture(cfg: dict) -> str:
    bucket = cfg["gcs_artifact_storage"]["bucket_name"]
    base_prefix = cfg["gcs_artifact_storage"]["base_prefix"]
    return f"""
flowchart LR
  subgraph Orchestration[Composer / Airflow]
    DAG([DAG: moneylion_training_pipeline])
    RUN([run_full_pipeline: python main.py])
    ROLL([rollout_cloud_run: gcloud run update])
    DAG --> RUN --> ROLL
  end

  subgraph Raw[GCS raw]
    RAW[(gs://{bucket}/raw/*)]
  end

  subgraph Training[Training Pod]
    S1([1. Ingestion → artifacts/data_ingestion/raw])
    S2([2. Transformation → joined_df.csv + stats/dummies])
    S3([3. Preprocessing → splits npy + vocabs])
    S4([4. Embedding → embed_matrices + X_*.npy])
    S5([5. Model Training (XGB) → xgb_model.json + metrics + CM])
    ANA([Analysis → predictions.csv + impact_metrics.json + charts])
    S1 --> S2 --> S3 --> S4 --> S5 --> ANA
  end

  subgraph Artifacts[GCS artifacts]
    ART[/gs://{bucket}/{base_prefix}/**/]
  end

  subgraph Serving[Cloud Run]
    CR[(loan-default-api)]
    INIT([ensure_artifacts() at startup])
    API[/GET /health, POST /predict/]
    CR --> INIT --> API
  end

  RAW --> S1
  RUN -. triggers .-> S1
  S1 --> ART
  S2 --> ART
  S3 --> ART
  S4 --> ART
  S5 --> ART
  ANA --> ART
  ROLL --> CR
  ART --> INIT
""".strip()


def build_mermaid_dataflow(cfg: dict) -> str:
    bucket = cfg["gcs_artifact_storage"]["bucket_name"]
    base_prefix = cfg["gcs_artifact_storage"]["base_prefix"]
    return f"""
flowchart TB
  RAW_L[(gs://{bucket}/raw/loan.csv)]
  RAW_P[(gs://{bucket}/raw/payment.csv)]
  RAW_C[(gs://{bucket}/raw/clarity_underwriting_variables.csv)]

  ING[artifacts/data_ingestion/raw/*.csv]
  TRN[artifacts/data_transformation/joined_df.csv + stats/dummies]
  PRE[artifacts/data_preprocessing/ splits + vocabs]
  EMB[artifacts/data_embedding/ embed_matrices + X_*.npy]
  MOD[artifacts/model_training/ xgb_model.json + metrics + CM]
  ANA[artifacts/analysis/ predictions.csv + impact_metrics.json + charts]

  RAW_L --> ING
  RAW_P --> ING
  RAW_C --> ING
  ING --> TRN --> PRE --> EMB --> MOD --> ANA

  ART[/gs://{bucket}/{base_prefix}/**/]
  ING --> ART
  TRN --> ART
  PRE --> ART
  EMB --> ART
  MOD --> ART
  ANA --> ART
""".strip()


def render_graphviz(dot_src: str, out_png: Path, out_dot: Path):
    write_text(out_dot, dot_src)
    if _HAS_GRAPHVIZ:
        Source(dot_src, filename=str(out_png.with_suffix("")), format="png").render(cleanup=True)


def main():
    repo = Path(".").resolve()
    cfg = load_yaml(repo / "yamls" / "config.yaml")

    out_dir = repo / "artifacts" / "analysis" / "diagrams"
    ensure_dir(out_dir)

    arch_dot = build_architecture_dot(cfg)
    flow_dot = build_dataflow_dot(cfg)
    render_graphviz(arch_dot, out_dir / "architecture.png", out_dir / "architecture.dot")
    render_graphviz(flow_dot, out_dir / "data_flow.png", out_dir / "data_flow.dot")

    arch_mmd = build_mermaid_architecture(cfg)
    flow_mmd = build_mermaid_dataflow(cfg)
    write_text(out_dir / "architecture.mmd", arch_mmd)
    write_text(out_dir / "data_flow.mmd", flow_mmd)

    print("Diagrams written:")
    print(f"- {out_dir / 'architecture.dot'}")
    print(f"- {out_dir / 'data_flow.dot'}")
    if _HAS_GRAPHVIZ:
        print(f"- {out_dir / 'architecture.png'}")
        print(f"- {out_dir / 'data_flow.png'}")
    else:
        print("Graphviz not available: PNGs not rendered; use the .dot or .mmd files in draw.io.")
    print(f"- {out_dir / 'architecture.mmd'}")
    print(f"- {out_dir / 'data_flow.mmd'}")


if __name__ == "__main__":
    main()


Diagrams written:
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/diagrams/architecture.dot
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/diagrams/data_flow.dot
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/diagrams/architecture.png
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/diagrams/data_flow.png
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/artifacts/analysis/diagrams/architecture.mmd
- /Users/johnteng/Library/CloudStorage/GoogleDrive-ystengjohn@gmail.com/My Drive/DOC_ARCHIVE_2025-6/github/moneyLion-202511010/a