# Notebook cell In [2]:
# ======================================================================
#  STAGE 22 · Portfolio vs Market Comparative Resilience Dashboard
# ======================================================================

import os, logging, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import yaml
from scipy.stats import ttest_ind

# Notebook-wide defaults: seaborn grid style, figure resolution, and
# suppression of FutureWarning messages.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.dpi": 110})
warnings.simplefilter("ignore", category=FutureWarning)
import IPython

# ── CONFIG ───────────────────────────────────────────────────────────

# Tickers of the portfolio under review. Firms whose identifier is not in this
# collection are labelled "Market" further down.
PORTFOLIO = [
    "RIO", "ANZ", "BXB", "BHP", "WPL", "BAB", "CBA", "AMP", "PTM", "ALL", "COH",
    "WOW", "MQG", "JBH", "SEK", "MOC", "GTP", "BBG", "CAB", "QGC", "AQUA", "CDS",
    "VGH", "IRE", "WOR"
]
# Normalise to an upper-cased, whitespace-free set for fast membership tests.
PORTFOLIO = {ticker.upper().strip() for ticker in PORTFOLIO}

# Run root: OUTPUT_DIR takes precedence, then OUTPUT_RUN_DIR, then the
# current working directory.
OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", os.getenv("OUTPUT_RUN_DIR", "."))).resolve()
STAGE_DIR = OUTPUT_DIR / "stage22"
# parents=True so a run directory that has not been created yet does not make
# the stage fail before any work is done.
STAGE_DIR.mkdir(parents=True, exist_ok=True)

def latest_file(path: Path, pattern: str) -> Path:
    """Return the most recently modified file under *path* matching *pattern*.

    The search is recursive (``Path.rglob``). Only regular files are
    considered, so a directory whose name happens to match the pattern is
    never returned.

    Args:
        path: Directory to search.
        pattern: Glob pattern, relative to any directory below *path*.

    Returns:
        The matching file with the largest modification time. Ties on the
        modification time are broken by the path string so the result does
        not depend on directory traversal order.

    Raises:
        FileNotFoundError: If no file matches.
    """
    newest = max(
        (p for p in path.rglob(pattern) if p.is_file()),
        key=lambda p: (p.stat().st_mtime, str(p)),
        default=None,
    )
    if newest is None:
        raise FileNotFoundError(f"No file found matching {pattern} under {path}")
    return newest

# Resolve the newest Stage 3 and Stage 11 artefacts below OUTPUT_DIR (Stage 3
# is looked up first, so its absence is reported first).
# NOTE(review): in this cell the Stage 11 table is only loaded and has its id
# column normalised; confirm it is really required, because a missing Stage 11
# file aborts the whole stage here.
STAGE3_CSV, STAGE11_CSV = (
    latest_file(OUTPUT_DIR, pattern)
    for pattern in (
        "stage03/Stage3_Data_WithRatios*.csv",
        "stage11/Stage11_RISE_Probabilities_All*.csv",
    )
)
# Ratio -> domain/stage mapping file; overridable through DOMAIN_STAGE_MAP.
DOMAIN_MAP_PATH = Path(os.environ.get("DOMAIN_STAGE_MAP", "ratio_domain_stage_map.yaml"))

# ── LOAD & MASK ─────────────────────────────────────────────────────

def load_data(csv):
    """Read *csv* into a DataFrame whose column names are lower-cased and stripped.

    Args:
        csv: Path (or anything ``pandas.read_csv`` accepts) of the CSV file.

    Returns:
        The loaded DataFrame with normalised column names.
    """
    frame = pd.read_csv(csv, low_memory=False)
    frame.columns = [name.lower().strip() for name in frame.columns]
    return frame

stage3 = load_data(STAGE3_CSV)
stage11 = load_data(STAGE11_CSV)

# Discover the firm-identifier column and the first column whose name contains
# "date" in the Stage 3 table (column names are already lower-cased).
id_col = next((c for c in stage3.columns if c in ("symbol", "ticker", "id")), None)
date_col = next((c for c in stage3.columns if "date" in c), None)

# Fail with a readable message instead of the opaque `KeyError: None` that
# indexing with a missing column name would produce.
if id_col is None:
    raise KeyError(
        f"No identifier column (symbol/ticker/id) found in {STAGE3_CSV}"
    )
if id_col not in stage11.columns:
    raise KeyError(
        f"Identifier column '{id_col}' from Stage 3 is missing in {STAGE11_CSV}"
    )

# Normalise identifiers the same way as PORTFOLIO (upper-case, stripped) so the
# membership test below matches.
stage3[id_col] = stage3[id_col].astype(str).str.upper().str.strip()
stage11[id_col] = stage11[id_col].astype(str).str.upper().str.strip()

if "year" not in stage3.columns:
    if date_col is None:
        raise KeyError(
            f"Neither a 'year' column nor a date column found in {STAGE3_CSV}"
        )
    # Unparseable dates become NaT, hence a missing year for that row.
    stage3["year"] = pd.to_datetime(stage3[date_col], errors="coerce").dt.year

# Portfolio mask
stage3["group"] = np.where(stage3[id_col].isin(PORTFOLIO), "Portfolio", "Market")
# Every column named score_* is treated as a metric by the sections below.
all_score_cols = [c for c in stage3.columns if c.startswith("score_")]

# ── LOAD DOMAIN/STAGE MAPPING ───────────────────────────────────────

# Explicit encoding so the mapping is read the same way on every platform.
with open(DOMAIN_MAP_PATH, "r", encoding="utf-8") as f:
    # An empty YAML document parses to None; treat that as "no mappings".
    domain_map = yaml.safe_load(f) or {}

# score column -> (domain, stage); "Unknown" where the mapping has no entry.
domain_stage = {}
for col in all_score_cols:
    # Strip only the leading "score_" prefix; str.replace would also remove
    # any later occurrence inside the ratio name.
    entry = domain_map.get(col[len("score_"):])
    if entry is None:
        # Fall back to the key the previous replace-all lookup produced.
        entry = domain_map.get(col.replace("score_", ""))
    if not isinstance(entry, dict):
        # Missing or malformed (non-mapping) entry: use the defaults.
        entry = {}
    domain_stage[col] = (entry.get("domain", "Unknown"), entry.get("stage", "Unknown"))

# ── 1. SUMMARY STATS – Per Group, Per Year, Per Domain/Stage ───────

# Fixed schema so the table (and its CSV) keeps its columns even when no
# metric has any data.
summary_columns = [
    "metric", "domain", "stage", "year", "group",
    "mean", "median", "std", "min", "max", "n",
]

# Split the data once per group and year instead of re-filtering for every
# metric; groupby skips rows whose year is missing.
frames_by_group = {
    group: list(stage3[stage3["group"] == group].groupby("year"))
    for group in ("Portfolio", "Market")
}

summary_rows = []
for c in all_score_cols:
    for group, yearly_frames in frames_by_group.items():
        for year, df_ in yearly_frames:
            vals = df_[c].dropna()
            if vals.empty:
                continue
            summary_rows.append({
                "metric": c,
                "domain": domain_stage[c][0],
                "stage": domain_stage[c][1],
                "year": year,
                "group": group,
                "mean": vals.mean(),
                "median": vals.median(),
                "std": vals.std(),
                "min": vals.min(),
                "max": vals.max(),
                "n": len(vals),
            })

summary_df = pd.DataFrame(summary_rows, columns=summary_columns)
summary_csv = STAGE_DIR / "Stage22_SummaryStats.csv"
summary_df.to_csv(summary_csv, index=False)
IPython.display.display(summary_df.head(10))
print(f"✓ Group/year/domain summary stats table saved: {summary_csv.name}")

# ── 2. VISUALS: TIME SERIES AND BOXES ──────────────────────────────

# One PNG per metric: yearly mean of the Portfolio group and of the Market
# group drawn on the same axes.
for c in all_score_cols:
    dom, stg = domain_stage[c]
    metric_rows = summary_df[summary_df["metric"] == c]

    fig, ax = plt.subplots(figsize=(7, 3))
    for group in ("Portfolio", "Market"):
        grp = metric_rows[metric_rows["group"] == group]
        if not grp.empty:
            ax.plot(grp["year"], grp["mean"], marker="o", label=group)
    ax.set_title(f"{c} ({dom}, {stg}) Mean by Year")
    ax.set_ylabel("Score")
    ax.set_xlabel("Year")
    ax.legend()
    fig.tight_layout()
    fn = STAGE_DIR / f"Stage22_MeanTS_{c}_{dom}_{stg}.png"
    fig.savefig(fn)
    plt.close(fig)

    # Show the first summary rows of this metric in the notebook.
    IPython.display.display(metric_rows.head(10))

# Per-metric boxplots (Portfolio vs Market) for the most recent year present
# in the summary table.
latest_year = summary_df.year.max()
# The year slice is the same for every metric, so take it once.
data = stage3[stage3["year"] == latest_year]
for c in all_score_cols:
    fig, ax = plt.subplots(figsize=(6, 2.5))
    sns.boxplot(x="group", y=c, data=data, showmeans=True, ax=ax)
    ax.set_title(f"{c} distribution, {latest_year} (Portfolio vs Market)")
    fig.tight_layout()
    fn = STAGE_DIR / f"Stage22_Box_{c}_{latest_year}.png"
    fig.savefig(fn)
    plt.close(fig)

# ── 3. HEATMAP: GROUP/DOMAIN/YEAR ──────────────────────────────────

# summary_df has one row per metric/year/group. Taking the "first" row per
# cell would show only one group's mean, so combine both groups into the
# pooled mean instead: sum(mean * n) / sum(n), i.e. the mean over all
# non-missing observations of that metric and year.
pooled = (
    summary_df.assign(weighted_sum=summary_df["mean"] * summary_df["n"])
    .groupby(["domain", "stage", "metric", "year"])[["weighted_sum", "n"]]
    .sum()
)
heatmap_tbl = (pooled["weighted_sum"] / pooled["n"]).unstack("year")

# Figure height grows with the number of metric rows.
plt.figure(figsize=(15, 0.33*heatmap_tbl.shape[0] + 3))
sns.heatmap(heatmap_tbl, cmap="coolwarm", annot=False, center=0)
plt.title("All scores (Portfolio+Market): mean values by domain/stage/year")
plt.tight_layout()
fn = STAGE_DIR / "Stage22_Heatmap_AllScores.png"
plt.savefig(fn)
plt.close()
IPython.display.display(heatmap_tbl.head(10))

# ── 4. t-TESTS: Portfolio vs Market Per Metric, Per Year ───────────

# Split the rows into (Portfolio, Market) frames once per year; the same
# splits are reused for every metric.
portfolio_mask = stage3["group"] == "Portfolio"
market_mask = stage3["group"] == "Market"
frames_by_year = {}
for year in sorted(stage3["year"].dropna().unique()):
    year_mask = stage3["year"] == year
    frames_by_year[year] = (
        stage3[portfolio_mask & year_mask],
        stage3[market_mask & year_mask],
    )

# Welch t-test (unequal variances) per metric and year; a year is skipped
# unless both groups have at least five non-missing values.
ttest_rows = []
for c in all_score_cols:
    dom, stg = domain_stage[c]
    for year, (portfolio_rows, market_rows) in frames_by_year.items():
        pvals = portfolio_rows[c].dropna()
        mvals = market_rows[c].dropna()
        if len(pvals) < 5 or len(mvals) < 5:
            continue
        t, p = ttest_ind(pvals, mvals, equal_var=False, nan_policy="omit")
        ttest_rows.append({
            "metric": c,
            "domain": dom,
            "stage": stg,
            "year": year,
            "Portfolio_mean": pvals.mean(),
            "Market_mean": mvals.mean(),
            "t": t,
            "p": p,
            "n_portfolio": len(pvals),
            "n_market": len(mvals),
        })

ttest_df = pd.DataFrame(ttest_rows)
ttest_csv = STAGE_DIR / "Stage22_Portfolio_vs_Market_ttests.csv"
ttest_df.to_csv(ttest_csv, index=False)
IPython.display.display(ttest_df.head(10))
print(f"✓ Portfolio vs market t-tests saved: {ttest_csv.name}")

# ── 5. HIGH/LOW SPLIT: By Median RISE Score ────────────────────────

# Split rows into "High"/"Low" around the median of score_rise when that
# column exists, otherwise around the median of the row-wise mean of all
# score columns.
if "score_rise" in stage3.columns:
    split_score = stage3["score_rise"]
else:
    split_score = stage3[all_score_cols].mean(axis=1)
med = split_score.median()

# A missing score compares False against the median and would be labelled
# "Low"; leave such rows unlabelled so groupby drops them from both halves.
labels = pd.Series(np.where(split_score >= med, "High", "Low"), index=stage3.index)
stage3["highlow"] = labels.where(split_score.notna())

# Compare high vs low for both Portfolio and Market:
# rows = (highlow, metric), columns = group.
hl_summary = stage3.groupby(["group", "highlow"])[all_score_cols].mean().stack().unstack(0)
hl_summary_csv = STAGE_DIR / "Stage22_HighLow_Summary.csv"
hl_summary.to_csv(hl_summary_csv)
IPython.display.display(hl_summary.head(10))
print(f"✓ High/low portfolio summary table saved: {hl_summary_csv.name}")

# ── 6. INDIVIDUAL STOCK SUMMARY TABLES ─────────────────────────────

# One row per portfolio ticker present in the data, holding the mean and
# standard deviation of every score column over all of that ticker's rows.
per_stock = []
# PORTFOLIO is a set; iterate it sorted so the CSV row order is the same on
# every run.
for ticker in sorted(PORTFOLIO):
    d = stage3[stage3[id_col] == ticker]
    if d.empty:
        continue
    row = {"symbol": ticker}
    for c in all_score_cols:
        row[f"{c}_mean"] = d[c].mean()
        row[f"{c}_std"] = d[c].std()
    per_stock.append(row)
per_stock_df = pd.DataFrame(per_stock)
per_stock_csv = STAGE_DIR / "Stage22_PerStock_Summary.csv"
per_stock_df.to_csv(per_stock_csv, index=False)
IPython.display.display(per_stock_df.head(10))
print(f"✓ Individual stock summary table saved: {per_stock_csv.name}")

# ── 7. FIRM-BY-FIRM PLOTS ─────────────────────────────────────────

# One trajectory PNG per portfolio ticker found in the data, showing the first
# five score columns against year. Tickers are visited in sorted order so runs
# are reproducible.
for ticker in sorted(PORTFOLIO):
    sub = stage3[stage3[id_col] == ticker]
    if sub.empty:
        continue
    # Lines connect points in row order; a stable sort by year keeps them from
    # zig-zagging when the source rows are not chronological.
    sub = sub.sort_values("year", kind="mergesort")
    plt.figure(figsize=(9, 3))
    for c in all_score_cols[:5]:  # For brevity
        plt.plot(sub["year"], sub[c], marker="o", label=c)
    plt.title(f"{ticker}: Domain/stage scores (first 5 metrics)")
    plt.xlabel("Year")
    plt.ylabel("Score")
    plt.legend(loc="best", fontsize=7)
    plt.tight_layout()
    fpath = STAGE_DIR / f"Stage22_Trajectory_{ticker}.png"
    plt.savefig(fpath)
    plt.close()

print("\nStage 22 complete – comparative dashboard artefacts for portfolio and market saved and all tables/charts shown in notebook.\n")


# Recorded cell output (the run aborted while resolving STAGE11_CSV):
# FileNotFoundError: No file found matching stage11/Stage11_RISE_Probabilities_All*.csv under C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff