In [4]:
#!/usr/bin/env python
"""
Stage-12 · FY-PRE PCA  ➜  K-means clustering
============================================

Part A  Snapshot (FY-PRE) ratio matrix  ➜  PCA  
Part B  PCA scores                      ➜  best-k K-means

Outputs (all written to <run>/stage12/)
---------------------------------------
Stage12A_PCA_Variance.csv
Stage12A_PCA_Loadings.csv
Stage12B_PCA_Scores.csv
Stage12B_ClusterLabels.csv
Stage12B_ClusterSummary.csv
Stage12B_SectorCluster_Table.csv
"""
from __future__ import annotations
import os, sys, logging, warnings
from pathlib import Path
import yaml                          # config loader
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

warnings.filterwarnings("ignore", category=FutureWarning)
plt.rcParams["figure.dpi"] = 110
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s | %(levelname)-7s | %(message)s")
log = logging.getLogger(__name__)

# ╔══════════════════════════════════════════════════════════════════╗
# 0 · CONFIG & RUN-DIR DISCOVERY                                     #
# ╚══════════════════════════════════════════════════════════════════╝
CFG_FILE = Path(os.getenv("PIPELINE_CFG", "pipeline_config.yaml")).expanduser()
if not CFG_FILE.is_file():
    raise FileNotFoundError(f"pipeline_config.yaml not found at {CFG_FILE}")
CFG       = yaml.safe_load(CFG_FILE.read_text()) or {}
DEFAULTS  = CFG.get("defaults", {})
EVENTS    = {str(k): v for k, v in CFG.get("events", {}).items()}

SWAN_YEAR = int(os.getenv("SWAN_YEAR", next(iter(EVENTS))))
if str(SWAN_YEAR) not in EVENTS:
    raise KeyError(f"SWAN_YEAR={SWAN_YEAR} missing in config events")
PRE_YEAR  = SWAN_YEAR - 1

OUTPUT_ROOT = Path(DEFAULTS["OUTPUT_ROOT"]).expanduser()
EVENT_DIR   = OUTPUT_ROOT / f"event={SWAN_YEAR}"

RUN_DIR: Path | None = None
if os.getenv("RUN_DIR"):
    RUN_DIR = Path(os.getenv("RUN_DIR")).expanduser()
elif os.getenv("RUN_DATE"):
    RUN_DIR = EVENT_DIR / os.getenv("RUN_DATE")

if RUN_DIR is None:
    cands = list(EVENT_DIR.glob("*/stage03/Stage3_Data_WithRatios.csv"))
    if not cands:
        raise FileNotFoundError("Stage-03 outputs not found – run Stage 03 first")
    STAGE3_FILE = max(cands, key=lambda p: p.stat().st_mtime)
    RUN_DIR     = STAGE3_FILE.parents[1]
else:
    STAGE3_FILE = RUN_DIR / "stage03" / "Stage3_Data_WithRatios.csv"
    if not STAGE3_FILE.is_file():
        raise FileNotFoundError(f"{STAGE3_FILE} missing – run Stage 03")

RUN_DATE  = RUN_DIR.name
STAGE_DIR = RUN_DIR / "stage12"
STAGE_DIR.mkdir(parents=True, exist_ok=True)

log.info("==========  STAGE 12  ==========")
log.info("SWAN_YEAR=%s  RUN_DATE=%s  RUN_DIR=%s", SWAN_YEAR, RUN_DATE, RUN_DIR)

# ── env-overridable knobs ───────────────────────────────────────────
MIN_ROW_CVR = float(os.getenv("MIN_ROW_CVR", 0.60))
MIN_COL_CVR = float(os.getenv("MIN_COL_CVR", 0.60))
MAX_PC      = int(os.getenv("MAX_PC", 20))
VAR_THRESH  = float(os.getenv("VAR_THRESH", 90.0))
K_RANGE     = range(*map(int, os.getenv("K_RANGE", "2,11").split(",")))
ID_COL      = os.getenv("ID_COL", "Symbol")
DATE_COL    = os.getenv("DATE_COL", "ReportDate")
SECTOR_COL  = os.getenv("SECTOR_COL", "SectorName")

# ╔══════════════════════════════════════════════════════════════════╗
# 1 · LOAD FY-PRE SNAPSHOT                                           #
# ╚══════════════════════════════════════════════════════════════════╝
df = pd.read_csv(STAGE3_FILE, low_memory=False)
df.columns = [c.lower().strip() for c in df.columns]

date_col = DATE_COL.lower()
if date_col not in df.columns:
    alt = [c for c in df.columns if "reportdate" in c]
    if not alt:
        raise KeyError(f"'{DATE_COL}' column not found in Stage-3 file")
    date_col = alt[0]
    log.warning("DATE_COL not found exactly; using '%s'", date_col)
df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

df_pre = df[df[date_col].dt.year == PRE_YEAR].copy()
if df_pre.empty:
    raise RuntimeError(f"No FY-{PRE_YEAR} snapshot rows")
log.info("Snapshot rows: %s", f"{len(df_pre):,}")

# ╔══════════════════════════════════════════════════════════════════╗
# 2 · RATIO MATRIX & ROW FILTER                                      #
# ╚══════════════════════════════════════════════════════════════════╝
ignore = {ID_COL.lower(), date_col}
ratio_cols = [c for c in df_pre.columns
              if c not in ignore
              and df_pre[c].dtype.kind in "fi"
              and "_" in c
              and not c.endswith("_raw")]

mat = df_pre[ratio_cols].replace([np.inf, -np.inf], np.nan)
mat = mat.loc[:,  mat.notna().mean() >= MIN_COL_CVR]
mat = mat.loc[   mat.notna().mean(axis=1) >= MIN_ROW_CVR]
if mat.shape[1] < 2:
    raise RuntimeError("Matrix too sparse after coverage filters")
log.info("Matrix for PCA: %d firms × %d ratios", *mat.shape)

# keep *only* filtered rows for the remainder
snap = df_pre.loc[mat.index].copy()

# ╔══════════════════════════════════════════════════════════════════╗
# 3 · PCA                                                            #
# ╚══════════════════════════════════════════════════════════════════╝
Z   = StandardScaler().fit_transform(SimpleImputer(strategy="median").fit_transform(mat))
pca = PCA(n_components=min(MAX_PC, Z.shape[1]), random_state=42).fit(Z)
cum = np.cumsum(pca.explained_variance_ratio_) * 100
k_keep = np.argmax(cum >= VAR_THRESH) + 1
log.info("k=%d PCs capture %.1f %% variance", k_keep, cum[k_keep-1])

(pd.DataFrame({"PC": [f"PC{i+1}" for i in range(len(cum))],
               "Eigen%": (pca.explained_variance_ratio_*100).round(2),
               "Cum%": cum.round(2)})
   .set_index("PC")
   .to_csv(STAGE_DIR / "Stage12A_PCA_Variance.csv"))

(pd.DataFrame(pca.components_.T,
              index   = mat.columns,
              columns = [f"PC{i+1}" for i in range(pca.n_components_)])
   .reset_index()
   .rename(columns={"index": "ratio"})
   .to_csv(STAGE_DIR / "Stage12A_PCA_Loadings.csv", index=False))

# ╔══════════════════════════════════════════════════════════════════╗
# 4 · PCA SCORES & K-MEANS                                           #
# ╚══════════════════════════════════════════════════════════════════╝
loadings = pd.read_csv(STAGE_DIR / "Stage12A_PCA_Loadings.csv").set_index("ratio")
pc_cols  = [f"PC{i+1}" for i in range(k_keep)]
scores   = Z @ loadings[pc_cols].values
scores_df = (pd.DataFrame(scores, columns=pc_cols, index=snap.index)
               .assign(**{ID_COL.lower(): snap[ID_COL.lower()].values}))
scores_df.to_csv(STAGE_DIR / "Stage12B_PCA_Scores.csv", index=False)

sil, models = [], {}
X = scores_df[pc_cols].values
for k in K_RANGE:
    km = KMeans(n_clusters=k, n_init=20, random_state=42).fit(X)
    sil.append(silhouette_score(X, km.labels_))
    models[k] = km
best_k = max(models, key=lambda k: sil[K_RANGE.index(k)])
km     = models[best_k]
snap["cluster"] = km.labels_

snap[[ID_COL.lower(), "cluster"]]\
    .to_csv(STAGE_DIR / "Stage12B_ClusterLabels.csv", index=False)
pd.DataFrame({"k": list(K_RANGE), "silhouette": sil})\
    .to_csv(STAGE_DIR / "Stage12B_ClusterSummary.csv", index=False)
log.info("Best k = %d  (silhouette = %.3f)", best_k, sil[K_RANGE.index(best_k)])

sec_col = SECTOR_COL.lower()
if sec_col in snap.columns:
    pd.crosstab(snap["cluster"], snap[sec_col])\
      .to_csv(STAGE_DIR / "Stage12B_SectorCluster_Table.csv")
    log.info("Sector × cluster table written")
else:
    log.warning("Column '%s' absent – sector table skipped", SECTOR_COL)

log.info("✓  Stage 12 complete – artefacts in %s", STAGE_DIR)
print(f"\n✓ Stage 12 complete – outputs in {STAGE_DIR}\n")

2025-06-10 14:27:15,406 | INFO    | SWAN_YEAR=2008  RUN_DATE=20250609  RUN_DIR=C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609
2025-06-10 14:27:20,706 | INFO    | Snapshot rows: 974
2025-06-10 14:27:20,730 | INFO    | Matrix for PCA: 941 firms × 204 ratios
2025-06-10 14:27:20,845 | INFO    | k=1 PCs capture 12.2 % variance
[WinError 2] The system cannot find the file specified
  File "c:\Users\Jason Pohl\miniconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\Jason Pohl\miniconda3\lib\subprocess.py", line 493, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\Jason Pohl\miniconda3\lib\subprocess.py", line 858, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Jason Pohl\miniconda3\lib\subprocess.py", line 1311, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable,


✓ Stage 12 complete – outputs in C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\outputs_rff\event=2008\20250609\stage12

