In [1]:
# Parameters
INPUT_CSV = "C:/Users/Jason Pohl/OneDrive - Bond University/PhD/rff/NEW_DATA.csv"
OUTPUT_ROOT = "C:/Users/Jason Pohl/OneDrive - Bond University/PhD/rff/outputs_rff"
STAGE1_CFG = ""
SWAN_YEAR = 2008
WIN_START = 2004
WIN_END = 2012
RUN_TAG = "myUniqueRunId"


In [2]:
"""
STAGE 01 · DATA LOAD & PRE-PROCESSING
────────────────────────────────────────────────────────────────────────
• Reads run parameters from pipeline_config.yaml
• If SWAN_YEAR is not supplied via env-var, defaults to the first event
  key in the YAML, so the script can run with *zero* manual input.
"""

from __future__ import annotations
from pathlib import Path
from datetime import datetime
import os, logging, io, re, yaml, sys

import pandas as pd
import numpy as np
import daily_setup  # This will set up RUN_DIR for today

# ──────────────────────────────────────────────────────────────────────
# 0-2 · UNIVERSAL BOOTSTRAP  (cfg + run-path resolver + logger)
#       uses pipeline_utils.py that you added earlier
# ──────────────────────────────────────────────────────────────────────
from pipeline_utils import load_cfg, resolve_run_dir    # <— NEW
import logging, sys
from pathlib import Path
from datetime import datetime
import os, io, re

# 0.1 read YAML once
CFG       = load_cfg()
DEFAULTS  = CFG.get("defaults", {})
EVENTS    = CFG.get("events", {})

# 0.2 decide event (swan year) and resolve run directory
SWAN_YEAR = str(os.getenv("SWAN_YEAR") or next(iter(EVENTS)))
RUN_DIR   = resolve_run_dir()          # OUTPUT_ROOT/event=YYYY/<run-tag>/

RUN_DATE  = RUN_DIR.name               # folder name itself
INPUT_CSV = Path(DEFAULTS["INPUT_CSV"]).expanduser()

# 0.3 misc constants
DATE_COL  = DEFAULTS.get("DATE_COL", "ReportDate")
ID_COL    = DEFAULTS.get("ID_COL",   "Symbol")
FILTERS   = {
    "pct_non_na": DEFAULTS.get("PCT_NON_NA", 95),
    "pct_zero":   DEFAULTS.get("PCT_ZERO",   98),
    "min_unique": DEFAULTS.get("MIN_UNIQUE", 10),
}

# 1. output directory for this stage
OUTPUT_DIR = RUN_DIR / "stage01"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# 2. logger
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    handlers=[
        logging.FileHandler(OUTPUT_DIR / "stage01.log", mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
logger.info("==========  STAGE 01: DATA LOAD & PRE-PROCESSING ==========")
logger.info("RUN_DIR        : %s", RUN_DIR)
logger.info("INPUT_CSV      : %s", INPUT_CSV)
logger.info("SWAN_YEAR      : %s  |  RUN_DATE: %s", SWAN_YEAR, RUN_DATE)
logger.info("DATE_COL / ID_COL   = %s / %s", DATE_COL, ID_COL)
logger.info("FILTERS             = %s", FILTERS)
# ──────────────────────────────────────────────────────────────────────

# ──────────────────────────────────────────────────────────────────────
# 3 · LOAD RAW CSV
# ──────────────────────────────────────────────────────────────────────
df = pd.read_csv(INPUT_CSV, low_memory=False)
logger.info("Rows loaded: %s", f"{len(df):,}")

# ──────────────────────────────────────────────────────────────────────
# 4 · DATE PARSE & ID NORMALISATION
# ──────────────────────────────────────────────────────────────────────
df[DATE_COL] = (
    pd.to_datetime(df[DATE_COL], errors="coerce", dayfirst=True)
      .fillna(pd.to_datetime(df[DATE_COL], errors="coerce", dayfirst=False))
)
bad_dates = df[df[DATE_COL].isna()]
if not bad_dates.empty:
    bad_dates.to_csv(OUTPUT_DIR / "bad_dates.csv", index=False)
    logger.warning("Bad dates → %d rows saved to bad_dates.csv", len(bad_dates))

df[ID_COL] = df[ID_COL].astype(str).str.strip().str.upper()

# ──────────────────────────────────────────────────────────────────────
# 5 · SMART NUMERIC COERCION
# ──────────────────────────────────────────────────────────────────────
_num_rx = re.compile(r"[$€£,%]")

def smart_to_num(series: pd.Series) -> pd.Series:
    if series.dtype != "object":
        return series
    out = pd.to_numeric(series.str.replace(_num_rx, "", regex=True), errors="coerce")
    return out if out.notna().mean() >= 0.50 else series

df = df.apply(smart_to_num)

# ──────────────────────────────────────────────────────────────────────
# 6 · BASIC FILTERS
# ──────────────────────────────────────────────────────────────────────
before = len(df)
df = df.dropna(subset=[ID_COL, DATE_COL])
logger.info("After ID/date filter: %d rows (%.1f%%)", len(df), len(df) / before * 100)

num_cols = df.select_dtypes(include=[np.number]).columns
meta = pd.DataFrame({
    "pct_non_na": df[num_cols].notna().mean() * 100,
    "pct_zero":   (df[num_cols] == 0).mean() * 100,
    "n_unique":   df[num_cols].nunique(dropna=True),
})
good_mask = (
    (meta["pct_non_na"] >= FILTERS["pct_non_na"]) &
    (meta["pct_zero"]   <  FILTERS["pct_zero"])   &
    (meta["n_unique"]   >= FILTERS["min_unique"])
)
drop_cols = list(meta.index[~good_mask])
if drop_cols:
    df.drop(columns=drop_cols, inplace=True)
    logger.info("Dropped %d low-quality numeric columns", len(drop_cols))

# ──────────────────────────────────────────────────────────────────────
# 7 · COLLAPSE TO ONE ROW PER FIRM-YEAR
# ──────────────────────────────────────────────────────────────────────
df["Year"] = df[DATE_COL].dt.year
df = (
    df.sort_values(DATE_COL)
      .groupby([ID_COL, "Year"], as_index=False)
      .last()
)

# ──────────────────────────────────────────────────────────────────────
# 8 · EXPORT & SUMMARY
# ──────────────────────────────────────────────────────────────────────
out_csv = OUTPUT_DIR / "stage01_cleaned.csv"
df.to_csv(out_csv, index=False)

buf = io.StringIO(); df.info(buf=buf)
logger.info("Final DataFrame info:\n%s", buf.getvalue())
logger.info("Saved cleaned CSV → %s", out_csv)

# Make result available in-memory if next stage imports in same session
data_stage_1 = df.copy()
logger.info("✅ STAGE 01 complete — `data_stage_1` ready")

Using existing RUN_DIR: outputs_rff\daily\2025-06-11


2025-06-11 14:39:50,364 | INFO    | RUN_DIR        : outputs_rff\daily\2025-06-11


2025-06-11 14:39:50,365 | INFO    | INPUT_CSV      : C:\Users\Jason Pohl\OneDrive - Bond University\PhD\rff\NEW_DATA.csv


2025-06-11 14:39:50,367 | INFO    | SWAN_YEAR      : 2008  |  RUN_DATE: 2025-06-11


2025-06-11 14:39:50,367 | INFO    | DATE_COL / ID_COL   = ReportDate / Symbol


2025-06-11 14:39:50,368 | INFO    | FILTERS             = {'pct_non_na': 95, 'pct_zero': 98, 'min_unique': 10}


2025-06-11 14:39:52,435 | INFO    | Rows loaded: 55,800


  pd.to_datetime(df[DATE_COL], errors="coerce", dayfirst=True)


2025-06-11 14:39:53,107 | INFO    | After ID/date filter: 55800 rows (100.0%)


2025-06-11 14:39:53,562 | INFO    | Dropped 94 low-quality numeric columns


2025-06-11 14:39:57,418 | INFO    | Final DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34862 entries, 0 to 34861
Columns: 164 entries, Symbol to ReportDate
dtypes: datetime64[ns](1), float64(148), int32(1), int64(4), object(10)
memory usage: 43.5+ MB



2025-06-11 14:39:57,419 | INFO    | Saved cleaned CSV → outputs_rff\daily\2025-06-11\stage01\stage01_cleaned.csv


2025-06-11 14:39:57,438 | INFO    | ✅ STAGE 01 complete — `data_stage_1` ready
