In [1]:

# Week 2 — Section 4: Data Splitting & Preparation (Single‑cell, business‑friendly)
from pathlib import Path
import os, re, json
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

# ---------------- Config ----------------
# Section identifier; used as the sub-folder name under <base>/results/.
SECTION = "Wk02_Section4"
# File name of the consolidated Week 2 report, created/updated directly in the project root.
WEEK_REPORT_FILENAME = "SDS-CP036-powercast_Wk02_Report_Business.md"
# File name of this section's own report, written into results/<SECTION>/reports/.
SECTION_REPORT_FILENAME = "SDS-CP036-powercast_Wk02_Section4_Business_Report.md"

# ---------------- Helpers ----------------
def find_base_dir(start: Path) -> Path:
    """Locate the project root directory.

    Resolution order:
      1. ``POWERCAST_BASE_DIR`` if it is set and contains a ``Code`` folder.
      2. The first of ``start`` and its ancestors (8 levels are inspected) that
         has a ``Code`` folder and additionally either a ``data`` folder, a
         ``results`` folder, or the directory name ``powercast`` (any case).
      3. ``start`` itself, resolved, when nothing above matches.
    """
    override = os.getenv("POWERCAST_BASE_DIR")
    if override:
        override_path = Path(override)
        if (override_path / "Code").exists():
            return override_path.resolve()

    origin = start.resolve()
    candidate = origin
    for _ in range(8):
        if (candidate / "Code").exists():
            has_outputs = (candidate / "data").exists() or (candidate / "results").exists()
            if has_outputs or candidate.name.lower() == "powercast":
                return candidate
        candidate = candidate.parent
    return origin

def _setup_dirs(base_dir: Path):
    """Create the section's output folders (if missing) and return them.

    Returns a 4-tuple ``(out_dir, features_dir, plots_dir, reports_dir)`` where
    ``out_dir`` is ``<base_dir>/results/<SECTION>`` and the other three are its
    ``features``, ``plots`` and ``reports`` sub-folders.
    """
    section_root = base_dir / "results" / SECTION
    layout = (section_root,) + tuple(
        section_root / sub for sub in ("features", "plots", "reports")
    )
    for folder in layout:
        folder.mkdir(parents=True, exist_ok=True)
    return layout

def _clean_prev(*dirs: Path):
    """Best-effort removal of the files sitting directly inside each folder.

    Sub-folders (and anything inside them) are left alone, and folders that do
    not exist are skipped. A file that cannot be deleted does not abort the
    run, but a warning is emitted so stale artifacts from a previous run are
    not silently mixed with fresh ones.
    """
    import warnings  # local import keeps this helper self-contained

    for folder in dirs:
        if not folder.is_dir():
            continue
        for entry in folder.glob("*"):
            try:
                if entry.is_file():
                    entry.unlink()
            except OSError as exc:
                # Deliberately non-fatal: cleaning is a convenience, not a requirement.
                warnings.warn(
                    f"Could not remove stale artifact {entry}: {exc}",
                    RuntimeWarning,
                    stacklevel=2,
                )

def _resolve_best_input(base_dir: Path):
    """Pick the most processed input available under ``base_dir``.

    Preference order: the Section 3 scaled train/test pair (both files must
    exist), then the Section 2 engineered features (imputed before
    non-imputed), then the raw CSV in ``data/``.

    Returns ``(mode, paths)`` where ``mode`` is one of ``"section3"``,
    ``"section2_imputed"``, ``"section2_raw"`` or ``"raw"`` and ``paths`` is a
    tuple holding two paths for ``"section3"`` and one path otherwise.

    Raises ``FileNotFoundError`` when none of the candidates exists.
    """
    results_root = base_dir / "results"

    s3_features = results_root / "Wk02_Section3" / "features"
    split_pair = (
        s3_features / "scaled_standard_train.csv",
        s3_features / "scaled_standard_test.csv",
    )
    if all(p.exists() for p in split_pair):
        return "section3", split_pair

    s2_features = results_root / "Wk02_Section2" / "features"
    single_file_sources = (
        ("section2_imputed", s2_features / "engineered_lag_rolling_imputed.csv"),
        ("section2_raw", s2_features / "engineered_lag_rolling.csv"),
        ("raw", base_dir / "data" / "Tetuan City power consumption.csv"),
    )
    for mode, candidate in single_file_sources:
        if candidate.exists():
            return mode, (candidate,)

    raise FileNotFoundError("No suitable input found. Expected Section 3 or Section 2 outputs, or data/Tetuan City power consumption.csv.")

def _find_datetime_column(df: pd.DataFrame):
    """Return the label of the column most likely to hold timestamps, or ``None``.

    Well-known names are tried first, in priority order. Failing that, the
    first column whose label contains ``date``, ``time`` or ``stamp``
    (case-insensitive) is returned. Labels are converted with ``str`` before
    matching so frames with non-string labels (e.g. integer column names) do
    not raise.
    """
    preferred = ("DateTime", "datetime", "date_time", "Timestamp", "timestamp", "time", "Date", "date")
    for name in preferred:
        if name in df.columns:
            return name

    keywords = ("date", "time", "stamp")
    for col in df.columns:
        label = str(col).lower()
        if any(key in label for key in keywords):
            return col
    return None

def _ensure_dt(series):
    """Parse ``series`` to datetimes, turning unparseable entries into ``NaT``.

    Values that fail the default parse get a second chance with
    ``dayfirst=True``; successfully parsed values from the first pass are
    never overwritten.
    """
    parsed = pd.to_datetime(series, errors="coerce")
    if not parsed.isna().any():
        return parsed
    day_first = pd.to_datetime(series, errors="coerce", dayfirst=True)
    return parsed.fillna(day_first)

def _time_split(df: pd.DataFrame, dt_col: str, test_size=0.2):
    """Split ``df`` by row position into an earlier and a later part.

    The first ``int(len(df) * (1 - test_size))`` rows become the training set
    and the remainder the test set; both are returned as independent copies.
    No sorting happens here and ``dt_col`` is accepted but not used, so the
    caller must pass rows already in chronological order.
    """
    cut = int(len(df) * (1 - test_size))
    earlier, later = df.iloc[:cut], df.iloc[cut:]
    return earlier.copy(), later.copy()

def _plot_time_coverage(df, dt_col, train, test, plots_dir: Path):
    """Save a timeline plot with a dashed marker where training ends.

    The row index is plotted against the timestamps of ``df`` and a vertical
    line is drawn at the last training timestamp. The image is written to
    ``plots_dir / "wk02_section4_split_marker.png"``.

    Returns the path of the saved image, or ``None`` when no plot can be made:
    ``dt_col`` is ``None`` or missing from any of the three frames, or the
    train/test timestamps are entirely unparseable (or empty).
    """
    # Without a usable datetime column there is nothing to draw; skip instead of raising KeyError.
    if dt_col is None or any(dt_col not in frame.columns for frame in (df, train, test)):
        return None

    # Gaps left by unparseable timestamps are filled from neighbours so the line stays continuous.
    dt_all = _ensure_dt(df[dt_col]).ffill().bfill()
    dt_tr = _ensure_dt(train[dt_col]).ffill().bfill()
    dt_te = _ensure_dt(test[dt_col]).ffill().bfill()
    if dt_tr.isna().all() or dt_te.isna().all():
        return None  # skip plotting if datetimes are unusable

    p = plots_dir / "wk02_section4_split_marker.png"
    fig = plt.figure()
    try:
        plt.plot(dt_all, np.arange(len(dt_all)), label="timeline index")
        split_ts = pd.to_datetime(dt_tr.iloc[-1])
        plt.axvline(split_ts, linestyle="--", label="train/test split")
        plt.title("Chronological index & split marker")
        plt.xlabel("time")
        plt.ylabel("index")
        plt.legend()
        plt.savefig(p, bbox_inches="tight")
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)
    return p

def _mae(y_true, y_pred):
    """Mean absolute error over the positions where both inputs are non-missing.

    Accepts any equally long array-likes (lists, NumPy arrays, pandas Series);
    values are compared by position. ``None``/``NaN`` entries on either side
    are ignored. Returns ``nan`` when no valid pair remains (including empty
    input).
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    valid = ~(np.isnan(actual) | np.isnan(forecast))
    if not valid.any():
        return float("nan")
    return float(np.mean(np.abs(actual[valid] - forecast[valid])))

def _choose_target(df: pd.DataFrame, dt_col: str):
    """Choose the column to use as forecasting target and return its label.

    Selection order:
      1. The first well-known total column present in ``df``.
      2. Otherwise, if numeric columns whose label contains ``zone``
         (case-insensitive) exist, their row-wise sum is stored in a new
         ``Total_auto`` column -- ``df`` is modified in place -- and
         ``"Total_auto"`` is returned.
      3. Otherwise the first numeric column other than ``dt_col``.

    Returns ``None`` when no numeric column is available. Labels are converted
    with ``str`` before matching so non-string column labels do not raise.
    """
    for cand in ("Total", "total", "Total_kW", "Global_active_power", "Appliances", "Total_auto"):
        if cand in df.columns:
            return cand

    zones = [
        c for c in df.columns
        if "zone" in str(c).lower() and pd.api.types.is_numeric_dtype(df[c])
    ]
    if zones:
        # Side effect relied upon by the caller: the synthetic total lives on df itself.
        df["Total_auto"] = df[zones].sum(axis=1, numeric_only=True)
        return "Total_auto"

    for c in df.columns:
        if c != dt_col and pd.api.types.is_numeric_dtype(df[c]):
            return c
    return None

def _walk_forward_folds(df: pd.DataFrame, dt_col: str, n_folds=3):
    """Build sliding-window (train, test) folds over the rows of ``df``.

    Each window spans 80% of the rows (first 60% train, last 20% test, by row
    position) and successive windows start 20% of the rows later. Folds are
    returned as a list of ``(train, test)`` DataFrame copies; a fold is kept
    only if both parts have more than 5 rows. Frames with fewer than 50 rows
    yield an empty list. ``dt_col`` is accepted but not used.

    NOTE: ``n_folds`` is only an upper bound. With the fixed window and stride
    above, at most two windows fit inside the data, so fewer folds than
    requested may be returned.
    """
    total = len(df)
    if total < 50:
        return []

    train_frac, test_frac = 0.6, 0.2
    window = int(total * (train_frac + test_frac))
    stride = max(int(total * 0.2), 1)
    # Size of the training part of every window (identical for all windows).
    train_len = int(window * train_frac / (train_frac + test_frac))

    folds = []
    for offset in range(0, total - window + 1, stride):
        if len(folds) >= n_folds:
            break
        cut = offset + train_len
        train_part = df.iloc[offset:cut]
        test_part = df.iloc[cut:offset + window]
        if len(test_part) > 5 and len(train_part) > 5:
            folds.append((train_part.copy(), test_part.copy()))
    return folds

# ---------------- Business-friendly Q&A ----------------
def _business_qna_text():
    """Return the three plain-language answers used in the reports.

    The tuple holds, in order, the answers about (1) how the chronological
    split was made, (2) how information leakage was prevented and (3) how the
    split was validated. The text is static Markdown.
    """
    split_answer = " ".join([
        "We respected the **natural flow of time** in the data.",
        "If training/testing files already existed from Section 3, we used them as-is.",
        "Otherwise, we split by time: the **first 80% (earlier dates)** for training and the **last 20% (later dates)** for testing,",
        "with **no shuffling**. This matches real-world usage, where yesterday teaches us to predict tomorrow.",
    ])
    leakage_answer = " ".join([
        "We prevented **information leakage**—that’s when future knowledge accidentally sneaks into training—by ensuring any learned settings",
        "(such as scaling/normalization from earlier steps) were **fit on the training period only** and then **applied to the later test period**.",
        "When building targets and features, we also avoided using any future-looking information. This keeps evaluation realistic for live operations.",
    ])
    validation_answer = " ".join([
        "We validated the split in two ways: (1) a **timeline visualization** with a clear marker where training ends and testing begins;",
        "and (2) **walk‑forward validation**, which repeatedly tests on successive future slices.",
        "Think of it as asking, *“If we stopped here, could we predict the next step?”*",
        "Consistent results across slices and a clean split line indicate the split is sound for forecasting.",
    ])
    return split_answer, leakage_answer, validation_answer

def _write_section_report(reports_dir: Path, csv_name: str, diagnostics: dict, qna: tuple[str,str,str], artifacts: dict, plot_path):
    """Write this section's Markdown report and return its path.

    Parameters:
        reports_dir: folder the report is written into.
        csv_name: label of the primary input shown at the top of the report.
        diagnostics: mapping read for ``rows``, ``rows_train``, ``rows_test``
            and ``target`` (missing keys render as ``None``).
        qna: the three answers, in question order.
        artifacts: mapping with ``train``, ``test`` and ``wf`` file paths; only
            the file names are shown.
        plot_path: path of the split plot, or a falsy value to show ``n/a``.

    The file is named by ``SECTION_REPORT_FILENAME`` and encoded as UTF-8.
    """
    summary_fields = (
        ("Rows", "rows"),
        ("Train rows", "rows_train"),
        ("Test rows", "rows_test"),
        ("Target", "target"),
    )
    summary = " | ".join(f"{label}: {diagnostics.get(key)}" for label, key in summary_fields)

    questions = (
        "How did you split your data into training and test sets to maintain chronological order?",
        "What steps did you take to prevent information leakage between splits?",
        "How did you verify that your train/test split was appropriate for time-series forecasting?",
    )
    answer_split, answer_leakage, answer_validation = qna
    answers = (answer_split, answer_leakage, answer_validation)

    content = [
        "# Week 2 — Section 4: Data Splitting & Preparation",
        "",
        f"**Primary input:** `{csv_name}`",
        "**" + summary + "**",
        "",
        "## Key Questions Answered",
        "### 4. Data Splitting & Preparation",
    ]
    for question, answer in zip(questions, answers):
        content.extend(["Q: " + question, "A: " + answer, ""])

    plot_label = Path(plot_path).name if plot_path else "n/a"
    content.extend([
        "## Artifacts",
        f"- Train: `features/{Path(artifacts['train']).name}`",
        f"- Test: `features/{Path(artifacts['test']).name}`",
        f"- Walk-forward results: `features/{Path(artifacts['wf']).name}`",
        f"- Plot: `{plot_label}`",
        "- Machine-readable summary: `summary.json`",
    ])

    report_path = reports_dir / SECTION_REPORT_FILENAME
    report_path.write_text("\n".join(content), encoding="utf-8")
    return report_path

def _update_week_report(base_dir: Path, section_block_md: str):
    """Create or refresh the Section 4 part of the consolidated Week 2 report.

    * If the report file does not exist, it is created from a skeleton with
      placeholders for Sections 1-3 followed by ``section_block_md``.
    * If it exists and already has a ``## Section 4 ...`` heading, everything
      from that heading up to the next ``## `` heading (or end of file) is
      replaced by ``section_block_md``.
    * Otherwise ``section_block_md`` is appended at the end.

    ``section_block_md`` is expected to start with its own
    ``## Section 4 — Data Splitting & Preparation`` heading. It is inserted
    verbatim: backslashes and ``\\g<...>`` sequences in it are NOT interpreted
    as regex replacement syntax.

    Returns the report path as a string.
    """
    header = "## Section 4 — Data Splitting & Preparation"
    wk_path = base_dir / WEEK_REPORT_FILENAME

    if not wk_path.exists():
        base = [
            "# SDS-CP036-powercast — Wk02 Consolidated Business Report (Inline Plots v2)",
            "",
            f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Project root: `{base_dir}`",
            "",
            "Includes Sections: 1, 2, 3, 4, 5",
            "",
            "## Section 1 — (placeholder)",
            "",
            "## Section 2 — (placeholder)",
            "",
            "## Section 3 — (placeholder)",
            "",
            header,
            "",
            section_block_md,
            ""
        ]
        wk_path.write_text("\n".join(base), encoding="utf-8")
        return str(wk_path)

    txt = wk_path.read_text(encoding="utf-8")
    replaced = 0
    if header in txt:
        sec_pat = re.compile(r"(## Section 4 — Data Splitting & Preparation[\s\S]*?)(?=^## |\Z)", re.MULTILINE)
        # A callable replacement is inserted literally; a plain string would be parsed
        # as a template and fail (or be altered) on any backslash in the block.
        txt, replaced = sec_pat.subn(lambda _match: section_block_md + "\n", txt)
    if not replaced:
        txt += "\n" + section_block_md + "\n"
    wk_path.write_text(txt, encoding="utf-8")
    return str(wk_path)

def _find_section_bounds(md: str, header_text: str):
    """Locate the body of the ``## <header_text>`` section inside ``md``.

    Returns ``(start, end)`` character offsets: ``start`` is where the match of
    the heading line ends and ``end`` is the offset of the next line starting
    with ``## `` (or ``len(md)`` if there is none). Returns ``(None, None)``
    when the heading is not present.
    """
    heading_re = re.compile(rf"(^## {re.escape(header_text)}\s*$)", re.MULTILINE)
    heading = heading_re.search(md)
    if heading is None:
        return None, None

    body_start = heading.end()
    # Search on the full string (with a start position) so "^" keeps its line-start meaning.
    following = re.compile(r"^## ", re.MULTILINE).search(md, body_start)
    body_end = len(md) if following is None else following.start()
    return body_start, body_end

def _insert_at_end_of_section(md: str, header_text: str, block: str) -> str:
    """Append ``block`` to the end of the ``## <header_text>`` section of ``md``.

    * A blank ``block`` leaves ``md`` untouched.
    * If the section does not exist, it is created at the end of the document.
    * If the stripped ``block`` already occurs inside the section, ``md`` is
      returned unchanged, so repeated calls do not duplicate content.
    """
    wanted = block.strip()
    if not wanted:
        return md

    start, end = _find_section_bounds(md, header_text)
    payload = block.rstrip()
    if start is None:
        return md.rstrip() + f"\n\n## {header_text}\n\n{payload}\n"

    section_body = md[start:end]
    if wanted in section_body:
        return md

    # Make sure the new block starts on its own line.
    separator = "" if section_body.endswith("\n") else "\n"
    return md[:end] + separator + payload + "\n" + md[end:]

def _ensure_toc_item(md: str, title: str) -> str:
    """Make sure the ``## Table of Contents`` section lists ``title``.

    A ``## Table of Contents`` section is appended to the document if it is
    missing. The entry has the form ``- [title](#anchor)`` where the anchor is
    the stripped, lower-cased title with spaces replaced by hyphens. If that
    exact bullet is already in the section, ``md`` is returned unchanged.

    The bullet is always placed on a line of its own, directly after the
    section's existing content (trailing blank lines of the section are
    dropped).
    """
    start, end = _find_section_bounds(md, "Table of Contents")
    if start is None:
        md = md.rstrip() + "\n\n## Table of Contents\n\n"
        start, end = _find_section_bounds(md, "Table of Contents")

    anchor = title.strip().lower().replace(" ", "-")
    bullet = f"- [{title}](#{anchor})"
    body = md[start:end]
    if bullet in body:
        return md

    kept = body.rstrip()
    if kept:
        # rstrip() removed the final newline, so it must always be put back;
        # otherwise the new bullet is glued onto the previous entry.
        new_body = kept + "\n" + bullet + "\n"
    elif start > 0 and not md[:start].endswith("\n"):
        # Empty section whose heading line is not terminated yet.
        new_body = "\n" + bullet + "\n"
    else:
        new_body = bullet + "\n"
    return md[:start] + new_body + md[end:]

def _update_readme(base_dir: Path, section_report_path: Path, plot_path):
    """Add this section's links to ``<base_dir>/README.md`` and return its path.

    The README is created from a minimal skeleton when it does not exist.
    Three sections receive an entry (each only once): the thumbnail gallery,
    the per-section plot list and the per-section report list. When the
    consolidated Week 2 report exists, a table-of-contents entry and a
    "Top-level Week 2 Report" section linking to it are ensured as well.

    ``plot_path`` may be falsy, in which case no thumbnail and no plot link
    are produced. All links are relative to ``base_dir``, so the given paths
    must be located inside it.
    """
    readme_path = base_dir / "README.md"
    if readme_path.exists():
        content = readme_path.read_text(encoding="utf-8")
    else:
        content = "# Powercast — Project Overview\n\n## Table of Contents\n"

    if plot_path:
        plot = Path(plot_path)
        plot_rel = plot.relative_to(base_dir).as_posix()
        gallery_block = f'<a href="./{plot_rel}"><img src="./{plot_rel}" width="260" alt="Wk02_Section4 — split marker"></a>'
        plots_block = "### Wk02_Section4\n" + f"\n- [{plot.stem}](./{plot_rel})"
    else:
        gallery_block = ""
        plots_block = "### Wk02_Section4\n"

    report_rel = section_report_path.relative_to(base_dir).as_posix()
    reports_block = f"### Wk02_Section4\n- [Week 2 – Section 4: Data Splitting & Preparation](./{report_rel})"

    week_report = base_dir / WEEK_REPORT_FILENAME
    if week_report.exists():
        content = _ensure_toc_item(content, "Top-level Week 2 Report")
        if "## Top-level Week 2 Report" not in content:
            week_rel = week_report.relative_to(base_dir).as_posix()
            content += f"\n## Top-level Week 2 Report\n\n- [SDS-CP036-powercast_Wk02_Report_Business.md](./{week_rel})\n"

    additions = (
        ("Quick Gallery (click any thumbnail)", gallery_block),
        ("Plots (grouped by Section)", plots_block),
        ("Section Reports (grouped)", reports_block),
    )
    for heading, block_md in additions:
        content = _insert_at_end_of_section(content, heading, block_md)

    readme_path.write_text(content, encoding="utf-8")
    return str(readme_path)

# ---------------- Main process ----------------
def process(base_dir: Path):
    """Run the whole Section 4 pipeline for the project rooted at ``base_dir``.

    Steps:
      1. Create the output folders and delete files left by a previous run.
      2. Load the best available input. A Section 3 train/test pair is used
         as-is; any other input is sorted by its datetime column and split
         80/20 by time.
      3. Choose a target column, save ``train.csv`` / ``test.csv``, and save
         the split-marker plot when a datetime column is available.
      4. Evaluate a naive baseline on walk-forward folds and save the
         per-fold MAE to ``walk_forward_results.csv``.
      5. Write the section report and ``summary.json``, update the
         consolidated week report and the README, and print a JSON summary
         of every file written.

    Raises ``FileNotFoundError`` when no input can be found and ``ValueError``
    when a non-Section-3 input has no datetime-like column. Section 3 inputs
    without a datetime column are accepted; the plot is skipped in that case.
    """
    base_dir = Path(base_dir)
    out_dir, features_dir, plots_dir, reports_dir = _setup_dirs(base_dir)
    _clean_prev(features_dir, plots_dir, reports_dir)

    mode, paths = _resolve_best_input(base_dir)
    if mode == "section3":
        train = pd.read_csv(paths[0])
        test = pd.read_csv(paths[1])
        dt_col = _find_datetime_column(train) or _find_datetime_column(test)
        # The datetime column is only usable if BOTH frames carry it; scaled
        # feature files may not contain one at all.
        if dt_col is not None and not (dt_col in train.columns and dt_col in test.columns):
            dt_col = None
        if dt_col is not None:
            for frame in (train, test):
                # is_datetime64_any_dtype also covers timezone-aware columns.
                if not pd.api.types.is_datetime64_any_dtype(frame[dt_col]):
                    frame[dt_col] = _ensure_dt(frame[dt_col])
        src_name = f"{paths[0].name} + {paths[1].name}"
        df_all = pd.concat([train, test], ignore_index=True)
    else:
        df = pd.read_csv(paths[0])
        dt_col = _find_datetime_column(df)
        if dt_col is None:
            raise ValueError("No datetime-like column found.")
        df[dt_col] = _ensure_dt(df[dt_col])
        df = df.sort_values(dt_col).reset_index(drop=True)
        train, test = _time_split(df, dt_col, test_size=0.2)
        src_name = paths[0].name
        df_all = df

    # May add a "Total_auto" column to df_all (not to train/test).
    target = _choose_target(df_all, dt_col)
    diagnostics = {
        "source_mode": mode,
        "source": src_name,
        "rows": int(len(df_all)),
        "rows_train": int(len(train)),
        "rows_test": int(len(test)),
        "target": target
    }

    train_csv = features_dir / "train.csv"
    train.to_csv(train_csv, index=False)
    test_csv = features_dir / "test.csv"
    test.to_csv(test_csv, index=False)

    plot_path = None
    if dt_col is not None:
        plot_path = _plot_time_coverage(df_all, dt_col, train, test, plots_dir)

    # Walk-forward baseline on the target (if available)
    wf_csv = features_dir / "walk_forward_results.csv"
    if target and target in df_all.columns:
        fold_cols = [c for c in (dt_col, target) if c is not None]
        folds = _walk_forward_folds(df_all[fold_cols].copy(), dt_col, n_folds=3)
        rows = []
        for i, (tr, te) in enumerate(folds, 1):
            y = pd.to_numeric(te[target], errors="coerce")
            # Naive baseline: the last value seen in the training part is used
            # as the prediction for every row of the test segment.
            last_train_val = pd.to_numeric(tr[target], errors="coerce").iloc[-1] if len(tr) else np.nan
            pred = pd.Series(last_train_val, index=y.index)
            rows.append({"fold": i, "mae_lag1": _mae(y.values, pred.values)})
        # Explicit columns keep the CSV header present even when there are no folds.
        pd.DataFrame(rows, columns=["fold", "mae_lag1"]).to_csv(wf_csv, index=False)
    else:
        pd.DataFrame([{"note":"no target available for baseline"}]).to_csv(wf_csv, index=False)

    # Business-friendly Q&A
    qna = _business_qna_text()

    # Section report
    artifacts = {"train": str(train_csv), "test": str(test_csv), "wf": str(wf_csv)}
    section_report = _write_section_report(reports_dir, src_name, diagnostics, qna, artifacts, plot_path)

    # Summary
    (out_dir / "summary.json").write_text(json.dumps({"input_mode": mode, **diagnostics}, indent=2), encoding="utf-8")

    # Week report block
    block = [
        "## Section 4 — Data Splitting & Preparation",
        "",
        "### Key Questions Answered",
        "Q: How did you split your data into training and test sets to maintain chronological order?",
        "A: " + qna[0],
        "",
        "Q: What steps did you take to prevent information leakage between splits?",
        "A: " + qna[1],
        "",
        "Q: How did you verify that your train/test split was appropriate for time-series forecasting?",
        "A: " + qna[2],
        "",
    ]
    if plot_path:
        rel = Path(plot_path).relative_to(base_dir).as_posix()
        block.append(f"![Split marker]({rel})")
    week_report = _update_week_report(base_dir, "\n".join(block))

    # README
    readme = _update_readme(base_dir, section_report, plot_path)

    print(json.dumps({
        "train_csv": str(train_csv),
        "test_csv": str(test_csv),
        "walk_forward_csv": str(wf_csv),
        "plot": str(plot_path) if plot_path else None,
        "section_report": str(section_report),
        "week_report": week_report,
        "readme": readme
    }, indent=2))

# ---------------- Execute ----------------
# Locate the project root starting from the current working directory, then run the
# full Section 4 pipeline; process() prints a JSON summary of the files it wrote.
BASE = find_base_dir(Path.cwd())
process(BASE)


{
  "train_csv": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section4/features/train.csv",
  "test_csv": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section4/features/test.csv",
  "walk_forward_csv": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section4/features/walk_forward_results.csv",
  "plot": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section4/plots/wk02_section4_split_marker.png",
  "section_report": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section4/reports/SDS-CP036-powercast_Wk02_Section4_Business_Report.md",
  "week_report": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/SDS-CP036-powercast_Wk02_Report_Business.md",
  "readme": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/README.md"
}
