In [1]:

# Week 2 — Section 3: Feature Scaling & Normalization (single-cell, business-friendly)
from pathlib import Path
import os, re, json
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

SECTION = "Wk02_Section3"
WEEK_REPORT_FILENAME = "SDS-CP036-powercast_Wk02_Report_Business.md"
SECTION_REPORT_FILENAME = "SDS-CP036-powercast_Wk02_Section3_Business_Report.md"

def find_base_dir(start: Path) -> Path:
    """Locate the Powercast project root.

    Resolution order:
      1. ``POWERCAST_BASE_DIR`` if it is set and contains a ``Code`` folder.
      2. The first of ``start`` or its ancestors (at most 8 levels are
         inspected) that either contains a ``Code`` folder or is itself
         named ``powercast`` (case-insensitive).
      3. ``start`` itself, resolved, when nothing above matched.
    """
    override = os.getenv("POWERCAST_BASE_DIR")
    if override:
        candidate = Path(override)
        if (candidate / "Code").exists():
            return candidate.resolve()

    origin = start.resolve()
    current = origin
    for _ in range(8):
        has_code_dir = (current / "Code").exists()
        if has_code_dir or current.name.lower() == "powercast":
            return current
        current = current.parent
    return origin

def _setup_dirs(base_dir: Path):
    """Create this section's output folders and clear stale files from them.

    Creates ``<base_dir>/results/<SECTION>`` with ``features``, ``plots`` and
    ``reports`` sub-folders, then removes the regular files found directly
    inside those three sub-folders (sub-directories are left alone).

    Returns:
        Tuple ``(out_dir, features_dir, plots_dir, reports_dir)``.
    """
    out_dir = base_dir / "results" / SECTION
    features_dir = out_dir / "features"
    plots_dir = out_dir / "plots"
    reports_dir = out_dir / "reports"
    for d in (out_dir, features_dir, plots_dir, reports_dir):
        d.mkdir(parents=True, exist_ok=True)
    # clean prior artifacts in this section only
    for folder in (features_dir, plots_dir, reports_dir):
        for p in folder.glob("*"):
            try:
                if p.is_file():
                    p.unlink()
            except OSError:
                # Cleanup is best-effort: a locked or vanished file must not
                # abort the run. Only filesystem errors are tolerated, so
                # KeyboardInterrupt/SystemExit still propagate.
                pass
    return out_dir, features_dir, plots_dir, reports_dir

def _pick_input(base_dir: Path):
    # Prefer Section 2 imputed, then raw, else original Tetuan CSV
    s2_imp = base_dir/"results"/"Wk02_Section2"/"features"/"engineered_lag_rolling_imputed.csv"
    s2_raw = base_dir/"results"/"Wk02_Section2"/"features"/"engineered_lag_rolling.csv"
    if s2_imp.exists(): return s2_imp, "Section2_imputed"
    if s2_raw.exists(): return s2_raw, "Section2_raw"
    orig = base_dir/"data"/"Tetuan City power consumption.csv"
    if orig.exists(): return orig, "original"
    # fallback to any csv in data
    any_csv = list((base_dir/"data").glob("*.csv"))
    if any_csv: return any_csv[0], "original"
    raise FileNotFoundError("No suitable input found: expected Section2 features or data CSV under <BASE>/data.")

def _find_datetime_column(df: pd.DataFrame):
    for c in ["DateTime","datetime","date_time","Timestamp","timestamp","time","Date","date"]:
        if c in df.columns: return c
    for c in df.columns:
        if any(k in c.lower() for k in ["date","time","stamp"]): return c
    return None

def _ensure_dt(df: pd.DataFrame, dt_col: str):
    dt = pd.to_datetime(df[dt_col], errors="coerce")
    if dt.isna().any():
        dt2 = pd.to_datetime(df[dt_col], errors="coerce", dayfirst=True)
        dt = dt.fillna(dt2)
    if dt.isna().any(): raise ValueError("Unable to parse timestamps.")
    return dt

def _train_test_split_time(df: pd.DataFrame, dt_col: str, test_size=0.2):
    n = len(df); split = max(1, int(n*(1-test_size)))
    return df.iloc[:split].copy(), df.iloc[split:].copy()

def _choose_numeric_feature_columns(df: pd.DataFrame, dt_col: str):
    num_cols = [c for c in df.columns if c!=dt_col and pd.api.types.is_numeric_dtype(df[c])]
    # binary flags stay unscaled
    bin_like = []
    for c in list(num_cols):
        vals = pd.Series(df[c]).dropna().unique()
        if len(vals)<=3 and set(vals).issubset({0,1}):
            bin_like.append(c)
    # cyclic encodings in [-1,1] leave unscaled
    cyc_like = [c for c in num_cols if any(x in c.lower() for x in ["sin_", "cos_"])]
    skip_cols = sorted(set(bin_like+cyc_like))
    scale_cols = [c for c in num_cols if c not in skip_cols]
    return scale_cols, skip_cols

def _scale_safe(train, test, cols, method):
    if not cols: 
        return train.copy(), test.copy(), {"columns": [], "method": method}
    if method=="standard": scaler = StandardScaler()
    elif method=="minmax": scaler = MinMaxScaler()
    elif method=="robust": scaler = RobustScaler()
    else: raise ValueError("unknown method")
    scaler.fit(train[cols])
    tr = train.copy(); te = test.copy()
    tr[cols] = scaler.transform(train[cols])
    te[cols] = scaler.transform(test[cols])
    params = {"columns": cols, "method": method}
    if hasattr(scaler,"mean_"): params["mean_"]=scaler.mean_.tolist()
    if hasattr(scaler,"scale_"): params["scale_"]=scaler.scale_.tolist()
    if hasattr(scaler,"center_"): 
        try: params["center_"]=scaler.center_.tolist()
        except: params["center_"]=None
    if hasattr(scaler,"quantile_range"): 
        try: params["quantile_range"]=list(scaler.quantile_range)
        except: pass
    return tr, te, params

def _plot_hist_before_after(train_before, train_after, cols, plots_dir, prefix):
    if not cols: return None, None
    c = cols[0]
    plt.figure(); pd.Series(train_before[c]).dropna().hist(bins=40); 
    plt.title(f"{prefix}: BEFORE scaling — {c}"); plt.xlabel(c); plt.ylabel("count")
    p1 = plots_dir/f"{prefix.lower()}_{c}_before.png"; plt.savefig(p1, bbox_inches="tight"); plt.close()
    plt.figure(); pd.Series(train_after[c]).dropna().hist(bins=40);
    plt.title(f"{prefix}: AFTER scaling — {c}"); plt.xlabel(c); plt.ylabel("count")
    p2 = plots_dir/f"{prefix.lower()}_{c}_after.png"; plt.savefig(p2, bbox_inches="tight"); plt.close()
    return p1, p2

def _to_serializable(obj):
    import numpy as np
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (list, tuple)):
        return [_to_serializable(x) for x in obj]
    if isinstance(obj, dict):
        return {k:_to_serializable(v) for k,v in obj.items()}
    return obj

def _write_section_report(reports_dir: Path, csv_name: str, diagnostics: dict, scaled_cols, skipped_cols, plots):
    """Write the Section 3 business report (markdown) and return its path.

    Args:
        reports_dir: Folder that receives ``SECTION_REPORT_FILENAME``.
        csv_name: Name of the input dataset shown in the report header.
        diagnostics: Mapping read for the keys ``rows``, ``start``, ``end``.
        scaled_cols: Accepted for interface symmetry; not used in the text.
        skipped_cols: Columns left unscaled; up to 8 are listed as examples.
        plots: Plot paths; falsy entries are ignored.
    """
    # Business-friendly Q&A copy
    technique_answer = "".join([
        "We prepared the numbers so different features are on comparable scales, which helps models learn reliably and keeps one large-magnitude feature from dominating the rest. ",
        "We used three common approaches (each with a clear use case):\n",
        "- **Standardization (z-score)** – centers at 0 and scales by typical spread (standard deviation). Good default when data is roughly bell-shaped.\n",
        "- **Min–Max scaling** – compresses values to a 0–1 range. Helpful for algorithms sensitive to absolute ranges.\n",
        "- **Robust scaling** – centers by the median and scales by the IQR (inter-quartile range); more resistant to outliers and skew.\n",
        "Using all three gives the team flexibility to pick the best fit for downstream modeling without re-running feature engineering.",
    ])
    leakage_answer = "".join([
        "We guarded against **data leakage**—letting future information seep into training—by splitting the data **by time**, ",
        "fitting each scaler **only on the training period**, and applying that scaling to the later **test period**. ",
        "This mirrors production and keeps evaluation honest.",
    ])

    if not skipped_cols:
        special_answer = "We intentionally **left binary flags** and **cyclical encodings** unscaled; for outlier‑prone metrics we prefer **Robust scaling**."
    else:
        examples = ", ".join(skipped_cols[:8])
        ellipsis = " ..." if len(skipped_cols) > 8 else ""
        special_answer = (
            "We intentionally **left some features unscaled** because they are binary flags (0/1) or already unitless cyclical encodings (sine/cosine in [-1,1]). "
            + f"Examples: {examples}"
            + ellipsis
            + ". For outlier‑prone metrics we prefer **Robust scaling** to avoid over‑weighting spikes."
        )

    rows = diagnostics.get('rows')
    period_start = diagnostics.get('start')
    period_end = diagnostics.get('end')
    plot_names = [Path(p).name for p in plots if p]

    header = [
        "# Week 2 — Section 3: Feature Scaling & Normalization",
        "",
        f"**Input dataset:** `{csv_name}`",
        f"**Rows:** {rows} | **Period:** {period_start} → {period_end}",
        "",
    ]
    questions = [
        "## Key Questions Answered",
        "### 3. Feature Scaling & Normalization",
        "Q: Which normalization or scaling techniques did you apply to your numerical features, and why?",
        "A: " + technique_answer,
        "",
        "Q: How did you ensure that scaling was performed without introducing data leakage?",
        "A: " + leakage_answer,
        "",
        "Q: Did you notice any features that required special treatment during normalization?",
        "A: " + special_answer,
        "",
    ]
    artifacts = [
        "## Artifacts",
        "- Scaled features (Standard): `features/scaled_standard_train.csv`, `features/scaled_standard_test.csv`",
        "- Scaled features (MinMax): `features/scaled_minmax_train.csv`, `features/scaled_minmax_test.csv`",
        "- Scaled features (Robust): `features/scaled_robust_train.csv`, `features/scaled_robust_test.csv`",
        "- Scaler parameters: `features/scalers.json`",
        f"- Plots: {plot_names}",
        "- Machine-readable summary: `summary.json`",
    ]

    report_path = reports_dir / SECTION_REPORT_FILENAME
    report_path.write_text("\n".join(header + questions + artifacts), encoding="utf-8")
    return report_path

def _update_week_report(base_dir: Path, section_block_md: str):
    """Insert or refresh the Section 3 block in the consolidated week report.

    * If the report does not exist, a skeleton with placeholders for the
      other sections is created around ``section_block_md``.
    * If it exists and already has a ``## Section 3`` block, that block (up
      to the next ``## `` heading or end of file) is replaced.
    * Otherwise the block is appended at the end.

    Returns:
        The report path as a string.
    """
    wk_path = base_dir / WEEK_REPORT_FILENAME
    if not wk_path.exists():
        base = [
            "# SDS-CP036-powercast — Wk02 Consolidated Business Report",
            "",
            f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            "## Section 1 — (placeholder)",
            "## Section 2 — (placeholder)",
            section_block_md,
            "## Section 4 — (placeholder)",
            "## Section 5 — (placeholder)",
        ]
        wk_path.write_text("\n".join(base), encoding="utf-8")
        return str(wk_path)
    txt = wk_path.read_text(encoding="utf-8")
    sec_pat = re.compile(r"(## Section 3[\s\S]*?)(?=^## |\Z)", re.MULTILINE)
    block = section_block_md + "\n"
    if "## Section 3" in txt and sec_pat.search(txt):
        # Use a callable replacement so the markdown is inserted verbatim.
        # A plain string would be parsed as a regex template, and any
        # backslash in the block (Windows paths, column names) would raise
        # re.error or be rewritten as a group reference.
        txt = sec_pat.sub(lambda _match: block, txt)
    else:
        txt += ("\n" if not txt.endswith("\n") else "") + block
    wk_path.write_text(txt, encoding="utf-8")
    return str(wk_path)

def _update_readme(base_dir: Path, section_report_path: Path, plots):
    readme = base_dir/"README.md"
    md = readme.read_text(encoding="utf-8") if readme.exists() else "# Powercast — Project Overview\n\n## Table of Contents\n"
    def _find(md, hdr):
        m = re.search(rf"(^## {re.escape(hdr)}\\s*$)", md, flags=re.MULTILINE)
        if not m: return None, None
        start = m.end(); n = re.search(r"^## ", md[start:], flags=re.MULTILINE)
        end = start + (n.start() if n else len(md[start:]))
        return start, end
    def _insert(md, hdr, block):
        s,e = _find(md, hdr)
        if s is None: return md.rstrip()+f"\n\n## {hdr}\n\n{block}\n"
        body = md[s:e]
        if block.strip() in body: return md
        return md[:e] + ("\n" if not body.endswith("\n") else "") + block + "\n" + md[e:]
    thumbs = []
    for p in plots:
        if p:
            rel = Path(p).relative_to(base_dir).as_posix()
            thumbs.append(f'<a href="./{rel}"><img src="./{rel}" width="260" alt="Wk02_Section3 — {Path(p).name}"></a>')
    thumbs_block = "\n".join(thumbs)
    plots_block = "### Wk02_Section3\n" + "\n".join([f"- [{Path(p).stem}](./{Path(p).relative_to(base_dir).as_posix()})" for p in plots if p])
    rel_rep = section_report_path.relative_to(base_dir).as_posix()
    reps_block = f"### Wk02_Section3\n- [Week 2 – Section 3: Feature Scaling & Normalization](./{rel_rep})"
    md = _insert(md, "Quick Gallery (click any thumbnail)", thumbs_block)
    md = _insert(md, "Plots (grouped by Section)", plots_block)
    md = _insert(md, "Section Reports (grouped)", reps_block)
    readme.write_text(md, encoding="utf-8")
    return str(readme)

def _week_report_block(base_dir: Path, skip_cols, plots) -> str:
    """Build the Section 3 markdown block for the consolidated week report."""
    block = [
        "## Section 3 — Feature Scaling & Normalization",
        "",
        "### Key Questions Answered",
        "Q: Which normalization or scaling techniques did you apply to your numerical features, and why?",
        "A: We prepared the numbers so different features are on comparable scales, which helps models learn reliably and keeps one large-magnitude feature from dominating the rest. We used three common approaches (each with a clear use case): Standardization (z-score); Min–Max scaling; and Robust scaling. Using all three gives the team flexibility to pick the best fit for downstream modeling without re-running feature engineering.",
        "",
        "Q: How did you ensure that scaling was performed without introducing data leakage?",
        "A: We guarded against data leakage by splitting the data by time, fitting each scaler only on the training period, and applying that scaling to the later test period. This mirrors production and keeps evaluation honest.",
        "",
        "Q: Did you notice any features that required special treatment during normalization?",
    ]
    if skip_cols:
        block.append("A: Yes. We intentionally left some features unscaled because they are binary flags (0/1) or already unitless cyclical encodings (sine/cosine in [-1,1]). Examples: " + ", ".join(skip_cols[:8]) + (" ..." if len(skip_cols)>8 else "") + ". For outlier‑prone metrics we prefer Robust scaling to avoid over‑weighting spikes.")
    else:
        block.append("A: Yes. We intentionally left binary flags and cyclical encodings unscaled; for outlier‑prone metrics we prefer Robust scaling.")
    for p in plots:
        rel = Path(p).relative_to(base_dir).as_posix()
        block.append(f"![{Path(p).name}]({rel})")
    return "\n".join(block)


def process(base_dir: Path):
    """Run the Section 3 scaling pipeline end to end.

    Steps: prepare output folders, pick and load the input CSV, parse its
    timestamp column, split into train/test, scale the selected numeric
    columns with Standard / MinMax / Robust scalers (fitted on train only),
    then write the scaled CSVs, ``scalers.json``, histograms, the section
    report, the week report block, the README links and ``summary.json``.

    Args:
        base_dir: Project root directory.

    Returns:
        Dict mapping artifact names to their paths (as strings).

    Raises:
        FileNotFoundError: if no input CSV can be located.
        ValueError: if no timestamp column is found or it cannot be parsed.
    """
    base_dir = Path(base_dir)
    out_dir, features_dir, plots_dir, reports_dir = _setup_dirs(base_dir)

    csv_path, src = _pick_input(base_dir)
    df = pd.read_csv(csv_path)
    dt_col = _find_datetime_column(df)
    if dt_col is None:
        raise ValueError("No datetime-like timestamp column found.")
    df[dt_col] = _ensure_dt(df, dt_col)

    # split
    train, test = _train_test_split_time(df, dt_col, test_size=0.2)

    # choose columns (decided on train only, so test data has no influence)
    scale_cols, skip_cols = _choose_numeric_feature_columns(train, dt_col)

    # scale
    tr_std, te_std, p_std = _scale_safe(train, test, scale_cols, "standard")
    tr_mm,  te_mm,  p_mm  = _scale_safe(train, test, scale_cols, "minmax")
    tr_rb,  te_rb,  p_rb  = _scale_safe(train, test, scale_cols, "robust")

    # save datasets
    std_train_csv = features_dir/"scaled_standard_train.csv"; tr_std.to_csv(std_train_csv, index=False)
    std_test_csv  = features_dir/"scaled_standard_test.csv";  te_std.to_csv(std_test_csv, index=False)
    mm_train_csv  = features_dir/"scaled_minmax_train.csv";   tr_mm.to_csv(mm_train_csv, index=False)
    mm_test_csv   = features_dir/"scaled_minmax_test.csv";    te_mm.to_csv(mm_test_csv, index=False)
    rb_train_csv  = features_dir/"scaled_robust_train.csv";   tr_rb.to_csv(rb_train_csv, index=False)
    rb_test_csv   = features_dir/"scaled_robust_test.csv";    te_rb.to_csv(rb_test_csv, index=False)

    # scalers params json
    scalers_json = features_dir/"scalers.json"
    with open(scalers_json, "w", encoding="utf-8") as f:
        json.dump({
            "standard": _to_serializable(p_std),
            "minmax": _to_serializable(p_mm),
            "robust": _to_serializable(p_rb),
            "skipped_columns": skip_cols,
            "scaled_columns": scale_cols,
            "input_source": src
        }, f, indent=2)

    # plots (either both paths or none; computed once and reused below)
    p_before, p_after = _plot_hist_before_after(train, tr_std, scale_cols, plots_dir, "Standard")
    plots = [p for p in (p_before, p_after) if p]

    # diagnostics
    dt = df[dt_col]
    diagnostics = {"rows": int(len(df)), "start": str(dt.min()), "end": str(dt.max())}

    # section report
    section_report = _write_section_report(reports_dir, csv_path.name, diagnostics, scale_cols, skip_cols, plots)

    # week report block
    week_report = _update_week_report(base_dir, _week_report_block(base_dir, skip_cols, plots))

    # README
    readme = _update_readme(base_dir, section_report, plots)

    result = {
        "std_train": str(std_train_csv),
        "std_test": str(std_test_csv),
        "mm_train": str(mm_train_csv),
        "mm_test": str(mm_test_csv),
        "rb_train": str(rb_train_csv),
        "rb_test": str(rb_test_csv),
        "scalers_json": str(scalers_json),
        "section_report": str(section_report),
        "week_report": week_report,
        "readme": readme
    }

    # The section report lists `summary.json` as an artifact, so it must
    # actually be produced; it sits next to the features/plots/reports dirs.
    summary_json = out_dir/"summary.json"
    with open(summary_json, "w", encoding="utf-8") as f:
        json.dump({
            "section": SECTION,
            "input": str(csv_path),
            "input_source": src,
            "diagnostics": diagnostics,
            "train_rows": int(len(train)),
            "test_rows": int(len(test)),
            "scaled_columns": scale_cols,
            "skipped_columns": skip_cols,
            "plots": [str(p) for p in plots],
            "artifacts": dict(result)
        }, f, indent=2)
    result["summary_json"] = str(summary_json)

    print(f"[Powercast] Using input: {csv_path} (source={src})")
    return result

# Execute: resolve the project root starting from the current working
# directory, run the Section 3 pipeline, and print the returned artifact
# paths as indented JSON.
BASE = find_base_dir(Path.cwd())
info = process(BASE)
print(json.dumps(info, indent=2))


[Powercast] Using input: /home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section2/features/engineered_lag_rolling_imputed.csv (source=Section2_imputed)
{
  "std_train": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section3/features/scaled_standard_train.csv",
  "std_test": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section3/features/scaled_standard_test.csv",
  "mm_train": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section3/features/scaled_minmax_train.csv",
  "mm_test": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section3/features/scaled_minmax_test.csv",
  "rb_train": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section3/features/scaled_robust_train.csv",
  "rb_test": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk02_Section3/features/scaled_robust_test.csv",
  "scalers_json": "/home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/result