In [3]:

# Week 3 - Section 1: Model Selection & Training (Business)
# Single Code Cell Execution - with Profiles: dev, preprod, final
# Includes: Upsert into consolidated Week 3 report (no duplicates).

import os, re, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

warnings.filterwarnings("ignore")


import re

def upsert_section_in_consolidated(consolidated_path: Path, section_text: str,
                                   header_variants: list[str],
                                   insert_order_hint: list[str] | None = None):
    """
    Replace the block that starts with any header in `header_variants`.
    If none exists, append (or insert before the first hint header, if provided).
    Robust to '-' vs '–' and avoids inline regex flags collisions.
    """
    consolidated_path.parent.mkdir(parents=True, exist_ok=True)
    existing = consolidated_path.read_text(encoding="utf-8") if consolidated_path.exists() else ""
    text = existing.replace("\r\n", "\n").replace("\r", "\n")

    # Build regex to remove any existing block for this section
    hdr_alt = "|".join(re.escape(h) for h in header_variants)
    any_sds_hdr = r"^\#\s*SDS-CP036-powercast\s*[–-]\s*Week\s*3\s*Section\s*\d+:\s*.*$"
    block_pat = rf"^(?:{hdr_alt})\s*.*?(?=^{any_sds_hdr}|\Z)"
    text = re.sub(block_pat, "", text, flags=re.M | re.S).strip()

    # Prepare the new (clean) block to insert
    new_block = section_text.strip()

    def insert_before_first_hint(container: str, block: str, hints: list[str]) -> str:
        for h in hints:
            m = re.search(rf"^{re.escape(h)}\s*$", container, flags=re.M)
            if m:
                return container[:m.start()] + (block + "\n\n---\n\n") + container[m.start():]
        return container + ("\n\n---\n\n" if container.strip() else "") + block

    if insert_order_hint:
        text = insert_before_first_hint(text, new_block, insert_order_hint)
    else:
        text = text + ("\n\n---\n\n" if text.strip() else "") + new_block

    consolidated_path.write_text(text.strip() + "\n", encoding="utf-8")


# -------- Optional model deps (graceful fallback) --------
# Each optional backend is imported defensively. The matching *_AVAILABLE flag
# records whether the import succeeded so later code can skip that model.
try:
    from prophet import Prophet
except Exception:
    PROPHET_AVAILABLE = False
else:
    PROPHET_AVAILABLE = True

try:
    from xgboost import XGBRegressor
except Exception:
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True

try:
    from statsmodels.tsa.statespace.sarimax import SARIMAX
except Exception:
    SARIMAX_AVAILABLE = False
else:
    SARIMAX_AVAILABLE = True

# -------- Project paths & helpers --------
# Project identifier used in the report title.
BASE_PROJECT_NAME = "SDS-CP036-powercast"

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above `start` holding both "data" and "results".

    At most ten levels are inspected (`start` itself plus nine ancestors).
    When none of them qualifies, `start` is returned unchanged.
    """
    candidate = start
    levels_left = 10
    while levels_left > 0:
        if all((candidate / name).exists() for name in ("data", "results")):
            return candidate
        candidate = candidate.parent
        levels_left -= 1
    return start

# Resolve the project root relative to the current working directory.
BASE_DIR = find_repo_root(Path.cwd())

# -------- Profiles (dev, preprod, final) --------
PROFILE = "dev"  # choose: "dev", "preprod", "final"

# Column order of the profile table below; one tuple of values per profile.
_PROFILE_KEYS = (
    "FAST_MODE", "RESAMPLE_TO", "MAX_DAYS", "TEST_DAYS", "BACKTEST",
    "BACKTEST_FOLDS", "BACKTEST_STEP_DAYS", "BACKTEST_HOURS", "PROPHET_ZONES",
)
profiles = {
    name: dict(zip(_PROFILE_KEYS, values))
    for name, values in {
        "dev":     (True,  "H", 365, 7,    False, 0, 0, 0,   1),
        "preprod": (False, "H", 365, 28,   False, 0, 0, 0,   3),
        "final":   (False, "H", 365, None, True,  6, 7, 168, 3),
    }.items()
}
cfg = profiles[PROFILE]

# Expose the active profile's settings as module-level constants.
(
    FAST_MODE,
    RESAMPLE_TO,
    MAX_DAYS,
    TEST_DAYS,
    BACKTEST,
    BACKTEST_FOLDS,
    BACKTEST_STEP_DAYS,
    BACKTEST_HOURS,
    PROPHET_ZONES,
) = (cfg[key] for key in _PROFILE_KEYS)

# Results per profile (separate folders)
RESULTS_DIR = BASE_DIR / "results" / f"Wk03_Section1_{PROFILE}"
PLOTS_DIR, REPORTS_DIR = RESULTS_DIR / "plots", RESULTS_DIR / "reports"
for d in (RESULTS_DIR, PLOTS_DIR, REPORTS_DIR):
    d.mkdir(parents=True, exist_ok=True)

# -------- Load & prepare data --------
data_path = BASE_DIR / "data" / "Tetuan City power consumption.csv"
if not data_path.exists():
    raise FileNotFoundError("Dataset not found at: {}".format(data_path))

df = pd.read_csv(data_path)
# Normalize column names (strip & collapse spaces)
df.columns = df.columns.str.strip().str.replace(r"\s+", " ", regex=True)

# Parse time & set index
df["DateTime"] = pd.to_datetime(df["DateTime"])
df = df.set_index("DateTime").sort_index()

# Keep numeric cols only for modeling safety
num_df = df.select_dtypes(include=[np.number]).copy()

# Downsample & cap horizon
if RESAMPLE_TO:
    num_df = num_df.resample(RESAMPLE_TO).mean()
if isinstance(MAX_DAYS, (int, float)) and len(num_df):
    # Keep rows strictly newer than (latest timestamp - MAX_DAYS days). This is
    # the window DataFrame.last("<n>D") selects, written as an explicit cutoff
    # because DataFrame.last is deprecated in recent pandas, and so that the
    # cap does not depend on the rows being hourly.
    history_start = num_df.index.max() - pd.Timedelta(days=int(MAX_DAYS))
    num_df = num_df.loc[num_df.index > history_start]

# Zones (after header cleanup); only columns actually present are kept
zones = ["Zone 1 Power Consumption", "Zone 2 Power Consumption", "Zone 3 Power Consumption"]
zones = [z for z in zones if z in num_df.columns]

# -------- Helpers --------
def mape_safe(y_true, y_pred):
    """Return the mean absolute percentage error (in percent), skipping zero actuals.

    Inputs may be any array-likes (lists, NumPy arrays, pandas Series); they
    are compared positionally after conversion to float arrays. Points whose
    actual value is 0, or where either value is NaN, are left out of the mean.

    Returns:
        The MAPE as a float, or NaN when no usable point remains (including
        empty input).
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    # A NaN denominator marks zero actuals so they drop out of the mean.
    denom = np.where(actual == 0, np.nan, np.abs(actual))
    ratios = np.abs(actual - forecast) / denom
    if np.isnan(ratios).all():
        # Nothing to average; return NaN explicitly instead of relying on
        # nanmean's all-NaN warning path.
        return float("nan")
    return float(np.nanmean(ratios) * 100.0)

def evaluate_forecast(y_true, y_pred):
    """Score a forecast against actuals.

    Returns:
        A dict with float entries "RMSE", "MAE" and "MAPE" (MAPE in percent,
        computed by `mape_safe`).
    """
    # Take the square root explicitly rather than passing `squared=False`:
    # that keyword is deprecated and then removed in newer scikit-learn
    # releases, while this form is valid on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return {
        "RMSE": float(rmse),
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        "MAPE": mape_safe(np.asarray(y_true), np.asarray(y_pred)),
    }

def plot_series(idx, y_true, y_pred, title, fname):
    """Save an "Actual" vs "Predicted" line chart to `fname`.

    Args:
        idx: X-axis values shared by both series.
        y_true: Observed values.
        y_pred: Forecast values.
        title: Chart title.
        fname: Output path handed to `savefig`.
    """
    fig, ax = plt.subplots(figsize=(11, 4))
    try:
        ax.plot(idx, y_true, label="Actual")
        ax.plot(idx, y_pred, label="Predicted")
        ax.set_title(title)
        ax.legend()
        fig.savefig(fname, bbox_inches="tight")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

def rolling_splits(index, folds=4, step_days=7, horizon_hours=168):
    """Build backward-rolling (train, validation, test) boolean masks over `index`.

    Folds are generated from the most recent data backwards. In each fold the
    test window covers `horizon_hours`, ending at the current window end; the
    validation window covers the `step_days` immediately before it; training
    is everything earlier. The next fold's window ends one hour before the
    current test window starts. Window edges are offset by one hour.

    Generation stops early once the training cutoff would fall at or before
    the first timestamp.

    Returns:
        A list of `(train_mask, val_mask, test_mask)` tuples, newest fold first.
    """
    one_hour = pd.Timedelta(hours=1)
    val_span = pd.Timedelta(days=step_days)
    test_span = pd.Timedelta(hours=horizon_hours)

    window_end = index.max()
    masks = []
    for _ in range(folds):
        test_lo = window_end - test_span + one_hour
        val_hi = test_lo - one_hour
        val_lo = val_hi - val_span + one_hour
        train_hi = val_lo - one_hour
        if train_hi <= index.min():
            break
        masks.append((
            index <= train_hi,
            (index >= val_lo) & (index <= val_hi),
            (index >= test_lo) & (index <= window_end),
        ))
        # Slide back: the next test window ends right before this one starts.
        window_end = val_hi
    return masks

def prophet_predict(train_series, test_index):
    """Fit Prophet on `train_series` and return point forecasts for `test_index`.

    Args:
        train_series: Series whose index holds the timestamps and whose values
            are the target; it is reshaped into Prophet's ("ds", "y") frame.
        test_index: Timestamps to forecast.

    Returns:
        NumPy array of the "yhat" column from Prophet's prediction frame.
    """
    history = train_series.reset_index()
    history.columns = ["ds", "y"]
    # Only daily seasonality is enabled; weekly and yearly are switched off.
    m = Prophet(daily_seasonality=True, weekly_seasonality=False, yearly_seasonality=False)
    m.fit(history)
    # Build the future frame directly as a single "ds" column, so no stray
    # helper column is passed to Prophet.
    future = pd.DataFrame({"ds": pd.to_datetime(test_index)})
    return m.predict(future)["yhat"].values

def xgb_predict(train_series, val_series, test_series, lags=24, fast=FAST_MODE):
    """Train an XGBoost regressor on lag features and predict the test span.

    The three series are concatenated and `lags` shifted copies of the target
    become the features. Because the lags are taken from the concatenated
    series, each test prediction uses the actual preceding observations.

    Rows are assigned to the training or test side by index membership in
    `test_series`, not by position. Rows dropped for missing lag values
    therefore cannot shift the boundary and leak test rows into training.

    Args:
        train_series: Training part of the target series.
        val_series: Validation part; it is used as additional training data.
        test_series: Hold-out part to predict.
        lags: Number of lag features.
        fast: When true, use the smaller model (120 trees, depth 4) instead of
            200 trees at depth 6.

    Returns:
        `(index, y_pred)`: the timestamps that were predicted (test rows with
        a complete set of lag features) and the matching predictions.
    """
    full_series = pd.concat([train_series, val_series, test_series])
    lagged = pd.DataFrame({"y": full_series})
    for lag in range(1, lags + 1):
        lagged["lag_{}".format(lag)] = lagged["y"].shift(lag)
    lagged = lagged.dropna()

    in_test = lagged.index.isin(test_series.index)
    train_ml = lagged.loc[~in_test]
    test_ml = lagged.loc[in_test]

    X_tr, y_tr = train_ml.drop(columns=["y"]), train_ml["y"]
    X_te = test_ml.drop(columns=["y"])
    model = XGBRegressor(
        n_estimators=120 if fast else 200,
        max_depth=4 if fast else 6,
        learning_rate=0.1,
        subsample=0.9, colsample_bytree=0.9,
        objective="reg:squarederror", n_jobs=0
    )
    model.fit(X_tr, y_tr, verbose=False)
    y_pred = model.predict(X_te)
    return test_ml.index, y_pred

# -------- Experiment execution --------
# Two modes, chosen by the active profile:
#   * single hold-out split (BACKTEST is False): trailing TEST_DAYS form the
#     test set, the remainder is split 85/15 into train/validation;
#   * rolling backtest (BACKTEST is True): every model is scored on each fold
#     from rolling_splits and the per-fold metrics are aggregated.
# The section report is written in the `finally` clause, so it is produced
# even when the experiments raise.
summary_rows = []

try:
    if not BACKTEST:
        if TEST_DAYS is None:
            TEST_DAYS = 7
        # Hold out the rows strictly newer than (latest timestamp - TEST_DAYS).
        # This is the window DataFrame.last("<n>D") selects, written explicitly
        # because DataFrame.last is deprecated in recent pandas.
        test_cutoff = num_df.index.max() - pd.Timedelta(days=TEST_DAYS)
        test = num_df.loc[num_df.index > test_cutoff]
        pre  = num_df.iloc[: -len(test)] if len(num_df) > len(test) else num_df.iloc[:0]
        n_pre = len(pre)
        n_train = int(n_pre * 0.85)
        train = pre.iloc[:n_train]
        val   = pre.iloc[n_train:]

        # (model key, zone) -> (index, predictions), used for the example plot
        preds_store = {}

        for zone in zones:
            # Baseline: repeat the last observed value over the whole test span
            if len(val) and len(test):
                y_true = test[zone].values
                pivot = val[zone].iloc[-1] if len(val) else train[zone].iloc[-1]
                y_pred = np.repeat(pivot, len(test))
                met = evaluate_forecast(y_true, y_pred)
                summary_rows.append({"Profile": PROFILE, "Zone": zone, "Model": "Baseline (Naive)", **met})
                preds_store[("Baseline", zone)] = (test.index, y_pred)

            # SARIMAX
            if SARIMAX_AVAILABLE and len(train) > 10 and len(test) > 0:
                try:
                    seasonal = 24
                    model = SARIMAX(train[zone], order=(1,1,1), seasonal_order=(0,1,1,seasonal),
                                    enforce_stationarity=False, enforce_invertibility=False)
                    res = model.fit(disp=False)
                    fc = res.get_forecast(steps=len(test)).predicted_mean.values
                    met = evaluate_forecast(test[zone].values, fc)
                    summary_rows.append({"Profile": PROFILE, "Zone": zone, "Model": "SARIMAX(1,1,1)(0,1,1,{})".format(seasonal), **met})
                    preds_store[("SARIMAX", zone)] = (test.index, fc)
                except Exception:
                    summary_rows.append({"Profile": PROFILE, "Zone": zone, "Model": "SARIMAX", "RMSE": np.nan, "MAE": np.nan, "MAPE": np.nan})

            # Prophet (optional)
            if PROPHET_AVAILABLE and len(train) > 10 and len(test) > 0:
                # Honour the profile's PROPHET_ZONES cap: only the first
                # PROPHET_ZONES zones are fitted with Prophet.
                if zones.index(zone) < PROPHET_ZONES:
                    try:
                        yhat = prophet_predict(train[zone], test.index)
                        met = evaluate_forecast(test[zone].values, yhat)
                        summary_rows.append({"Profile": PROFILE, "Zone": zone, "Model": "Prophet", **met})
                        preds_store[("Prophet", zone)] = (test.index, yhat)
                    except Exception as exc:
                        # Best effort: Prophet stays optional, but say why it was skipped.
                        print("Prophet skipped for {}: {}".format(zone, exc))

            # XGBoost (optional)
            if XGB_AVAILABLE and len(train) > 10 and len(test) > 0:
                try:
                    idx_align, yhat = xgb_predict(train[zone], val[zone], test[zone], lags=24, fast=FAST_MODE)
                    met = evaluate_forecast(test[zone].reindex(idx_align).values, yhat)
                    summary_rows.append({"Profile": PROFILE, "Zone": zone, "Model": "XGBoost (lags)", **met})
                    preds_store[("XGBoost", zone)] = (idx_align, yhat)
                except Exception:
                    summary_rows.append({"Profile": PROFILE, "Zone": zone, "Model": "XGBoost (lags)", "RMSE": np.nan, "MAE": np.nan, "MAPE": np.nan})

        results_df = pd.DataFrame(summary_rows)
        results_csv = REPORTS_DIR / "model_comparison.csv"
        results_df.to_csv(results_csv, index=False)

        # Plot example for Zone 1 using its lowest-RMSE model
        if "Zone 1 Power Consumption" in zones and len(results_df) > 0:
            z = "Zone 1 Power Consumption"
            z_df = results_df[results_df["Zone"] == z].dropna(subset=["RMSE"])
            if len(z_df) > 0:
                best_model = z_df.sort_values("RMSE").iloc[0]["Model"]
                key = None
                # Map the display name back to the preds_store key prefix
                for cand in ["XGBoost","SARIMAX","Prophet","Baseline"]:
                    if cand in best_model:
                        key = (cand, z)
                        break
                if key in preds_store:
                    idx, yhat = preds_store[key]
                    ytrue = test[z].reindex(idx).values
                    fname = PLOTS_DIR / "Zone_1_best_model_forecast.png"
                    plot_series(idx, ytrue, yhat, "Forecast Example - {} ({})".format(z, best_model), fname)

    else:
        # Rolling backtest path
        idx = num_df.index
        splits = rolling_splits(idx, folds=BACKTEST_FOLDS, step_days=BACKTEST_STEP_DAYS, horizon_hours=BACKTEST_HOURS)
        rows = []
        for zone in zones:
            for fold_id, (tr_m, va_m, te_m) in enumerate(splits, start=1):
                train = num_df.loc[tr_m]
                val   = num_df.loc[va_m]
                test  = num_df.loc[te_m]

                # Baseline (also sets y_true, which the SARIMAX step reuses)
                try:
                    y_true = test[zone].values
                    pivot = val[zone].iloc[-1] if len(val) else train[zone].iloc[-1]
                    y_pred = np.repeat(pivot, len(test))
                    met = evaluate_forecast(y_true, y_pred)
                    rows.append({"Profile": PROFILE, "Zone": zone, "Model": "Baseline (Naive)", "Fold": fold_id, **met})
                except Exception:
                    rows.append({"Profile": PROFILE, "Zone": zone, "Model": "Baseline (Naive)", "Fold": fold_id, "RMSE": np.nan, "MAE": np.nan, "MAPE": np.nan})

                # SARIMAX
                if SARIMAX_AVAILABLE:
                    try:
                        if len(train) > 10:
                            seasonal = 24
                            model = SARIMAX(train[zone], order=(1,1,1), seasonal_order=(0,1,1,seasonal),
                                            enforce_stationarity=False, enforce_invertibility=False)
                            res = model.fit(disp=False)
                            y_pred = res.get_forecast(steps=len(test)).predicted_mean.values
                            met = evaluate_forecast(y_true, y_pred)
                            rows.append({"Profile": PROFILE, "Zone": zone, "Model": "SARIMAX(1,1,1)(0,1,1,{})".format(seasonal), "Fold": fold_id, **met})
                    except Exception:
                        rows.append({"Profile": PROFILE, "Zone": zone, "Model": "SARIMAX", "Fold": fold_id, "RMSE": np.nan, "MAE": np.nan, "MAPE": np.nan})

                # XGBoost
                if XGB_AVAILABLE:
                    try:
                        def make_lag_df(series, lags=24):
                            dfl = pd.DataFrame({"y": series})
                            for L in range(1, lags+1):
                                dfl["lag_{}".format(L)] = dfl["y"].shift(L)
                            return dfl.dropna()
                        LAGS = 24
                        series = pd.concat([train[zone], val[zone], test[zone]])
                        df_lag = make_lag_df(series, lags=LAGS)
                        n_train_lag = len(train)
                        n_val_lag = len(val)
                        # The first LAGS rows have no complete lag set and are dropped
                        split_idx = n_train_lag + n_val_lag - LAGS
                        train_ml = df_lag.iloc[:split_idx]
                        test_ml  = df_lag.iloc[split_idx:]
                        X_tr, y_tr = train_ml.drop(columns=["y"]), train_ml["y"]
                        X_te, y_te = test_ml.drop(columns=["y"]), test_ml["y"]
                        model = XGBRegressor(
                            n_estimators=150, max_depth=5,
                            learning_rate=0.08 if not FAST_MODE else 0.1,
                            subsample=0.9, colsample_bytree=0.9,
                            objective="reg:squarederror", n_jobs=0
                        )
                        model.fit(X_tr, y_tr, verbose=False)
                        y_pred = model.predict(X_te)
                        y_true_idx = test.index[-len(y_pred):]
                        y_true = test[zone].reindex(y_true_idx).values
                        met = evaluate_forecast(y_true, y_pred)
                        rows.append({"Profile": PROFILE, "Zone": zone, "Model": "XGBoost (lags)", "Fold": fold_id, **met})
                    except Exception:
                        rows.append({"Profile": PROFILE, "Zone": zone, "Model": "XGBoost (lags)", "Fold": fold_id, "RMSE": np.nan, "MAE": np.nan, "MAPE": np.nan})

    # Save folds (if backtest)
    if BACKTEST:
        results_df = pd.DataFrame(rows)
        results_csv = REPORTS_DIR / "model_comparison_folds.csv"
        results_df.to_csv(results_csv, index=False)

        # Mean and standard deviation of each metric across folds
        agg = results_df.groupby(["Zone","Model"]).agg(
            RMSE_mean=("RMSE","mean"), RMSE_std=("RMSE","std"),
            MAE_mean=("MAE","mean"),   MAE_std=("MAE","std"),
            MAPE_mean=("MAPE","mean"), MAPE_std=("MAPE","std")
        ).reset_index()
        agg_csv = REPORTS_DIR / "model_comparison_aggregate.csv"
        agg.to_csv(agg_csv, index=False)

        # Champion = lowest mean RMSE per zone
        champs = agg.sort_values(["Zone","RMSE_mean"]).groupby("Zone").head(1)
        champs_csv = REPORTS_DIR / "champion_models.csv"
        champs.to_csv(champs_csv, index=False)

        # Optional champion plot (refits the champion on the most recent fold)
        try:
            if "Zone 1 Power Consumption" in zones and len(splits) > 0:
                last_tr, last_va, last_te = splits[0]  # most recent fold
                train = num_df.loc[last_tr]
                val   = num_df.loc[last_va]
                test  = num_df.loc[last_te]
                z = "Zone 1 Power Consumption"
                champ_model = champs[champs["Zone"] == z].iloc[0]["Model"]
                ytrue = test[z].values
                idx = test.index
                yhat = None
                title = "Forecast Example - {} ({})".format(z, champ_model)
                if "SARIMAX" in champ_model and SARIMAX_AVAILABLE:
                    seasonal = 24
                    model = SARIMAX(train[z], order=(1,1,1), seasonal_order=(0,1,1,seasonal),
                                    enforce_stationarity=False, enforce_invertibility=False)
                    res = model.fit(disp=False)
                    yhat = res.get_forecast(steps=len(test)).predicted_mean.values
                elif "XGBoost" in champ_model and XGB_AVAILABLE:
                    y_true_idx, yhat = xgb_predict(train[z], val[z], test[z], lags=24, fast=FAST_MODE)
                    ytrue = test[z].reindex(y_true_idx).values
                    idx = y_true_idx
                elif "Baseline" in champ_model:
                    pivot = val[z].iloc[-1] if len(val) else train[z].iloc[-1]
                    yhat = np.repeat(pivot, len(test))
                if yhat is not None:
                    fname = PLOTS_DIR / "Zone_1_champion_forecast.png"
                    plot_series(idx, ytrue, yhat, title, fname)
        except Exception as exc:
            # The plot is a nice-to-have; report the reason instead of hiding it.
            print("Champion plot skipped: {}".format(exc))
finally:
    # -------- Build Section Report (always) --------
    section_report_path = REPORTS_DIR / "SDS-CP036-powercast_Wk03_Section1_Business_Report.md"
    consolidated_report_path = BASE_DIR / "SDS-CP036-powercast_Wk03_Report_Business.md"

    # Links to the CSV artifacts written for the active mode
    if not BACKTEST:
        csv_line = "[Model Comparison - CSV]({})".format("model_comparison.csv")
        extra = []
    else:
        csv_line = "[Model Comparison (folds) - CSV]({})".format("model_comparison_folds.csv")
        extra = ["[Aggregate Summary - CSV]({})".format("model_comparison_aggregate.csv"),
                 "[Champion Models - CSV]({})".format("champion_models.csv")]

    lines = [
        "# {} - Week 3 Section 1: Model Selection & Training".format(BASE_PROJECT_NAME),
        "",
        "Profile: **{}**".format(PROFILE),
        "",
        "## Key Questions Answered",
        "",
        "Q: Which machine learning models did you choose for forecasting power consumption, and what motivated your selections?",
        "A: We compared a simple Baseline (naive), a statistical time-series model (SARIMAX) that handles daily seasonality, an optional Prophet for trend/seasonality decomposition, and an optional XGBoost model with lag features for non-linear patterns.",
        "",
        "Q: How did you structure your models to handle the multi-zone prediction task (separate models vs. multi-output)?",
        "A: We trained separate models for each zone (Zone 1, Zone 2, Zone 3). This avoids cross-zone interference and keeps insights clear for operations teams.",
        "",
        "Q: What challenges did you encounter during model training, and how did you address them?",
        "A: Runtime and data quirks were the main issues. We used hourly resampling and capped history for speed; we normalized column headers to remove double spaces; and we set lighter SARIMAX and XGBoost defaults for fast, reliable runs. Prophet is optional and limited in dev to keep execution snappy.",
        "",
        csv_line,
    ] + ([""] + extra if extra else []) + [
        "",
        "---",
        "",
        "## Business Value Summary (Executive View)",
        "- Faster iteration: Profiles (dev/preprod/final) let us move from quick smoke-tests to rigorous selection without changing code.",
        "- Clear decisions: Side-by-side metrics identify the best model per zone; the final profile adds fold averages and stability checks.",
        "- Reduced risk: Using RMSE/MAE/MAPE together prevents optimizing for a single number that might miss operational errors.",
        "- Transparency: Reproducible splits, consistent resampling, and saved artifacts make results easy to explain to stakeholders.",
        "- Scalability: The per-zone approach and shared pipeline scale cleanly as new data or zones are added.",
    ]

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    with open(section_report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    # ===== Upsert into consolidated (replace-or-append) =====
    # Both hyphen and en-dash spellings of each header are listed so an
    # existing block is recognised whichever dash it was written with.
    SECTION1_HEADERS = [
        "# SDS-CP036-powercast - Week 3 Section 1: Model Selection & Training",
        "# SDS-CP036-powercast – Week 3 Section 1: Model Selection & Training",
    ]
    SECTION2_HEADERS = [
        "# SDS-CP036-powercast - Week 3 Section 2: MLflow Experiment Tracking",
        "# SDS-CP036-powercast – Week 3 Section 2: MLflow Experiment Tracking",
    ]
    section_text = Path(section_report_path).read_text(encoding="utf-8")
    upsert_section_in_consolidated(
        consolidated_path=consolidated_report_path,
        section_text=section_text,
        header_variants=SECTION1_HEADERS,
        insert_order_hint=SECTION2_HEADERS  # keep S1 before S2
    )

    # Debug info
    print("Section report path:", section_report_path.resolve())
    print("Consolidated report path:", consolidated_report_path.resolve())
    print("Done upserting Section 1.")


07:18:16 - cmdstanpy - INFO - Chain [1] start processing
07:18:18 - cmdstanpy - INFO - Chain [1] done processing


Section report path: /home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk03_Section1_dev/reports/SDS-CP036-powercast_Wk03_Section1_Business_Report.md
Consolidated report path: /home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/SDS-CP036-powercast_Wk03_Report_Business.md
Done upserting Section 1.
