# In [1]:

# ===== Week 1 – Section 2: Temporal Trends (Business, full) =====
from pathlib import Path
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------- Shared Header / Naming ----------
# Identifiers that are combined into the output folder and report file names.
BASE_PROJECT_NAME = "SDS-CP036-powercast"
WEEK = "Wk01"
SECTION = "Section2"
RUN_TAG = "_".join((WEEK, SECTION))

# Anchor on this file's folder when run as a script; notebooks define no
# __file__, so they anchor on the current working directory instead.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd().resolve()

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that looks like the project root.

    A directory qualifies when it contains a ``.git`` or a ``data`` entry.
    At most ten levels are inspected (``start`` itself plus up to nine
    ancestors); when none of them qualifies, ``start`` is returned unchanged.
    """
    lineage = [start, *start.parents]
    for folder in lineage[:10]:
        if any((folder / marker).exists() for marker in (".git", "data")):
            return folder
    return start

def _is_writable(dirpath: Path) -> bool:
    try:
        dirpath.mkdir(parents=True, exist_ok=True)
        tmp = dirpath / "__write_test__"
        tmp.write_text("ok", encoding="utf-8")
        tmp.unlink(missing_ok=True)
        return True
    except Exception:
        return False

REPO_ROOT = find_repo_root(BASE_DIR)
DATA_DIR = REPO_ROOT / "data"

# Write under the repo root when its results/ folder accepts files;
# otherwise keep the outputs next to this script / notebook.
if _is_writable(REPO_ROOT / "results"):
    OUTPUT_ROOT = REPO_ROOT
else:
    OUTPUT_ROOT = BASE_DIR

RESULTS_DIR = OUTPUT_ROOT / "results" / RUN_TAG
PLOTS_DIR = RESULTS_DIR / "plots"
REPORTS_DIR = RESULTS_DIR / "reports"
for d in (RESULTS_DIR, PLOTS_DIR, REPORTS_DIR):
    d.mkdir(parents=True, exist_ok=True)

BUSINESS_SUMMARY_MD = REPORTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Report_Business.md"

# --- Locate energy CSV flexibly (Tetuan only, or ENV override) ---
# Search order: ENERGY_CSV_PATH (when set), the repo's data/ folder, /mnt/data.
env_path = os.environ.get("ENERGY_CSV_PATH")
CANDIDATES = [
    DATA_DIR / "Tetuan City power consumption.csv",
    Path("/mnt/data/Tetuan City power consumption.csv"),
]
if env_path:
    CANDIDATES = [Path(env_path), *CANDIDATES]

# The first existing candidate wins; running out of candidates is fatal.
ENERGY_CSV = None
for candidate in CANDIDATES:
    if candidate.exists():
        ENERGY_CSV = candidate
        break
else:
    raise FileNotFoundError(
        "Could not locate Tetuan dataset. Looked for:\n  - "
        + "\n  - ".join(str(p) for p in CANDIDATES)
    )

# ---------- Loader (normalized headers + aliasing) ----------
def _parse_datetime_series(series: pd.Series) -> pd.Series:
    dt = pd.to_datetime(series, errors="coerce")
    if dt.isna().mean() > 0.5:
        dt = pd.to_datetime(series, errors="coerce", dayfirst=True)
    return dt

def normalize_and_alias(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw frame with canonical column names.

    Steps:
      1. Collapse runs of whitespace in every header and strip the ends.
      2. Parse ``DateTime`` (or build it from ``Date`` + ``Time``).
      3. Rename ``Zone N Power Consumption`` to ``Sub_metering_N`` unless the
         target name is already present.
      4. Check that the timestamp and the three zone columns exist.
      5. Drop rows whose timestamp failed to parse, sort chronologically and
         renumber the index.

    The input frame is left untouched; every change is made on a copy.

    Args:
        df: Raw frame as read from the CSV.

    Returns:
        A new frame sorted by ``DateTime`` with a fresh 0..n-1 index.

    Raises:
        ValueError: If neither ``DateTime`` nor ``Date`` + ``Time`` is
            present, or a required column is still missing after aliasing.
    """
    # Copy first: the header rewrite and the DateTime assignment below would
    # otherwise leak into the caller's frame.
    out = df.copy()

    # 1) normalize headers (handles "Zone 2  Power Consumption" → "Zone 2 Power Consumption");
    #    str() keeps a non-string header (e.g. an integer) from breaking re.sub
    out.columns = [re.sub(r"\s+", " ", str(c)).strip() for c in out.columns]

    # 2) build/use DateTime
    if "DateTime" in out.columns:
        out["DateTime"] = _parse_datetime_series(out["DateTime"].astype(str))
    elif {"Date", "Time"}.issubset(out.columns):
        out["DateTime"] = _parse_datetime_series(
            out["Date"].astype(str) + " " + out["Time"].astype(str)
        )
    else:
        raise ValueError(f"Need 'DateTime' or 'Date'+'Time'. Got: {list(out.columns)}")

    # 3) alias Tetuan zone cols → legacy names expected by downstream code;
    #    a zone column is only renamed when its legacy name is not taken yet
    zone_map = {
        "Zone 1 Power Consumption": "Sub_metering_1",
        "Zone 2 Power Consumption": "Sub_metering_2",
        "Zone 3 Power Consumption": "Sub_metering_3",
    }
    renames = {
        src: dst
        for src, dst in zone_map.items()
        if src in out.columns and dst not in out.columns
    }
    if renames:
        out = out.rename(columns=renames)

    # 4) validate required columns
    required = ["DateTime", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]
    missing = [c for c in required if c not in out.columns]
    if missing:
        raise ValueError(
            f"Missing required columns after normalization/aliasing: {missing}\n"
            f"Available: {out.columns.tolist()}"
        )

    # 5) final clean/sort
    return out.dropna(subset=["DateTime"]).sort_values("DateTime").reset_index(drop=True)

# --- Load & prepare ---
# Try comma-separated first; a single resulting column means the delimiter
# did not match, so the file is read again as semicolon-separated.
df_raw = pd.read_csv(ENERGY_CSV, sep=",", low_memory=False)
if df_raw.shape[1] == 1:
    df_raw = pd.read_csv(ENERGY_CSV, sep=";", low_memory=False)
df = normalize_and_alias(df_raw.copy())

# --- Enrich time features ---
dow_names = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
_stamp = df["DateTime"].dt
df["Date"] = _stamp.date
df["Hour"] = _stamp.hour
df["DoW"]  = _stamp.dayofweek  # 0=Mon,6=Sun
df["DoW_Name"] = df["DoW"].map(dict(enumerate(dow_names)))

# --- Totals & aggregations ---
_zone_cols = ["Sub_metering_1","Sub_metering_2","Sub_metering_3"]
df["Total_kW"] = df[_zone_cols].astype(float).sum(axis=1)

# Per-day mean of each zone and of the total, indexed by timestamp.
daily_avg = df.groupby("Date")[_zone_cols + ["Total_kW"]].mean()
daily_avg.index = pd.to_datetime(daily_avg.index)

# Per-weekday summary of the total, rows ordered Mon..Sun.
_total_by_dow = df.groupby("DoW_Name")["Total_kW"]
dow_stats = _total_by_dow.agg(["mean","median","std","count"]).reindex(dow_names)

# Mean Zone 1 value for each hour of the day.
hourly_zone1 = df.groupby("Hour")["Sub_metering_1"].mean()

# --- Visualizations (matplotlib; one chart per figure, no explicit colors) ---
# 1) Line plot: daily averages (Total + Zones)
fig = plt.figure()
ax = fig.gca()
# (column in daily_avg, legend entry), drawn in this order
for column, legend_label in (
    ("Total_kW", "Total_kW"),
    ("Sub_metering_1", "Zone1"),
    ("Sub_metering_2", "Zone2"),
    ("Sub_metering_3", "Zone3"),
):
    ax.plot(daily_avg.index, daily_avg[column], label=legend_label)
ax.set_title("Daily Averages: Total & Zones")
ax.set_xlabel("Date")
ax.set_ylabel("Average kW")
ax.legend()
line_path = PLOTS_DIR / "section2_daily_averages.png"
fig.tight_layout()
fig.savefig(line_path)
plt.close(fig)

# 2) Box plot: Total_kW by day of week
# Only weekdays that actually occur in the data get a box. The tick labels are
# built from the same day indices as the data, so a weekday with no rows can
# never shift the remaining labels onto the wrong boxes.
present_dows = [i for i in range(7) if (df["DoW"] == i).any()]
data_by_dow = [df.loc[df["DoW"] == i, "Total_kW"].values for i in present_dows]
plt.figure()
plt.boxplot(data_by_dow, showmeans=True)
# Boxes are drawn at x = 1..N by default. Labelling through xticks avoids
# boxplot's `labels=` keyword, which newer Matplotlib releases deprecate.
plt.xticks(
    list(range(1, len(present_dows) + 1)),
    [dow_names[i] for i in present_dows],
)
plt.title("Total Consumption by Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("kW")
box_path = PLOTS_DIR / "section2_box_by_dow.png"
plt.tight_layout()
plt.savefig(box_path)
plt.close()

# 3) Heatmap: Zone 1 mean by day of week (rows) and hour of day (columns)
pivot_z1 = df.pivot_table(index="DoW", columns="Hour", values="Sub_metering_1", aggfunc="mean")
# imshow places cells by array position, not by label. Pin rows to Mon..Sun
# and columns to hours 0..23 so a weekday or hour missing from the data shows
# up as an empty (NaN) cell instead of shifting everything after it.
pivot_z1 = pivot_z1.reindex(index=range(7), columns=range(24))
plt.figure()
plt.imshow(pivot_z1.values, aspect="auto")
plt.title("Zone 1 Avg kW — Hour vs Day-of-Week")
plt.xlabel("Hour of Day")
plt.ylabel("Day (0=Mon ... 6=Sun)")
plt.colorbar()
heat_path = PLOTS_DIR / "section2_heatmap_zone1.png"
plt.tight_layout()
plt.savefig(heat_path)
plt.close()

# --- Business answers (data-driven) ---
# Weekday with the highest mean total; "NA" when no weekday has a mean.
peak_dow = dow_stats["mean"].idxmax() if not dow_stats["mean"].isna().all() else "NA"

# Hour with the highest mean Zone 1 value. NaN hours are dropped first so an
# all-NaN profile falls back to the -1 sentinel (same guard idea as peak_dow)
# instead of failing inside idxmax()/int().
_hourly_z1_valid = hourly_zone1.dropna()
peak_hour_z1 = int(_hourly_z1_valid.idxmax()) if not _hourly_z1_valid.empty else -1

# First and last timestamp of the prepared data, as text for the report.
first_ts = str(df["DateTime"].min())
last_ts  = str(df["DateTime"].max())

# --- Business Report (Markdown) ---
# The report is a single f-string: dataset facts, the computed peak weekday
# and peak hour, and the file names of the three plots are interpolated into
# fixed text. Lines ending in two spaces are Markdown hard line breaks, so
# the trailing whitespace inside the literal is intentional.
# NOTE(review): the text calls Zone 1 a "kitchen proxy"; that wording matches
# the legacy Sub_metering_* naming — confirm it is appropriate for this dataset.
# NOTE(review): peak_hour_z1 uses -1 as its "no data" value, which would be
# rendered as "-1:00" here.
md = f"""# 💼 Week 1 – {SECTION}: Temporal Trends (Business-Friendly Report)

## Dataset
Using file: **{ENERGY_CSV.name}**  
Period: **{first_ts} → {last_ts}**  
Rows: **{len(df):,}**

## Key Questions Answered
**Q1: What daily or weekly patterns are observable in power consumption across the three zones?**  
- The **line plot** of daily averages (Total & Zones) shows overall movement and relative contribution by zone.  
- By weekday, average total usage peaks on **{peak_dow}** based on the dataset's mean profile.

**Q2: Are there seasonal or time-of-day peaks and dips in energy usage?**  
- The **heatmap** for Zone 1 (kitchen proxy) highlights typical time-of-day peaks; the highest average hour is around **{peak_hour_z1}:00**.  
- Broader seasonal effects can be explored by comparing monthly averages (extendable in this section if needed).

**Q3: Which visualizations helped you uncover these patterns?**  
- **Line plot** (daily averages): `plots/{line_path.name}`  
- **Box plot** (by day of week): `plots/{box_path.name}`  
- **Heatmap** (hour vs day for Zone 1): `plots/{heat_path.name}`

## What we computed
- Canonical **DateTime** and zone aliasing (**Zone 1/2/3 → Sub_metering_1/2/3**).  
- **Daily averages** of total and per-zone consumption.  
- **Day-of-week distribution** (box plot) for total consumption.  
- **Hour × day heatmap** for Zone 1.

"""

# Write the report (overwriting any previous run's file) as UTF-8, since the
# text contains non-ASCII characters.
BUSINESS_SUMMARY_MD.write_text(md, encoding="utf-8")

# Console summary: where the report went and which plot files were produced.
print("✅ Section 2 (Business, full) complete.")
print("- Report:", BUSINESS_SUMMARY_MD)
print("- Plots:", line_path.name, box_path.name, heat_path.name)


# ✅ Section 2 (Business, full) complete.
# - Report: /home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk01_Section2/reports/SDS-CP036-powercast_Wk01_Section2_Report_Business.md
# - Plots: section2_daily_averages.png section2_box_by_dow.png section2_heatmap_zone1.png
