# In [1]:
# ===== Week 1 – Section 4: Anomalies & Outliers (Business, full) =====
from pathlib import Path
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Identifiers used to name the output folders and report files of this run.
BASE_PROJECT_NAME = "SDS-CP036-powercast"
WEEK = "Wk01"
SECTION = "Section4"
RUN_TAG = "_".join((WEEK, SECTION))

# Directory of this script when run as a file; the current working directory
# otherwise (e.g. inside a notebook, where __file__ is not defined).
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd().resolve()

def find_repo_root(start: Path) -> Path:
    """Locate the project root by walking upwards from ``start``.

    ``start`` and then each of its ancestors are inspected, at most ten
    directories in total. The first one that contains a ``.git`` or ``data``
    entry is returned.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first matching directory, or ``start`` itself when none of the
        inspected directories contains a marker.
    """
    markers = (".git", "data")
    # start plus its ancestors, capped at ten candidates in total.
    for candidate in [start, *start.parents][:10]:
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return start

def _is_writable(dirpath: Path) -> bool:
    """Report whether files can be created inside ``dirpath``.

    The directory (including missing parents) is created if necessary, then a
    small probe file is written and removed again. Any exception raised along
    the way is treated as "not writable".

    Args:
        dirpath: Directory to probe; it is created as a side effect.

    Returns:
        ``True`` when the probe file was written and removed, else ``False``.
    """
    try:
        dirpath.mkdir(parents=True, exist_ok=True)
        probe = dirpath / "__write_test__"
        probe.write_text("ok", encoding="utf-8")
        probe.unlink(missing_ok=True)
    except Exception:
        # Best-effort check: every failure simply means "cannot write here".
        return False
    return True

# --- Project layout ---------------------------------------------------------
REPO_ROOT = find_repo_root(BASE_DIR)
DATA_DIR = REPO_ROOT / "data"

# Write under the repo root when its "results" folder is writable; otherwise
# fall back to the script / working directory.
if _is_writable(REPO_ROOT / "results"):
    OUTPUT_ROOT = REPO_ROOT
else:
    OUTPUT_ROOT = BASE_DIR

RESULTS_DIR = OUTPUT_ROOT / "results" / RUN_TAG
PLOTS_DIR = RESULTS_DIR / "plots"
REPORTS_DIR = RESULTS_DIR / "reports"
for out_dir in (RESULTS_DIR, PLOTS_DIR, REPORTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

BUSINESS_SUMMARY_MD = REPORTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Report_Business.md"

# --- Input CSV discovery ------------------------------------------------------
# Search order: ENERGY_CSV_PATH (when set), the repo's data folder, /mnt/data.
CANDIDATES = [
    DATA_DIR / "Tetuan City power consumption.csv",
    Path("/mnt/data/Tetuan City power consumption.csv"),
]
env_path = os.environ.get("ENERGY_CSV_PATH")
if env_path:
    CANDIDATES.insert(0, Path(env_path))

# First candidate that exists on disk, or None when none does.
ENERGY_CSV = next(filter(Path.exists, CANDIDATES), None)
if ENERGY_CSV is None:
    searched = "\n  - ".join(str(candidate) for candidate in CANDIDATES)
    raise FileNotFoundError("Tetuan CSV not found:\n  - " + searched)

def _parse_datetime_series(series: pd.Series) -> pd.Series:
    """Convert ``series`` to datetimes, retrying day-first when parsing mostly fails.

    Values that cannot be parsed become ``NaT``. When more than half of the
    values fail on the first pass, the whole series is parsed again with
    ``dayfirst=True`` and that second result is returned instead.

    Args:
        series: Values to convert.

    Returns:
        The converted series.
    """
    parsed = pd.to_datetime(series, errors="coerce")
    failed_share = parsed.isna().mean()
    if failed_share > 0.5:
        return pd.to_datetime(series, errors="coerce", dayfirst=True)
    return parsed

def normalize_and_alias(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with a parsed ``DateTime`` and canonical zone names.

    The input frame is left untouched; all changes are made on a copy:

    1. Column labels are converted to ``str``, runs of whitespace are collapsed
       to a single space, and leading/trailing whitespace is stripped.
    2. ``DateTime`` is parsed from the existing ``DateTime`` column, or built
       from ``Date`` + ``" "`` + ``Time`` when only those two exist.
    3. ``Zone N Power Consumption`` columns are renamed to ``Sub_metering_N``
       (N = 1..3) unless the target name is already present.
    4. Rows whose ``DateTime`` is missing after parsing are dropped, the rest
       is sorted by ``DateTime`` and re-indexed from 0.

    Args:
        df: Raw frame as read from the CSV.

    Returns:
        A new, time-sorted DataFrame containing at least ``DateTime`` and
        ``Sub_metering_1`` .. ``Sub_metering_3``.

    Raises:
        ValueError: If neither ``DateTime`` nor ``Date`` + ``Time`` is present,
            or if any required column is still missing after aliasing.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    out = df.copy()
    # str() keeps the regex from failing on non-string labels (e.g. integers).
    out.columns = [re.sub(r"\s+", " ", str(col)).strip() for col in out.columns]

    if "DateTime" in out.columns:
        raw_stamps = out["DateTime"].astype(str)
    elif {"Date", "Time"}.issubset(out.columns):
        raw_stamps = out["Date"].astype(str) + " " + out["Time"].astype(str)
    else:
        raise ValueError("Need 'DateTime' or 'Date'+'Time'")
    out["DateTime"] = _parse_datetime_series(raw_stamps)

    zone_map = {
        "Zone 1 Power Consumption": "Sub_metering_1",
        "Zone 2 Power Consumption": "Sub_metering_2",
        "Zone 3 Power Consumption": "Sub_metering_3",
    }
    # Only alias a zone column when its canonical name is not already taken.
    renames = {
        src: dst
        for src, dst in zone_map.items()
        if src in out.columns and dst not in out.columns
    }
    out = out.rename(columns=renames)

    req = ["DateTime", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]
    miss = [c for c in req if c not in out.columns]
    if miss:
        raise ValueError(f"Missing after alias: {miss}")

    return out.dropna(subset=["DateTime"]).sort_values("DateTime").reset_index(drop=True)

# --- Load data ----------------------------------------------------------------
# Read as comma-separated first; a single resulting column means the separator
# was wrong, so retry with ';'.
df_raw = pd.read_csv(ENERGY_CSV, sep=",", low_memory=False)
if len(df_raw.columns) == 1:
    df_raw = pd.read_csv(ENERGY_CSV, sep=";", low_memory=False)
df = normalize_and_alias(df_raw.copy())
df["Total_kW"] = df[["Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]].astype(float).sum(axis=1)

# --- Daily totals and z-score anomalies -----------------------------------------
# min_count=1 makes calendar days without any reading NaN instead of a
# fabricated total of 0; such a fake 0 would distort mean/std and could itself
# be reported as an anomaly. Those empty days are dropped before the statistics.
daily = (
    df.set_index("DateTime")["Total_kW"]
    .resample("D")
    .sum(min_count=1)
    .dropna()
    .rename("Total_kW_daily")
    .to_frame()
)
mu = daily["Total_kW_daily"].mean()
sigma = daily["Total_kW_daily"].std()
# std() is NaN with fewer than two days and 0 for constant data; in both cases
# divide by 1.0 so every z-score is a finite 0 rather than NaN/inf.
sigma_ok = bool(pd.notna(sigma) and sigma)
daily["zscore"] = (daily["Total_kW_daily"] - mu) / (sigma if sigma_ok else 1.0)
anoms = daily[daily["zscore"].abs() >= 3.0]

# --- Plot 1: time series with ±3σ thresholds and flagged days -------------------
plt.figure()
plt.plot(daily.index, daily["Total_kW_daily"], label="Daily total")
if pd.notna(sigma):
    plt.axhline(mu + 3 * sigma, linestyle="--", label="+3σ")
    plt.axhline(mu - 3 * sigma, linestyle="--", label="-3σ")
if not anoms.empty:
    plt.scatter(anoms.index, anoms["Total_kW_daily"], color="red", zorder=3, label="Anomaly")
plt.title("Daily Total Consumption with ±3σ Thresholds")
plt.xlabel("Date")
plt.ylabel("kW")
plt.legend()  # the labels above are only visible once a legend is drawn
ts_path = PLOTS_DIR / "section4_daily_ts_anoms.png"
plt.tight_layout()
plt.savefig(ts_path)
plt.close()

# --- Plot 2: boxplot of daily totals ----------------------------------------------
plt.figure()
plt.boxplot(daily["Total_kW_daily"].values, showmeans=True)
plt.title("Distribution of Daily Totals")
plt.ylabel("kW")
box_path = PLOTS_DIR / "section4_box_daily.png"
plt.tight_layout()
plt.savefig(box_path)
plt.close()

# --- Plot 3: histogram of daily totals --------------------------------------------
plt.figure()
plt.hist(daily["Total_kW_daily"].values, bins=20)
plt.title("Histogram of Daily Totals")
plt.xlabel("kW")
hist_path = PLOTS_DIR / "section4_hist_daily.png"
plt.tight_layout()
plt.savefig(hist_path)
plt.close()

# --- Business report --------------------------------------------------------------
first_ts = str(df["DateTime"].min())
last_ts = str(df["DateTime"].max())
n_anoms = int(len(anoms))
top_list = []
if n_anoms > 0:
    # Up to five most extreme days, ranked by absolute z-score.
    srt = anoms.sort_values("zscore", key=lambda s: s.abs(), ascending=False).head(5)
    top_list = [f"{day.date()} (z={z:.2f})" for day, z in srt["zscore"].items()]

md = f"""# 💼 Week 1 – {SECTION}: Anomalies & Outliers (Business-Friendly Report)

## Dataset
Using file: **{ENERGY_CSV.name}**  
Period: **{first_ts} → {last_ts}**  
Rows: **{len(df):,}**

## Key Questions Answered
**Q1: Are there anomalous days in total energy consumption? When?**  
- Detected **{n_anoms}** anomalous day(s) using a ±3σ z-score rule on **daily totals**.
{('- Top examples: ' + ', '.join(top_list)) if top_list else ''}

**Q2: What could explain these anomalies?**  
- Check overlaps with weather or events (e.g., heat waves, holidays, maintenance). You can cross-reference with the Tetuan environmental columns or your calendar.

**Q3: Which visualizations helped you uncover these?**  
- Time series with control limits: `plots/{ts_path.name}`  
- Boxplot of daily totals: `plots/{box_path.name}`  
- Histogram for distribution context: `plots/{hist_path.name}`

## What we computed
- Canonical DateTime & zone aliasing; **Total_kW** across zones.  
- **Daily totals** and a simple **z-score**-based anomaly detector (±3σ).  
- A short list of the most extreme days for review.
"""
BUSINESS_SUMMARY_MD.write_text(md, encoding="utf-8")
print("✅ Section 4 complete.")
print("- Report:", BUSINESS_SUMMARY_MD)
print("- Plots:", ts_path.name, box_path.name, hist_path.name)


# Output:
# ✅ Section 4 complete.
# - Report: /home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk01_Section4/reports/SDS-CP036-powercast_Wk01_Section4_Report_Business.md
# - Plots: section4_daily_ts_anoms.png section4_box_daily.png section4_hist_daily.png
