# In [1]:
# ===== Week 1 – Section 3: Drivers & Correlations (Business, full) =====
from pathlib import Path
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Identifiers used to name this run's output folder and report file.
BASE_PROJECT_NAME = "SDS-CP036-powercast"
WEEK = "Wk01"
SECTION = "Section3"
RUN_TAG = "_".join((WEEK, SECTION))

# Anchor directory: the folder holding this file when executed as a script,
# otherwise (e.g. in a notebook, where __file__ is not defined) the current
# working directory.
if "__file__" in globals():
    BASE_DIR = Path(__file__).resolve().parent
else:
    BASE_DIR = Path.cwd().resolve()

def find_repo_root(start: Path) -> Path:
    """Locate the project root by searching upward from *start*.

    A directory qualifies as the root when it contains an entry named
    ``.git`` or ``data``. At most ten directories are inspected: *start*
    itself followed by up to nine of its ancestors, nearest first.

    Returns:
        The first qualifying directory, or *start* unchanged when none of
        the inspected directories qualifies.
    """
    markers = (".git", "data")
    # ``start`` plus its ancestors (nearest first), capped at ten levels.
    for candidate in (start, *start.parents)[:10]:
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return start

def _is_writable(dirpath: Path) -> bool:
    """Report whether *dirpath* can be created (if needed) and written to.

    The check is performed for real: the directory is created along with any
    missing parents, a small probe file named ``__write_test__`` is written
    inside it and then removed. Any exception raised along the way is treated
    as "not writable".

    Returns:
        True when every step succeeded, False otherwise.
    """
    try:
        dirpath.mkdir(parents=True, exist_ok=True)
        probe = dirpath / "__write_test__"
        probe.write_text("ok", encoding="utf-8")
        probe.unlink(missing_ok=True)
    except Exception:
        # Best-effort probe: the caller only needs a yes/no answer.
        return False
    else:
        return True

# Resolve the project layout relative to where this code is running.
REPO_ROOT = find_repo_root(BASE_DIR)
DATA_DIR = REPO_ROOT / "data"

# Write under the repo root when <repo>/results can be created and written;
# otherwise fall back to the directory this code runs from.
if _is_writable(REPO_ROOT / "results"):
    OUTPUT_ROOT = REPO_ROOT
else:
    OUTPUT_ROOT = BASE_DIR

RESULTS_DIR = OUTPUT_ROOT / "results" / RUN_TAG
PLOTS_DIR = RESULTS_DIR / "plots"
REPORTS_DIR = RESULTS_DIR / "reports"
for d in (RESULTS_DIR, PLOTS_DIR, REPORTS_DIR):
    d.mkdir(parents=True, exist_ok=True)

BUSINESS_SUMMARY_MD = REPORTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Report_Business.md"

# Candidate CSV locations, highest priority first. A path supplied through
# the ENERGY_CSV_PATH environment variable is tried before the defaults.
CANDIDATES = [
    DATA_DIR / "Tetuan City power consumption.csv",
    Path("/mnt/data/Tetuan City power consumption.csv"),
]
env_path = os.environ.get("ENERGY_CSV_PATH")
if env_path:
    CANDIDATES.insert(0, Path(env_path))

# First candidate that exists on disk, or None when there is none.
ENERGY_CSV = next(filter(Path.exists, CANDIDATES), None)
if ENERGY_CSV is None:
    raise FileNotFoundError(
        "Tetuan CSV not found:\n  - " + "\n  - ".join(map(str, CANDIDATES))
    )

def _parse_datetime_series(series: pd.Series) -> pd.Series:
    """Parse *series* into datetimes, retrying day-first when most values fail.

    Values that cannot be parsed become ``NaT``. If more than half of the
    values are ``NaT`` after the default parse, the whole series is parsed
    again with ``dayfirst=True`` and that second result is returned.
    """
    parsed = pd.to_datetime(series, errors="coerce")
    failed_share = parsed.isna().mean()
    if failed_share > 0.5:
        return pd.to_datetime(series, errors="coerce", dayfirst=True)
    return parsed

def normalize_and_alias(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* with a parsed ``DateTime`` and zone aliases.

    Steps, in order:

    1. Collapse runs of whitespace in column names to one space and strip
       leading/trailing whitespace.
    2. Parse ``DateTime`` (or build it from ``Date`` + ``Time``) via
       ``_parse_datetime_series``.
    3. Rename ``Zone N Power Consumption`` to ``Sub_metering_N`` for N in 1..3,
       skipping any zone whose target name already exists.
    4. Drop rows whose timestamp could not be parsed, sort by ``DateTime`` and
       reset the index.

    The input frame is never modified: all work happens on a copy, so the
    caller's frame keeps its original column names and values even when this
    function raises.

    Args:
        df: Raw frame as read from the CSV.

    Returns:
        A new DataFrame containing at least ``DateTime``, ``Sub_metering_1``,
        ``Sub_metering_2`` and ``Sub_metering_3``.

    Raises:
        ValueError: If neither ``DateTime`` nor both ``Date`` and ``Time`` are
            present, or if a required column is still missing after aliasing.
    """
    # Work on a copy so the caller's frame is not relabelled or overwritten.
    out = df.copy()
    out.columns = [re.sub(r"\s+", " ", c).strip() for c in out.columns]

    if "DateTime" in out.columns:
        out["DateTime"] = _parse_datetime_series(out["DateTime"].astype(str))
    elif {"Date", "Time"}.issubset(out.columns):
        combined = out["Date"].astype(str) + " " + out["Time"].astype(str)
        out["DateTime"] = _parse_datetime_series(combined)
    else:
        raise ValueError("Need 'DateTime' or 'Date'+'Time'")

    zone_map = {
        "Zone 1 Power Consumption": "Sub_metering_1",
        "Zone 2 Power Consumption": "Sub_metering_2",
        "Zone 3 Power Consumption": "Sub_metering_3",
    }
    # Only alias a zone when its source exists and the target name is free,
    # so an already-present Sub_metering_N column is never duplicated.
    renames = {
        src: dst
        for src, dst in zone_map.items()
        if src in out.columns and dst not in out.columns
    }
    out = out.rename(columns=renames)

    required = ["DateTime", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]
    missing = [c for c in required if c not in out.columns]
    if missing:
        raise ValueError(f"Missing after alias: {missing}")

    return out.dropna(subset=["DateTime"]).sort_values("DateTime").reset_index(drop=True)

# Read as comma-separated first; a single resulting column means the
# delimiter did not match, so retry with semicolons.
df_raw = pd.read_csv(ENERGY_CSV, sep=",", low_memory=False)
if df_raw.shape[1] == 1:
    df_raw = pd.read_csv(ENERGY_CSV, sep=";", low_memory=False)
df = normalize_and_alias(df_raw.copy())

# Row-wise sum of the three zone columns.
df["Total_kW"] = (
    df[["Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]]
    .astype(float)
    .sum(axis=1)
)

# Candidate driver columns, kept only when present in the data.
feature_cols = [
    c
    for c in [
        "Temperature",
        "Humidity",
        "Wind Speed",
        "general diffuse flows",
        "diffuse flows",
    ]
    if c in df.columns
]

# Daily means of every numeric column, then pairwise correlations between
# the consumption columns and the available features.
df_day = (
    df.set_index("DateTime")
    .resample("D")
    .mean(numeric_only=True)
)
corr_df = df_day[
    ["Total_kW", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3", *feature_cols]
].corr()

# Render the correlation matrix as an image with labelled axes and save it.
heatmap_path = PLOTS_DIR / "section3_correlation_heatmap.png"

plt.figure()
plt.imshow(corr_df.values, aspect="auto")
plt.xticks(
    range(corr_df.shape[1]),
    corr_df.columns,
    rotation=45,
    ha="right",
)
plt.yticks(range(corr_df.shape[0]), corr_df.index)
plt.title("Correlation Matrix (Daily Means)")
plt.colorbar()
plt.tight_layout()
plt.savefig(heatmap_path)
plt.close()

def _save_scatter_vs_total(feature: str, filename: str):
    """Scatter daily-mean *feature* against daily-mean Total_kW and save it.

    Returns the path of the PNG written under ``PLOTS_DIR``.
    """
    plt.figure()
    plt.scatter(df_day[feature], df_day["Total_kW"])
    plt.xlabel(feature)
    plt.ylabel("Total_kW")
    plt.title(f"{feature} vs Total_kW (Daily Means)")
    target = PLOTS_DIR / filename
    plt.tight_layout()
    plt.savefig(target)
    plt.close()
    return target


# File names (not full paths) of the scatter plots actually produced.
scatter_paths = []
if "Temperature" in feature_cols:
    sc1 = _save_scatter_vs_total("Temperature", "section3_scatter_temp_total.png")
    scatter_paths.append(sc1.name)
if "Humidity" in feature_cols:
    sc2 = _save_scatter_vs_total("Humidity", "section3_scatter_humidity_total.png")
    scatter_paths.append(sc2.name)

def top_driver_for(target: str):
    """Describe the feature with the largest absolute correlation to *target*.

    Reads the *target* column of the module-level ``corr_df`` for the rows
    listed in ``feature_cols`` and ranks them by absolute value.

    Returns:
        ``"N/A"`` when ``feature_cols`` is empty, otherwise a string of the
        form ``"<feature> (|r|=<value to two decimals>)"``.
    """
    if not feature_cols:
        return "N/A"
    strengths = corr_df.loc[feature_cols, target].abs()
    ranked = strengths.sort_values(ascending=False)
    winner, r_abs = ranked.index[0], ranked.iloc[0]
    return f"{winner} (|r|={r_abs:.2f})"

# Strongest driver for the total and for each zone, in that order.
best_total, best_z1, best_z2, best_z3 = map(
    top_driver_for,
    ("Total_kW", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"),
)

# First and last timestamps in the cleaned data, as display strings.
first_ts = str(df["DateTime"].min())
last_ts = str(df["DateTime"].max())

# Markdown report body. Trailing double spaces are intentional: they are
# Markdown hard line breaks.
md = f"""# 💼 Week 1 – {SECTION}: Drivers & Correlations (Business-Friendly Report)

## Dataset
Using file: **{ENERGY_CSV.name}**  
Period: **{first_ts} → {last_ts}**  
Rows: **{len(df):,}**

## Key Questions Answered
**Q1: Which factors appear most correlated with total and per-zone consumption?**  
- Total consumption driver: **{best_total}**  
- Zone 1 driver: **{best_z1}**  
- Zone 2 driver: **{best_z2}**  
- Zone 3 driver: **{best_z3}**

**Q2: Are relationships linear or do they show thresholds?**  
- The **scatter plots** (daily means) help reveal linear vs. curved patterns and potential thresholds. Patterns tend to be smooth with some spread due to operational effects.

**Q3: Which visualizations helped you uncover these patterns?**  
- **Correlation matrix** of daily means: `plots/{heatmap_path.name}`  
- **Scatter plots** contrasting key drivers vs Total_kW: {", ".join(f'plots/{p}' for p in scatter_paths) if scatter_paths else "N/A"}

## What we computed
- Canonical **DateTime** and zone aliasing (**Zone 1/2/3 → Sub_metering 1/2/3**).  
- Daily means to stabilize noise.  
- Pearson correlations among consumption (Total + Zones) and environmental features.
"""
BUSINESS_SUMMARY_MD.write_text(md, encoding="utf-8")
# Console summary: where the report was written and which plot files exist.
print("✅ Section 3 complete.")
print("- Report:", BUSINESS_SUMMARY_MD)
# List every plot file with one uniform ", " separator. The heatmap is always
# produced; scatter plots are appended only when they were generated, so no
# placeholder is mixed into the list of file names.
print("- Plots:", ", ".join([heatmap_path.name, *scatter_paths]))


# Recorded notebook output:
# ✅ Section 3 complete.
# - Report: /home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk01_Section3/reports/SDS-CP036-powercast_Wk01_Section3_Report_Business.md
# - Plots: section3_correlation_heatmap.png section3_scatter_temp_total.png, section3_scatter_humidity_total.png
