In [2]:
# ===== Week 1 – Section 3: Environmental Feature Relationships (Business) =====
from pathlib import Path
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

# ---------- Shared Header / Naming ----------
# Identifiers that are stitched into every output file name of this section.
BASE_PROJECT_NAME = "SDS-CP036-powercast"
WEEK, SECTION = "Wk01", "Section3"
RUN_TAG = "_".join((WEEK, SECTION))

# Anchor on the script's own folder when run as a file; a notebook kernel has
# no __file__, so fall back to the current working directory there.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd().resolve()

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that looks like the repo root.

    A directory qualifies when it contains a ``.git``, ``data`` or
    ``weather_data`` entry. Only *start* and its nine closest ancestors are
    inspected; when none of them qualifies, *start* itself is returned.
    """
    markers = (".git", "data", "weather_data")
    # start plus its ancestors, capped at ten candidates in total.
    candidates = (start, *start.parents)[:10]
    for candidate in candidates:
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return start

REPO_ROOT = find_repo_root(BASE_DIR)

# Input locations
DATA_DIR = REPO_ROOT.joinpath("data")
WEATHER_DIR = REPO_ROOT.joinpath("weather_data")
ENERGY_CSV = DATA_DIR.joinpath("power_consumption.csv")
WEATHER_CSV = WEATHER_DIR.joinpath("weather_2006_2010.csv")

# Output locations (created up front so later writes cannot fail on a missing folder)
RESULTS_DIR = REPO_ROOT.joinpath("results", RUN_TAG)
PLOTS_DIR = RESULTS_DIR.joinpath("plots")
REPORTS_DIR = RESULTS_DIR.joinpath("reports")
for d in (RESULTS_DIR, PLOTS_DIR, REPORTS_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Every artefact name starts with "<project>_<run tag>".
_FILE_PREFIX = f"{BASE_PROJECT_NAME}_{RUN_TAG}"
BUSINESS_SUMMARY_MD = REPORTS_DIR / f"{_FILE_PREFIX}_Report_Business.md"
BUSINESS_REPORT_MD = RESULTS_DIR / f"{_FILE_PREFIX}_Business_Report.md"
PLOT_ZONE_BAR_COMBINED = PLOTS_DIR / f"{_FILE_PREFIX}_Plot_Corr_AllZones.png"

# Columns we care about: raw CSV column name -> business-friendly label.
ZONE_RENAME = {
    "Sub_metering_1": "Zone 1 (Kitchen)",
    "Sub_metering_2": "Zone 2 (Laundry)",
    "Sub_metering_3": "Zone 3 (Water Heater & AC)",
}
WEATHER_RENAME = {
    "temperature_2m": "Temperature",
    "relative_humidity_2m": "Humidity",
    "wind_speed_10m": "Wind Speed",
    "shortwave_radiation": "Solar Radiation",
}

# The raw column lists are exactly the keys of the rename maps, in the same order.
ZONES = list(ZONE_RENAME)
WEATHER_KEEP = list(WEATHER_RENAME)

# ---------- Loaders (numeric-only resample!!) ----------
def load_energy(path: Path, zones=None) -> pd.DataFrame:
    """Load the energy CSV and return hourly mean readings per zone.

    Args:
        path: Location of the energy CSV. Comma-separated files are tried
            first; if that yields a single column the file is re-read with
            ``;`` as the delimiter.
        zones: Names of the zone columns to keep. Defaults to the
            module-level ``ZONES`` list.

    Returns:
        A DataFrame indexed by hourly ``DateTime`` with one float column per
        zone. Hours in which any zone has no valid reading are dropped.

    Raises:
        FileNotFoundError: If *path* does not exist.
        ValueError: If no ``Date``/``Time`` columns can be found.
        KeyError: If any requested zone column is missing from the file.
    """
    if not path.exists():
        raise FileNotFoundError(f"Energy file not found: {path}")
    zone_cols = list(ZONES if zones is None else zones)

    df = pd.read_csv(path, sep=",", low_memory=False)
    if len(df.columns) == 1:
        # Everything landed in one column, so "," was the wrong delimiter.
        df = pd.read_csv(path, sep=";", low_memory=False)
    df.columns = df.columns.str.strip()

    # Handle a packed "Date,Time" column if the separate columns are absent.
    if "Date,Time" in df.columns and not {"Date", "Time"}.issubset(df.columns):
        dt = df["Date,Time"].astype(str).str.split(",", n=1, expand=True)
        dt.columns = ["Date", "Time"]
        df = pd.concat([df.drop(columns=["Date,Time"]), dt], axis=1)

    if not {"Date", "Time"}.issubset(df.columns):
        raise ValueError("Missing Date/Time columns after parsing.")
    missing = [z for z in zone_cols if z not in df.columns]
    if missing:
        # Fail with the offending names instead of a bare pandas KeyError.
        raise KeyError(f"Energy file is missing zone column(s): {missing}")

    # Build DateTime; unparseable rows become NaT and are discarded.
    df["DateTime"] = pd.to_datetime(
        df["Date"].astype(str) + " " + df["Time"].astype(str),
        dayfirst=True,
        errors="coerce",
    )
    df = df.dropna(subset=["DateTime"]).sort_values("DateTime").reset_index(drop=True)

    # Non-numeric placeholders in the zone columns are coerced to NaN.
    for z in zone_cols:
        df[z] = pd.to_numeric(df[z], errors="coerce")

    # Keep only the zone columns so the hourly mean never sees string data.
    # An explicit Hour offset is used rather than the "H" alias, which pandas
    # 2.2 deprecated.
    hourly = df.set_index("DateTime")[zone_cols].resample(pd.offsets.Hour()).mean()
    return hourly.dropna(how="any")

def load_weather(path: Path, features=None) -> pd.DataFrame:
    """Load the weather CSV and return hourly mean values per feature.

    Args:
        path: Location of the weather CSV.
        features: Names of the weather columns to keep. Defaults to the
            module-level ``WEATHER_KEEP`` list.

    Returns:
        A DataFrame indexed by hourly ``DateTime`` with one float column per
        feature. Hours in which any feature has no valid value are dropped.

    Raises:
        FileNotFoundError: If *path* does not exist.
        KeyError: If the file has no timestamp column (``time``, ``datetime``
            or ``date_time``, case-insensitive) or lacks a requested feature.
    """
    if not path.exists():
        raise FileNotFoundError(f"Weather file not found: {path}")
    feature_cols = list(WEATHER_KEEP if features is None else features)

    df = pd.read_csv(path)
    # Strip header whitespace, as the energy loader does, so padded names match.
    df.columns = df.columns.str.strip()

    # Find datetime column (handle 'time'/'datetime'/'date_time')
    dt_col = next(
        (c for c in df.columns if c.lower() in ("time", "datetime", "date_time")),
        None,
    )
    if dt_col is None:
        raise KeyError("Weather must include a 'time', 'datetime' or 'date_time' column.")
    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Weather file is missing feature column(s): {missing}")

    # NOTE(review): dayfirst=True is kept from the original; confirm it matches
    # the timestamp format actually used in the weather file.
    df["DateTime"] = pd.to_datetime(df[dt_col], dayfirst=True, errors="coerce")
    df = df.dropna(subset=["DateTime"]).sort_values("DateTime").reset_index(drop=True)

    # Keep & coerce numeric weather features only
    df = df[["DateTime"] + feature_cols].copy()
    for c in feature_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # An explicit Hour offset is used rather than the "H" alias, which pandas
    # 2.2 deprecated.
    hourly = df.set_index("DateTime")[feature_cols].resample(pd.offsets.Hour()).mean()
    return hourly.dropna(how="any")

energy = load_energy(ENERGY_CSV)
weather = load_weather(WEATHER_CSV)

# ---------- Merge (nearest hour) ----------
# Each hourly energy row is paired with the closest weather row at most one
# hour away. The tolerance is built from keyword arguments because the "1H"
# string relies on an uppercase unit alias that pandas 2.2 deprecated.
merged = pd.merge_asof(
    energy.sort_index().reset_index(),
    weather.sort_index().reset_index(),
    on="DateTime",
    direction="nearest",
    tolerance=pd.Timedelta(hours=1),
).dropna()

if merged.empty:
    # Without overlapping hours every correlation would be NaN and the plot
    # and report would be produced from no data at all.
    raise ValueError(
        "No overlapping hourly rows between energy and weather data; "
        "check that both files cover the same period."
    )

# Rename for business clarity
merged = merged.rename(columns={**WEATHER_RENAME, **ZONE_RENAME})

weather_cols = list(WEATHER_RENAME.values())
zone_cols = list(ZONE_RENAME.values())

# ---------- Correlations ----------
# One column per zone, one row per weather variable.
corr_rows = [
    merged[weather_cols + [zone]].corr().loc[weather_cols, zone].rename(zone)
    for zone in zone_cols
]
corr_df = pd.concat(corr_rows, axis=1)

# ---------- Plot combined bar ----------
# DataFrame.plot opens its own figure unless it is handed an Axes. Creating the
# figure and axes explicitly makes the 10x6 size apply to the saved image and
# avoids leaving an empty, never-closed figure behind.
fig, ax = plt.subplots(figsize=(10, 6))
corr_df.plot(kind="bar", ax=ax)
ax.set_title("Correlation of Weather Variables with Zone Usage")
ax.set_ylabel("Correlation Coefficient")
ax.legend(title="Zone", loc="upper right")
fig.tight_layout()
fig.savefig(PLOT_ZONE_BAR_COMBINED)
plt.close(fig)

# ---------- Business Summary ----------
# NOTE: the answers below are fixed narrative text; they are not derived from
# corr_df at run time.
_QUESTIONS_AND_ANSWERS = (
    (
        "Q1: Which environmental variables correlate most with energy usage?",
        "I compared temperature, humidity, wind, and sunlight with energy use in each zone. The AC/Water Heating zone showed the strongest link to temperature and wind; the kitchen was less weather-sensitive.",
    ),
    (
        "Q2: Are any variables inversely correlated with demand in specific zones?",
        "Yes. In warmer hours, water heating demand can drop (negative link with temperature), while cooling may increase.",
    ),
    (
        "Q3: Did your analysis differ across zones? Why might that be?",
        "Yes. Each zone powers different appliances: the kitchen follows meal schedules; laundry is sporadic; HVAC/water heating track outdoor conditions more closely.",
    ),
)

# Each bold question line ends with two spaces (a Markdown hard line break)
# before its answer; entries are separated by one blank line.
biz_md = (
    f"# 💼 Week 1 – {SECTION}: Environmental Feature Relationships (Business-Friendly Report)\n"
    "\n"
    "## Key Questions Answered\n"
    + "\n\n".join(
        f"**{question}**  \n{answer}" for question, answer in _QUESTIONS_AND_ANSWERS
    )
    + "\n"
)
BUSINESS_SUMMARY_MD.write_text(biz_md, encoding="utf-8")

# Top-level index page: points at the summary and embeds the plot.
_report_lines = [
    f"# {BASE_PROJECT_NAME} — {RUN_TAG} — Business Report",
    "",
    f"🔗 **Open Business Summary:** `{BUSINESS_SUMMARY_MD.name}`",
    "",
    "### Visuals",
    f"- ![{PLOT_ZONE_BAR_COMBINED.stem}](plots/{PLOT_ZONE_BAR_COMBINED.name})",
    "",
]
BUSINESS_REPORT_MD.write_text("\n".join(_report_lines), encoding="utf-8")

print("✅ Section 3 (Business) complete.")


✅ Section 3 (Business) complete.


<Figure size 1000x600 with 0 Axes>