In [2]:
# section5_business.py
# Week 1 – Section 5 (Business): Data Quality & Sensor Anomalies
# Outputs under results/Wk01_Section5:
#  - plots/*.png (before/after hist & boxplots)
#  - SDS-CP036-powercast_Wk01_Section5_Report_Business.md

from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --------------------------
# 0) Paths & helpers
# --------------------------
# Name of the project folder; also used as a prefix for generated artefacts.
BASE_PROJECT_NAME = "SDS-CP036-powercast"
# Tag identifying this section's results sub-folder and output filenames.
RUN_TAG = "Wk01_Section5"

def locate_project_root(start: Path, marker: str = BASE_PROJECT_NAME, max_up: int = 8) -> Path:
    """Find the project root directory relative to ``start``.

    Args:
        start: Path to begin the search from; it is resolved first.
        marker: Directory name that identifies the project root.
        max_up: Number of parent levels to climb after the starting path.

    Returns:
        The inspected path itself when its final component equals ``marker``;
        otherwise the path rebuilt up to and including the first component
        equal to ``marker``. When no inspected path contains ``marker``, the
        resolved ``start`` is returned.
    """
    origin = start.resolve()
    candidate = origin
    for _ in range(max_up + 1):
        if candidate.name == marker:
            return candidate
        components = candidate.parts
        if marker in components:
            # Truncate at the first component that matches the marker.
            return Path(*components[: components.index(marker) + 1])
        candidate = candidate.parent
    return origin

# Anchor every output path on the project root found from the working directory.
CWD = Path.cwd()
BASE_DIR = locate_project_root(CWD, BASE_PROJECT_NAME)
RESULTS_DIR = BASE_DIR.joinpath("results", RUN_TAG)
PLOTS_DIR = RESULTS_DIR.joinpath("plots")
# Create this section's output folders up front (no error if they already exist).
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Folder where the Section 4 script is expected to have written its output.
SEC4_DIR = BASE_DIR.joinpath("results", "Wk01_Section4")

# Filenames Section 4 may have used, in the order they are tried.
SEC4_CANDIDATES = [
    SEC4_DIR / candidate_name
    for candidate_name in (
        "cleaned_power_weather_final.csv",
        f"{BASE_PROJECT_NAME}_Wk01_Section4_cleaned_power_weather_final.csv",
    )
]

def find_cleaned_csv() -> Path:
    """Locate the cleaned CSV written by the Section 4 script.

    Lookup order:
      1. the explicit paths in ``SEC4_CANDIDATES``;
      2. a recursive search of ``SEC4_DIR`` (only if that folder exists);
      3. a recursive search of the whole ``results`` tree under ``BASE_DIR``.

    For the recursive searches the alphabetically first match is returned.

    Raises:
        FileNotFoundError: if no matching file is found anywhere.
    """
    pattern = "*cleaned_power_weather_final.csv"

    direct_hit = next((cand for cand in SEC4_CANDIDATES if cand.exists()), None)
    if direct_hit is not None:
        return direct_hit

    # Narrow search first (Section 4 folder), then the entire results tree.
    search_roots = []
    if SEC4_DIR.exists():
        search_roots.append(SEC4_DIR)
    search_roots.append(BASE_DIR / "results")

    for root in search_roots:
        matches = sorted(root.rglob(pattern))
        if matches:
            return matches[0]

    raise FileNotFoundError(
        "Could not find 'cleaned_power_weather_final.csv'. "
        "Please run Section 4 script first to generate it."
    )

# --------------------------
# 1) Load data
# --------------------------
# Read the Section 4 output, parsing the timestamp column on load.
CLEANED_PREV = find_cleaned_csv()
df = pd.read_csv(CLEANED_PREV, parse_dates=["DateTime"])

# Numeric columns of interest, grouped by origin.
energy_cols = [
    "Global_active_power",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
]
weather_cols = [
    "temperature_2m",
    "relative_humidity_2m",
    "wind_speed_10m",
    "shortwave_radiation",
]

# Be tolerant: keep only the requested columns that are actually present.
keep_cols = ["DateTime"]
keep_cols += [name for name in energy_cols + weather_cols if name in df.columns]
df = df.loc[:, keep_cols].copy()

# Everything after the leading DateTime entry is coerced to numeric;
# values that cannot be parsed become NaN.
for c in keep_cols[1:]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# --------------------------
# 2) Quick quality checks
# --------------------------
# Per-column count of missing values before any cleaning.
missing_before = df.isna().sum().to_dict()

# Count negative readings per sub-meter column (values should be >= 0);
# columns absent from the frame are skipped.
negatives = {
    meter: int((df[meter] < 0).sum())
    for meter in ("Sub_metering_1", "Sub_metering_2", "Sub_metering_3")
    if meter in df.columns
}

# --------------------------
# 3) Plots - BEFORE cleaning
# --------------------------
sns.set(style="whitegrid")

# Histograms (before)
# DataFrame.hist opens its own figure sized by `figsize`, so no separate
# plt.figure() call is made here: an extra one would stay empty, would never
# be closed by the plt.close() below, and shows up in notebook output as
# "<Figure size 1200x800 with 0 Axes>".
numeric_before = df.drop(columns=["DateTime"]).select_dtypes(include=[float, int])
numeric_before.hist(bins=50, figsize=(12, 8))
plt.tight_layout()
PLOT_BEFORE_HIST = PLOTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Plot_Hist_Before.png"
plt.savefig(PLOT_BEFORE_HIST, dpi=130)
plt.close()

# Boxplots (before)
# Reshape to long form (one row per feature/value pair) so seaborn can draw
# one box per feature on a shared axis.
plt.figure(figsize=(12, 6))
m_before = numeric_before.melt(var_name="Feature", value_name="Value")
sns.boxplot(data=m_before, x="Feature", y="Value")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
PLOT_BEFORE_BOX = PLOTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Plot_Box_Before.png"
plt.savefig(PLOT_BEFORE_BOX, dpi=130)
plt.close()

# --------------------------
# 4) Clean: fix negatives, cap/clip outliers, simple impute
# --------------------------
# Work on a copy so the raw frame stays available for comparison.
df_clean = df.copy()

# Negatives -> NA for sub meterings (only for the columns that are present)
present_sub_meters = [
    meter
    for meter in ("Sub_metering_1", "Sub_metering_2", "Sub_metering_3")
    if meter in df_clean.columns
]
for meter in present_sub_meters:
    is_negative = df_clean[meter] < 0
    df_clean.loc[is_negative, meter] = np.nan

# IQR clipping for each numeric column (robust)
def iqr_clip(s: pd.Series, k: float = 1.5) -> pd.Series:
    """Clip a numeric series to the fences ``[Q1 - k*IQR, Q3 + k*IQR]``.

    Args:
        s: Numeric series to clip. Missing values are ignored when computing
            the quartiles and are left as missing in the result.
        k: Fence multiplier applied to the inter-quartile range.

    Returns:
        A new series with values outside the fences replaced by the nearest
        fence. When the inter-quartile range is zero or cannot be computed
        (for example an empty or all-missing series), an unmodified copy of
        ``s`` is returned.
    """
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    # With Q1 == Q3 both fences collapse onto a single value, so clipping
    # would flatten the entire column to a constant (typical for columns
    # that are zero most of the time). Leave such data untouched instead.
    if pd.isna(iqr) or iqr == 0:
        return s.copy()
    low, high = q1 - k * iqr, q3 + k * iqr
    return s.clip(lower=low, upper=high)

# Apply the IQR clip to every numeric column except the timestamp.
clip_targets = [
    name
    for name in df_clean.columns
    if name != "DateTime" and pd.api.types.is_numeric_dtype(df_clean[name])
]
for name in clip_targets:
    df_clean[name] = iqr_clip(df_clean[name])

# Simple forward-fill then back-fill for small gaps.
# Rows are put in time order first so the fills follow the timeline.
df_clean = df_clean.sort_values("DateTime")
df_clean = df_clean.reset_index(drop=True)
filled_numeric = df_clean.select_dtypes(include=[float, int]).ffill().bfill()
df_clean.update(filled_numeric)

# Missing counts after
missing_after = df_clean.isna().sum().to_dict()

# --------------------------
# 5) Plots - AFTER cleaning
# --------------------------
# Histograms (after)
# DataFrame.hist opens its own figure sized by `figsize`, so no separate
# plt.figure() call is made here: an extra one would stay empty, would never
# be closed by the plt.close() below, and shows up in notebook output as
# "<Figure size 1200x800 with 0 Axes>".
numeric_after = df_clean.drop(columns=["DateTime"]).select_dtypes(include=[float, int])
numeric_after.hist(bins=50, figsize=(12, 8))
plt.tight_layout()
PLOT_AFTER_HIST = PLOTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Plot_Hist_After.png"
plt.savefig(PLOT_AFTER_HIST, dpi=130)
plt.close()

# Boxplots (after)
# Reshape to long form (one row per feature/value pair) so seaborn can draw
# one box per feature on a shared axis.
plt.figure(figsize=(12, 6))
m_after = numeric_after.melt(var_name="Feature", value_name="Value")
sns.boxplot(data=m_after, x="Feature", y="Value")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
PLOT_AFTER_BOX = PLOTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Plot_Box_After.png"
plt.savefig(PLOT_AFTER_BOX, dpi=130)
plt.close()

# --------------------------
# 6) Save cleaned dataset (do not multiply names)
# --------------------------
# NOTE: this path is also the first candidate the loader looks for, so the
# write below replaces the file this script normally reads as its input.
# Re-running the script therefore cleans already-cleaned data.
final_clean_csv = BASE_DIR / "results" / "Wk01_Section4" / "cleaned_power_weather_final.csv"
# Also mirror to Wk01_Section5 for convenience
final_clean_csv_5 = RESULTS_DIR / "cleaned_power_weather_final.csv"

# The input may have been found elsewhere in the results tree by the fallback
# search, in which case the Wk01_Section4 folder might not exist yet. to_csv
# does not create parent directories, so make sure they exist before writing.
for target_csv in (final_clean_csv, final_clean_csv_5):
    target_csv.parent.mkdir(parents=True, exist_ok=True)
    df_clean.to_csv(target_csv, index=False)

# --------------------------
# 7) Business Markdown (Q1–Q3)
# --------------------------
def fmt_missing(d: dict) -> str:
    """Render a column -> missing-count mapping as a Markdown bullet list.

    Entries are listed in sorted key order, one ``- name: count`` line each.
    The ``DateTime`` entry is left out. An empty string is returned when
    there is nothing to list.
    """
    return "\n".join(
        f"- {column}: {d[column]}"
        for column in sorted(d)
        if column != "DateTime"
    )

md_path = RESULTS_DIR / f"{BASE_PROJECT_NAME}_Wk01_Section5_Report_Business.md"

# Report body, one Markdown fragment per entry; fragments are joined with
# newlines when written. The Q1–Q3 answers are fixed text.
md = [
    f"# {BASE_PROJECT_NAME} – Wk01_Section5 – Data Quality & Sensor Anomalies (Business Report)\n",
    "## Key Questions Answered\n",
    (
        "**Q1: Did I detect any outliers in the weather or consumption readings?**  \n"
        "Yes. I found outliers across several features using boxplots/IQR rules and histograms. "
        "Sub-meter readings occasionally had extreme spikes, and weather features showed sporadic high/low values.\n"
    ),
    (
        "**Q2: How did I identify and treat these anomalies?**  \n"
        "I used IQR-based clipping (to cap extreme values) and replaced negative sub-meter readings with blanks (then filled small gaps). "
        "I also forward-/back-filled short missing stretches.\n"
    ),
    (
        "**Q3: What might be the impact of retaining or removing them in my model?**  \n"
        "Capping/removing extremes reduces noise and helps models generalize, while retaining them can cause unstable forecasts. "
        "For production systems, I would keep this cleaning to improve reliability.\n"
    ),
    "## Missing Values (Before Cleaning)\n",
    fmt_missing(missing_before) + "\n",
    "## Missing Values (After Cleaning)\n",
    fmt_missing(missing_after) + "\n",
    "## Visual Evidence\n",
]

# Embed each plot through a path relative to the report's own folder so the
# links keep working when the results directory is moved as a whole.
for caption, image_path in (
    ("Before – Histograms", PLOT_BEFORE_HIST),
    ("Before – Boxplots", PLOT_BEFORE_BOX),
    ("After – Histograms", PLOT_AFTER_HIST),
    ("After – Boxplots", PLOT_AFTER_BOX),
):
    md.append(f"**{caption}**  \n![]({image_path.relative_to(RESULTS_DIR).as_posix()})\n")

with md_path.open("w", encoding="utf-8") as report_file:
    report_file.write("\n".join(md))

# Completion summary: where the input came from and where each output went.
summary_lines = [
    "✅ Section 5 (Business) complete.",
    f"- Loaded cleaned CSV from: {CLEANED_PREV}",
    f"- Saved cleaned CSV to:   {final_clean_csv}",
    f"- Also copied to:         {final_clean_csv_5}",
    f"- Plots in:               {PLOTS_DIR}",
    f"- Report:                 {md_path}",
]
print("\n".join(summary_lines))


✅ Section 5 (Business) complete.
- Loaded cleaned CSV from: /home/6376f5a9-d12b-4255-9426-c0091ad440a7/SDS-CP036-powercast/results/Wk01_Section4/cleaned_power_weather_final.csv
- Saved cleaned CSV to:   /home/6376f5a9-d12b-4255-9426-c0091ad440a7/SDS-CP036-powercast/results/Wk01_Section4/cleaned_power_weather_final.csv
- Also copied to:         /home/6376f5a9-d12b-4255-9426-c0091ad440a7/SDS-CP036-powercast/results/Wk01_Section5/cleaned_power_weather_final.csv
- Plots in:               /home/6376f5a9-d12b-4255-9426-c0091ad440a7/SDS-CP036-powercast/results/Wk01_Section5/plots
- Report:                 /home/6376f5a9-d12b-4255-9426-c0091ad440a7/SDS-CP036-powercast/results/Wk01_Section5/SDS-CP036-powercast_Wk01_Section5_Report_Business.md


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>