In [2]:

# ===== Week 1 – Section 1: Time Consistency & Structure (Business) =====
from pathlib import Path
import os
import pandas as pd

# ---------- Shared Header / Naming ----------
# Identifiers that get stitched into the output folder and report file names.
BASE_PROJECT_NAME = "SDS-CP036-powercast"
WEEK = "Wk01"
SECTION = "Section1"
RUN_TAG = "_".join((WEEK, SECTION))

# Anchor paths at this file's folder when it exists as a file; when run from a
# notebook (no __file__ defined) use the current working directory instead.
if "__file__" in globals():
    BASE_DIR = Path(__file__).resolve().parent
else:
    BASE_DIR = Path.cwd().resolve()

def find_repo_root(start: Path) -> Path:
    """Return the nearest ancestor of ``start`` that looks like the project root.

    ``start`` itself and then each of its parents are inspected, at most ten
    directories in total. The first one that contains a ``.git`` or ``data``
    entry is returned. When none qualifies, ``start`` is returned unchanged.
    """
    lineage = (start, *start.parents)
    for folder in lineage[:10]:
        has_git = (folder / ".git").exists()
        if has_git or (folder / "data").exists():
            return folder
    return start

REPO_ROOT = find_repo_root(BASE_DIR)
DATA_DIR = REPO_ROOT / "data"

# --- Locate energy CSV flexibly ---
# Search order: ENERGY_CSV_PATH override (if set), then the repo's data folder,
# then the /mnt/data fallback location.
CANDIDATES = [
    DATA_DIR / "Tetuan City power consumption.csv",
    Path("/mnt/data/Tetuan City power consumption.csv"),
]

# Allow override via env var ENERGY_CSV_PATH. "~" is expanded here because the
# value may not have passed through a shell that expands it.
env_path = os.environ.get("ENERGY_CSV_PATH")
if env_path:
    CANDIDATES.insert(0, Path(env_path).expanduser())

# is_file() rather than exists(): a directory sitting at a candidate path must
# not be selected, otherwise the failure only surfaces later inside read_csv.
ENERGY_CSV = next((p for p in CANDIDATES if p.is_file()), None)
if ENERGY_CSV is None:
    raise FileNotFoundError(
        "Could not locate an energy CSV. Looked for:\n  - " + "\n  - ".join(map(str, CANDIDATES))
    )

# Output layout: results/<RUN_TAG>/{plots,reports}; created up front so later
# writes cannot fail on a missing folder.
RESULTS_DIR = REPO_ROOT / "results" / RUN_TAG
PLOTS_DIR = RESULTS_DIR / "plots"
REPORTS_DIR = RESULTS_DIR / "reports"
for d in (RESULTS_DIR, PLOTS_DIR, REPORTS_DIR):
    d.mkdir(parents=True, exist_ok=True)

BUSINESS_SUMMARY_MD = REPORTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Report_Business.md"
BUSINESS_REPORT_MD = RESULTS_DIR / f"{BASE_PROJECT_NAME}_{RUN_TAG}_Business_Report.md"

# ---------- Load energy safely ----------
def _parse_datetime_series(series: pd.Series) -> pd.Series:
    """Parse date/time strings, retrying day-first when the default parse mostly fails.

    Args:
        series: Date/time values as strings.

    Returns:
        A datetime Series; entries that could not be parsed are ``NaT``.
    """
    dt = pd.to_datetime(series, errors="coerce")
    # More than half unparsed suggests the wrong day/month order was assumed.
    if dt.isna().mean() > 0.5:
        day_first = pd.to_datetime(series, errors="coerce", dayfirst=True)
        # Keep the retry only when it does not lose additional rows.
        if day_first.isna().sum() <= dt.isna().sum():
            dt = day_first
    return dt


def load_energy(path: Path) -> pd.DataFrame:
    """Load the energy CSV and attach a canonical ``DateTime`` column.

    The timestamp source is chosen by case-insensitive column name, in order:
    a ``DateTime`` column; separate ``Date`` and ``Time`` columns; a single
    ``Date,Time`` column; otherwise the first column whose name contains
    "date" or "time".

    Args:
        path: Location of the CSV file (comma- or semicolon-delimited).

    Returns:
        The data with a parsed ``DateTime`` column, rows without a parseable
        timestamp removed, sorted by ``DateTime`` with a fresh 0..n-1 index.

    Raises:
        ValueError: If no column can serve as a timestamp source.
    """
    # Read with flexible delimiter: a comma parse that yields a single column
    # is taken to mean the file is semicolon-delimited.
    df = pd.read_csv(path, sep=',', low_memory=False)
    if len(df.columns) == 1:
        df = pd.read_csv(path, sep=';', low_memory=False)

    # Keep the raw header for the error message, then strip stray whitespace.
    original_cols = df.columns.tolist()
    df = df.rename(columns={c: c.strip() for c in df.columns})

    # Lower-cased name -> actual name, so every lookup below is case-insensitive.
    cols_lower = {c.lower(): c for c in df.columns}

    if "datetime" in cols_lower:
        raw = df[cols_lower["datetime"]].astype(str)
    elif "date" in cols_lower and "time" in cols_lower:
        raw = df[cols_lower["date"]].astype(str) + " " + df[cols_lower["time"]].astype(str)
    elif "date,time" in cols_lower:
        # Some exports keep 'Date,Time' literally as one header. Swapping the
        # first comma for a space also copes with rows that contain no comma.
        raw = df[cols_lower["date,time"]].astype(str).str.replace(",", " ", n=1, regex=False)
    else:
        # Last resort: the first column whose name hints at a date or time.
        candidate = next(
            (c for c in df.columns if "date" in c.lower() or "time" in c.lower()),
            None,
        )
        if candidate is None:
            raise ValueError(
                "Missing Date/Time information. Expected a 'DateTime' column or 'Date'+'Time'.\n"
                f"Columns found: {original_cols}"
            )
        raw = df[candidate].astype(str)

    df["DateTime"] = _parse_datetime_series(raw)

    # Drop unparseable rows; a stable sort keeps rows that share a timestamp
    # in their original file order.
    df = df.dropna(subset=["DateTime"])
    df = df.sort_values("DateTime", kind="mergesort").reset_index(drop=True)
    return df

df = load_energy(ENERGY_CSV)

# ---------- Analyses ----------
# 1) Missing or irregular timestamps
# NOTE: load_energy() has already dropped rows whose timestamp failed to parse,
# so this count is 0 at this point; it is kept as an explicit sanity check.
missing_timestamps = df["DateTime"].isna().sum()

df_sorted = df.sort_values("DateTime").reset_index(drop=True)
# Gap between each record and the one before it (the first entry is NaT).
time_diffs = df_sorted["DateTime"].diff()

# value_counts() skips NaT and orders by descending frequency, so the first
# index entry is the most common gap. With fewer than two rows there are no
# gaps at all and the zero Timedelta placeholder is used.
freq_counts = time_diffs.value_counts()
most_common = freq_counts.index[0] if not freq_counts.empty else pd.Timedelta(0)

# 2) Duplicates
# Number of rows whose timestamp repeats an earlier row's timestamp.
duplicate_dt = df_sorted["DateTime"].duplicated().sum()

# ---------- Business Summary ----------
def _timedelta_to_minutes(td):
    """Return the whole minutes contained in ``td``, or 0 if it is not a usable duration.

    Args:
        td: An object exposing ``total_seconds()`` (e.g. ``pd.Timedelta`` or
            ``datetime.timedelta``).

    Returns:
        ``total_seconds() // 60`` as an ``int``; 0 when ``td`` has no usable
        ``total_seconds()`` or the result is not a finite number.
    """
    try:
        return int(td.total_seconds() // 60)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError/TypeError: td is not a duration-like object.
        # ValueError/OverflowError: the seconds value is NaN or infinite.
        return 0

# Whole-minute rendering of the most common gap, for the plain-English takeaway.
most_common_minutes = _timedelta_to_minutes(most_common)

# Markdown body of the business summary. The two trailing spaces after each
# question line are intentional: they are markdown hard line breaks.
biz_md = f"""# 💼 Week 1 – {SECTION}: Time Consistency & Structure (Business-Friendly Report)

## Dataset
Using file: **{ENERGY_CSV.name}**

## Key Questions Answered
**Q1: Are there any missing or irregular timestamps in the dataset? How did you verify consistency?**  
I created a canonical `DateTime` column and inspected gaps between consecutive records to detect irregularities.

**Q2: What is the sampling frequency and are all records spaced consistently?**  
I measured the time deltas between consecutive rows. The most common spacing is **{most_common}**, suggesting the intended sampling cadence.

**Q3: Did you encounter any duplicates or inconsistent `DateTime` entries?**  
I found **{duplicate_dt}** duplicate timestamps (exact same `DateTime`). These could be reviewed or deduplicated based on your business rules.

## Plain-English Notes
- Built a single `DateTime` column from whatever the dataset provided (flexible parsing with comma/semicolon delimiters and both month-first/day-first formats).
- Looked for missing times and uneven gaps using the distribution of time differences.
- Business takeaway: “Typically, there’s a reading roughly every {most_common} — i.e., about {most_common_minutes} minute(s) per record.”
"""

BUSINESS_SUMMARY_MD.write_text(biz_md, encoding="utf-8")

# The wrapper sits one folder above the summary, so link to it with a path
# relative to the wrapper's own folder; a bare file name would not resolve.
summary_link = Path(os.path.relpath(BUSINESS_SUMMARY_MD, BUSINESS_REPORT_MD.parent)).as_posix()

wrapper = f"""# {BASE_PROJECT_NAME} — {RUN_TAG} — Business Report

🔗 **Open Business Summary:** [{BUSINESS_SUMMARY_MD.name}]({summary_link})
"""
BUSINESS_REPORT_MD.write_text(wrapper, encoding="utf-8")

print("✅ Section 1 (Business) complete. Outputs written to:")
print("-", BUSINESS_SUMMARY_MD)
print("-", BUSINESS_REPORT_MD)


✅ Section 1 (Business) complete. Outputs written to:
- /home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk01_Section1/reports/SDS-CP036-powercast_Wk01_Section1_Report_Business.md
- /home/6376f5a9-d12b-4255-9426-c0091ad440a7/Powercast/results/Wk01_Section1/SDS-CP036-powercast_Wk01_Section1_Business_Report.md
