In [40]:
# Starter cell: load logs, apply cleaning steps, compute file spans, and report stats
# Adjust DATA_DIR if your logs live somewhere else.

from pathlib import Path
import pandas as pd
import numpy as np
import re

# ──────────────────────────────────────────────────────────────────────────────
# Load all CSV logs (read as strings first so we can do exact/substring checks)
# Also collect per-file time spans for plotting generator activity later.
# ──────────────────────────────────────────────────────────────────────────────
# Root folder that is searched recursively for *.csv log files.
DATA_DIR = Path("../data-2025")
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Data directory not found: {DATA_DIR.resolve()}")

# Sorted so the load order (and therefore the concatenated frame) is deterministic.
csv_paths = sorted(DATA_DIR.rglob("*.csv"))

frames = []
spans = []  # collect per-file spans: {"source_file", "start", "end"}
expected_cols = ["timestamp","mode","value","vbat_mV","vin_mV","iout_mA","soc_C","rp1_C","pmic_C","error"]

for p in csv_paths:
    try:
        tmp = pd.read_csv(p, dtype=str)   # keep everything as str for now
    except Exception as e:
        # Best-effort load: an unreadable file is reported and skipped
        # instead of aborting the whole run.
        print(f"Warning: failed to read {p}: {e}")
        continue

    # Ensure expected columns exist (absent ones are added as all-NaN)
    for col in expected_cols:
        if col not in tmp.columns:
            tmp[col] = np.nan

    # Preserve raw value string EXACTLY as read (no strip/normalize yet)
    tmp["value_str"] = tmp["value"].astype(str)

    # Normalize text fields we care about
    tmp["mode"]  = tmp["mode"].astype(str).str.strip().str.lower()
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal text "nan", after which fillna("") has nothing left to fill.
    tmp["error"] = tmp["error"].fillna("").astype(str).str.strip()

    # Parse timestamp (unparseable entries become NaT)
    tmp["timestamp"] = pd.to_datetime(tmp["timestamp"], errors="coerce")

    # Record this file's span (based on whatever timestamps exist in the file)
    valid_ts = tmp["timestamp"].dropna()
    if not valid_ts.empty:
        spans.append({
            "source_file": str(p),
            "start": valid_ts.min(),
            "end":   valid_ts.max(),
        })

    # Keep source filename
    tmp["source_file"] = str(p)

    frames.append(tmp)

if frames:
    df = pd.concat(frames, ignore_index=True)
    # Rows without a usable timestamp are dropped before sorting chronologically.
    df = df.dropna(subset=["timestamp"]).sort_values("timestamp")
else:
    df = pd.DataFrame(columns=expected_cols + ["value_str","source_file"])

# Build spans DataFrame (raw)
log_spans = pd.DataFrame(spans) if spans else pd.DataFrame(columns=["source_file","start","end"])
log_spans = log_spans.sort_values(["start","end"]).reset_index(drop=True)

# NOTE: the file count below is the number of CSVs found, including any that failed to load.
print(f"Loaded {len(csv_paths)} CSV files; total rows: {len(df)}")
if not log_spans.empty:
    print(f"File spans collected: {len(log_spans)} (earliest {log_spans['start'].min()} → latest {log_spans['end'].max()})")
else:
    print("No file spans collected.")

# ──────────────────────────────────────────────────────────────────────────────
# STEP 1: Apply cutoff date; report rows remaining
# Also clip the file spans to the same cutoff, producing log_spans_cut.
# ──────────────────────────────────────────────────────────────────────────────
# Rows stamped at or after midnight of this date are kept (inclusive bound).
CUTOFF_DATE = pd.Timestamp("2025-08-21")
before_rows = len(df)
on_or_after_cutoff = df["timestamp"] >= CUTOFF_DATE
df = df.loc[on_or_after_cutoff].copy()
print(f"[Step 1] Applied cutoff at {CUTOFF_DATE.date()}: kept {len(df)} / {before_rows} rows")

# Apply the same cutoff to the per-file spans, producing log_spans_cut.
if log_spans.empty:
    log_spans_cut = log_spans.copy()
else:
    # Spans that finish before the cutoff are discarded outright.
    reaches_cutoff = log_spans["end"] >= CUTOFF_DATE
    log_spans_cut = log_spans.loc[reaches_cutoff].copy()
    # A span that begins before the cutoff is shortened to begin at the cutoff.
    span_starts = log_spans_cut["start"]
    log_spans_cut["start"] = span_starts.where(span_starts >= CUTOFF_DATE, CUTOFF_DATE)
    # Only spans with strictly positive duration survive.
    has_duration = log_spans_cut["start"] < log_spans_cut["end"]
    log_spans_cut = log_spans_cut.loc[has_duration].reset_index(drop=True)

# ──────────────────────────────────────────────────────────────────────────────
# STEP 2: Keep only rows where mode is exactly "watt"; report discarded & remaining
# ──────────────────────────────────────────────────────────────────────────────
# Keep rows whose (already normalized) mode equals "watt"; everything else is dropped.
pre_rows = len(df)
mode_mask = df["mode"] == "watt"   # whole-string equality, not a substring test
df = df.loc[mode_mask].copy()
# Every row that did not survive the mask was a non-'watt' row.
discarded_non_watt = pre_rows - len(df)
print(f"[Step 2] Discarded non-'watt' modes: {discarded_non_watt}; rows remaining: {len(df)}")

# ──────────────────────────────────────────────────────────────────────────────
# STEP 3: Discard rows whose raw 'value' STRING contains "8888.8000"
#         (substring match; no numeric conversions; resilient to quotes/padding)
# ──────────────────────────────────────────────────────────────────────────────
# Drop rows whose raw value text contains the literal "8888.8000" anywhere.
pre_rows = len(df)
raw_value_text = df["value_str"].astype(str)
# regex=False: the dots are matched literally, not as wildcards.
sentinel_mask = raw_value_text.str.contains("8888.8000", regex=False, na=False)
df = df.loc[~sentinel_mask].copy()
discarded_sentinels = pre_rows - len(df)
print(f"[Step 3] Discarded rows where value contains '8888.8000': {discarded_sentinels}; rows remaining: {len(df)}")

# ──────────────────────────────────────────────────────────────────────────────
# STEP 4: Discard rows with decode/segment warnings *AND* zero/NaN reading
# ──────────────────────────────────────────────────────────────────────────────
# Drop a row only when BOTH conditions hold:
#   (a) its error text mentions decode/segment/unrecognized/fail/parse, and
#   (b) its reading is zero or cannot be interpreted as a number.
pre_rows = len(df)
# Strip everything except digits, '.', and '-' so quoted/padded values still parse.
value_norm_for_num = df["value_str"].str.replace(r"[^\d.\-]", "", regex=True)
value_num_tmp = pd.to_numeric(value_norm_for_num, errors="coerce")
zero_or_nan = value_num_tmp.isna() | value_num_tmp.eq(0)
# Non-capturing group: a capturing group makes str.contains emit a UserWarning
# ("pattern has match groups"); the set of matching rows is identical.
err_decode_mask = df["error"].str.contains(r"(?:decode|segment|unrecognized|fail|parse)", case=False, na=False)
drop_mask = err_decode_mask & zero_or_nan
discarded_err_zero = int(drop_mask.sum())
df = df[~drop_mask].copy()
print(f"[Step 4] Discarded rows with decode/segment warnings AND zero/NaN value: {discarded_err_zero}; rows remaining: {len(df)}")

# ──────────────────────────────────────────────────────────────────────────────
# Additional summary (post-cleaning)
# ──────────────────────────────────────────────────────────────────────────────
# Convert numerics now that string-based filtering is done
# String-based filtering is finished, so the measurement columns can become numeric.
# Anything that does not parse turns into NaN.
for num_col in ("value", "vbat_mV", "vin_mV", "iout_mA", "soc_C", "rp1_C", "pmic_C"):
    df[num_col] = pd.to_numeric(df[num_col], errors="coerce")

# Informational only: how many surviving rows still carry a decode/segment error text.
pat = re.compile(r"(?:decode.*digit|segment)", re.IGNORECASE)
decode_err_mask = df["error"].str.contains(pat, na=False)
decode_err_rows = int(decode_err_mask.sum())
total_rows = len(df)
decode_pct = (decode_err_rows / total_rows * 100.0) if total_rows else 0.0

# Elapsed time between the first and last remaining timestamp, split into D/H/M.
# Defaults cover the case where there is nothing to measure.
tmin = tmax = None
days = hours = minutes = 0
if total_rows and df["timestamp"].notna().any():
    tmin, tmax = df["timestamp"].min(), df["timestamp"].max()
    span = tmax - tmin
    total_seconds = int(span.total_seconds())
    days, rem = divmod(total_seconds, 86400)
    hours, rem = divmod(rem, 3600)
    minutes = rem // 60   # leftover seconds are discarded

print(f"\nCSV files represented (post-cleaning): {df['source_file'].nunique()}")
print(f"Segment-decode error rows (remaining): {decode_err_rows} ({decode_pct:.2f}% of total)")
print(f"Time span: {tmin} → {tmax}")
print(f"Total runtime: {days} days, {hours} hours, {minutes} minutes")

# Globals for the chart cell:
# - df: cleaned rows for plotting values
# - log_spans_cut: file intervals (clipped to cutoff) for generator-on overlay


Loaded 32 CSV files; total rows: 234375
File spans collected: 32 (earliest 2025-08-18 12:41:45.321000 → latest 2025-08-29 15:41:16.761000)
[Step 1] Applied cutoff at 2025-08-21: kept 230935 / 234375 rows
[Step 2] Discarded non-'watt' modes: 3869; rows remaining: 227066
[Step 3] Discarded rows where value contains '8888.8000': 0; rows remaining: 227066


  err_decode_mask = df["error"].str.contains(r"(decode|segment|unrecognized|fail|parse)", case=False, na=False)



CSV files represented (post-cleaning): 23
Segment-decode error rows (remaining): 0 (0.00% of total)
Time span: 2025-08-22 16:37:50.372000 → 2025-08-29 15:37:08.748000
Total runtime: 6 days, 22 hours, 59 minutes


In [51]:
# Chart: Watts vs Time (cleaned df) with generator intervals overlay
# - Power plotted as LINES (no markers), but ONLY within logfile spans (no bridging across gaps)
# - Thick red line at y=0 for each CSV file span (from log_spans_cut)
# - 6000 x 1000 px PNG (30 in x 5 in at 200 DPI), daily major ticks ("Friday 8/29") LEFT-ALIGNED, faint hourly grid, night shading

import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter

# ───────────── CONFIG ─────────────
# What to plot. "all" plots every cleaned row; any other value takes the
# subset path (selection controlled by SUBSET_STYLE below).
PLOT_SCOPE = "all"      # "all" or "subset"
# How the subset is chosen; any other value makes _select_subset raise ValueError.
SUBSET_STYLE = "window"  # "window" or "around"

# (A) Time window (both ends inclusive), used when SUBSET_STYLE == "window":
SUBSET_START = "2025-08-25 18:00:00"
SUBSET_END   = "2025-08-25 21:00:00"

# (B) Time around a center point (± minutes), used when SUBSET_STYLE == "around":
CENTER_TIME  = "2025-08-25 19:05:00"
RADIUS_MIN   = 10

# Output image name depends on the scope chosen above.
OUT_PNG = "power_watts_all_with_spans.png" if PLOT_SCOPE == "all" else "power_watts_subset_with_spans.png"
# Subset mode only: at most this many selected rows are printed.
PRINT_MAX_ROWS = 500
# Subset mode only: when True, the selected rows are also written to SUBSET_CSV_PATH.
SAVE_SUBSET_CSV = True
SUBSET_CSV_PATH = "subset_rows_for_manual_check.csv"

# Times of day (HH:MM:SS) bounding the night shading; the same pair is
# applied to every calendar day in the plotted range.
SUNRISE_STR = "06:22:28"
SUNSET_STR  = "19:31:14"

# Figure size in inches. At DPI = 200 this yields a 6000x1000 px image;
# (10, 5) would be needed for 2000x1000 px.
FIGSIZE_IN = (30, 5)
DPI = 200

# Style of the power trace.
LINE_COLOR = "darkgreen"
LINE_WIDTH = 1.0
# ──────────────────────────────────

def _fmt_day(x, pos):
    """Format a Matplotlib date number as a day label such as "Friday 8/29".

    Args:
        x: Tick location as a Matplotlib date number.
        pos: Tick position supplied by ``FuncFormatter``; unused.

    Returns:
        The weekday name followed by the month and day without zero padding.
    """
    d = mdates.num2date(x)
    # Build the unpadded month/day from the datetime fields directly instead of
    # the platform-specific strftime flags ("%-m" on POSIX, "%#m" on Windows),
    # so the label does not depend on which flag the C library accepts.
    return f"{d.strftime('%A')} {d.month}/{d.day}"

def _select_subset(df):
    """Return the rows of *df* inside the configured time range, plus a label.

    The range comes from the module-level settings: SUBSET_START/SUBSET_END
    when SUBSET_STYLE is "window", or CENTER_TIME ± RADIUS_MIN minutes when it
    is "around". Both ends of the range are inclusive.

    Args:
        df: DataFrame with a datetime "timestamp" column.

    Returns:
        A ``(subset_copy, description)`` tuple.

    Raises:
        ValueError: If SUBSET_STYLE is neither "window" nor "around".
    """
    if SUBSET_STYLE == "window":
        lo, hi = pd.Timestamp(SUBSET_START), pd.Timestamp(SUBSET_END)
        description = f"time window {lo} → {hi}"
    elif SUBSET_STYLE == "around":
        c = pd.Timestamp(CENTER_TIME)
        radius = pd.Timedelta(minutes=RADIUS_MIN)
        lo, hi = c - radius, c + radius
        description = f"{RADIUS_MIN} min around {c}  (window {lo} → {hi})"
    else:
        raise ValueError("SUBSET_STYLE must be 'window' or 'around'")

    # between() includes both endpoints by default.
    in_range = df["timestamp"].between(lo, hi)
    return df.loc[in_range].copy(), description

# Draw the Watts-vs-Time chart from the cleaned global `df` and save it to OUT_PNG.
# Any PLOT_SCOPE other than "all" takes the subset path via _select_subset().
if df.empty:
    print("Cleaned DataFrame `df` is empty — nothing to plot.")
else:
    if PLOT_SCOPE == "all":
        plot_df = df.copy()
        scope_desc = "ALL cleaned data"
    else:
        plot_df, scope_desc = _select_subset(df)
        if plot_df.empty:
            print(f"Subset selection produced 0 rows ({scope_desc}). Nothing to plot.")
            print("Data range:", df['timestamp'].min(), "→", df['timestamp'].max(), f"(rows={len(df)})")
        else:
            # Echo the selected rows (capped at PRINT_MAX_ROWS) for manual inspection.
            cols_to_show = [c for c in ["timestamp","value","mode","error","source_file","vbat_mV","vin_mV","iout_mA","soc_C","rp1_C","pmic_C"] if c in plot_df.columns]
            print(f"\nSubset rows selected ({len(plot_df)} rows) — {scope_desc}")
            if len(plot_df) > PRINT_MAX_ROWS:
                print(plot_df[cols_to_show].head(PRINT_MAX_ROWS).to_string(index=False))
                print(f"... ({len(plot_df) - PRINT_MAX_ROWS} more rows not shown)")
            else:
                print(plot_df[cols_to_show].to_string(index=False))
            if SAVE_SUBSET_CSV:
                plot_df.to_csv(SUBSET_CSV_PATH, index=False)
                print(f"Saved subset to CSV: {SUBSET_CSV_PATH}")

    if not plot_df.empty:
        plot_df = plot_df.sort_values("timestamp")
        # Rows missing either the timestamp or the reading cannot be drawn.
        watts_all = plot_df[["timestamp","value"]].rename(columns={"value":"watts"}).dropna()

        if watts_all.empty:
            # Without this guard the axis bounds below would be NaT.
            print(f"No rows with a usable timestamp and value ({scope_desc}). Nothing to plot.")
        else:
            sunrise_t = dt.datetime.strptime(SUNRISE_STR, "%H:%M:%S").time()
            sunset_t  = dt.datetime.strptime(SUNSET_STR,  "%H:%M:%S").time()

            fig = plt.figure(figsize=FIGSIZE_IN, dpi=DPI)
            ax = fig.add_subplot(111)

            # Time extent of the drawable data; shading and spans are clipped to it.
            left_data, right_data = watts_all["timestamp"].min(), watts_all["timestamp"].max()

            # Major day ticks (LEFT-ALIGNED labels), hourly minor grid
            ax.xaxis.set_major_locator(mdates.DayLocator(interval=1))
            ax.xaxis.set_major_formatter(FuncFormatter(_fmt_day))
            ax.xaxis.set_minor_locator(mdates.HourLocator(interval=1))
            ax.grid(which="minor", axis="x", linestyle="-", linewidth=0.5, alpha=0.3, color="0.8")

            # LEFT-align the major tick labels
            for lbl in ax.get_xticklabels(which="major"):
                lbl.set_ha("left")
            ax.tick_params(axis="x", which="major", pad=6)
            ax.margins(x=0.01)  # tiny x-padding so the last left-aligned label isn't clipped

            # Night shading: for every calendar day touched by the data, shade
            # midnight→sunrise and sunset→next midnight, clipped to the data extent.
            day_start = left_data.normalize()
            day_end   = (right_data.normalize() + pd.Timedelta(days=1))
            current = day_start
            while current < day_end:
                sr = pd.Timestamp.combine(current, sunrise_t)
                ss = pd.Timestamp.combine(current, sunset_t)
                l1, r1 = max(current, left_data), min(sr, right_data)
                l2, r2 = max(ss, left_data), min(current + pd.Timedelta(days=1), right_data)
                if l1 < r1:
                    ax.axvspan(l1, r1, alpha=0.12, zorder=0)
                if l2 < r2:
                    ax.axvspan(l2, r2, alpha=0.12, zorder=0)
                current += pd.Timedelta(days=1)

            # Generator-on overlay and segmented power lines.
            # Prefer the cutoff-clipped spans; fall back to the raw spans, or to
            # none at all, when the loader cell has not defined them.
            spans_df = None
            try:
                spans_df = log_spans_cut.copy()
            except NameError:
                try:
                    spans_df = log_spans.copy()
                except NameError:
                    spans_df = None

            if spans_df is not None and not spans_df.empty:
                # Clip every span to the plotted extent and drop the ones left empty.
                spans_view = spans_df.copy()
                spans_view["start"] = spans_view["start"].clip(lower=left_data)
                spans_view["end"]   = spans_view["end"].clip(upper=right_data)
                spans_view = spans_view[spans_view["start"] < spans_view["end"]]

                for _, r in spans_view.iterrows():
                    # One line per file span, so no line bridges a gap between files.
                    m = (watts_all["timestamp"] >= r["start"]) & (watts_all["timestamp"] <= r["end"])
                    seg = watts_all.loc[m]
                    if len(seg) >= 2:
                        ax.plot(seg["timestamp"], seg["watts"],
                                linestyle="-",
                                linewidth=LINE_WIDTH, color=LINE_COLOR, zorder=3)
                    # Thick red bar at y=0 marking the time covered by this file.
                    ax.hlines(y=0, xmin=r["start"], xmax=r["end"],
                              linewidth=3, color="red", alpha=0.7, zorder=2)
            else:
                # No span information: draw a single continuous line.
                ax.plot(watts_all["timestamp"], watts_all["watts"],
                        linestyle="-",
                        linewidth=LINE_WIDTH, color=LINE_COLOR, zorder=3)

            ax.set_ylabel("Watts")
            ax.set_xlabel("Time")
            ax.set_title(f"Watts vs Time — {scope_desc}")

            # Make sure y=0 (where the span bars sit) is inside the visible range.
            ax.set_ylim(bottom=min(0, ax.get_ylim()[0]))

            fig.tight_layout()
            # Save through the figure object rather than pyplot's implicit current figure.
            fig.savefig(OUT_PNG, dpi=DPI)
            plt.close(fig)
            print(f"\nSaved {OUT_PNG}")



Saved power_watts_all_with_spans.png
