In [None]:
# Cell: summary of CSV row counts and date ranges (first 12 days, then 7-day blocks from day 13)
import math
from pathlib import Path
import pandas as pd

# Location of the scenario CSV; stop immediately if it is missing.
CSV_PATH = Path(r"C:\Users\ishaa\OneDrive\Desktop\synthetic_data_final\synthetic_battery_inference_scenarioA.csv")
if not CSV_PATH.exists():
    raise FileNotFoundError(f"CSV not found: {CSV_PATH}")

# Only the timestamp column is needed for the row/date summary.
df_ts = pd.read_csv(CSV_PATH, usecols=["timestamp"], low_memory=False)
n_total = len(df_ts)
print(f"Total rows in CSV: {n_total:,}")

# Parse as UTC-aware datetimes; unparsable entries become NaT and are reported.
df_ts["timestamp"] = pd.to_datetime(df_ts["timestamp"], utc=True, errors="coerce")
unparsed = df_ts["timestamp"].isna()
if unparsed.any():
    n_bad = int(unparsed.sum())
    print(f"Warning: {n_bad} rows had unparsable timestamps and will be ignored for date calculations.")

# Overall time span, computed from the parsable timestamps only.
valid = df_ts["timestamp"].dropna()
if valid.empty:
    raise ValueError("No valid timestamps found in CSV.")
start_ts, end_ts = valid.min(), valid.max()
print(f"Overall start timestamp: {start_ts} (UTC)")
print(f"Overall end   timestamp: {end_ts} (UTC)")

# day_index = whole days between each row's calendar day and the first day
# (midnight-normalised); rows with NaT timestamps get <NA>.
start_date = start_ts.normalize()
df_ts["date_only"] = df_ts["timestamp"].dt.date
day_offsets = df_ts["timestamp"].dt.normalize() - start_date
df_ts["day_index"] = day_offsets.dt.days.astype("Int64")

# Count rows whose day_index falls in 0..11 (the first 12 calendar days).
day_index = df_ts["day_index"]
mask_first12 = day_index.notna() & (day_index >= 0) & (day_index <= 11)
rows_first12 = int(mask_first12.sum())
print(f"\nRows in first 12 days (day 0..11 starting {start_date.date()}): {rows_first12:,}")

# Count rows in contiguous 7-day blocks starting at day_index 12 (i.e. day 13).
# Largest day offset present among rows with a parsable timestamp.
max_day_index = int(df_ts["day_index"].dropna().max())

# Always start from an empty list: the previous `'blocks' in locals()` check
# could pick up a stale list left in the kernel by an earlier run of this cell.
blocks = []
if max_day_index < 12:
    print("\nNo rows beyond the first 12 days to form 7-day blocks.")
else:
    day_index = df_ts["day_index"]
    for block_id, block_start in enumerate(range(12, max_day_index + 1, 7), start=1):
        block_end = block_start + 6  # inclusive
        mask_block = day_index.notna() & (day_index >= block_start) & (day_index <= block_end)
        cnt = int(mask_block.sum())
        # The last block may be shorter than 7 days; report its real end day.
        last_day = min(block_end, max_day_index)
        start_day = (start_date + pd.Timedelta(days=block_start)).date()
        end_day = (start_date + pd.Timedelta(days=last_day)).date()
        blocks.append((block_id, block_start, last_day, start_day, end_day, cnt))

    print("\n7-day blocks starting from day 13 (day_index 12):")
    for bid, dstart, dend, sdate, edate, cnt in blocks:
        print(f" Block {bid}: day_index {dstart}..{dend}  ({sdate} -> {edate})  rows: {cnt:,}")

# Total rows covered by the first-12-days window plus all blocks (should be <= total).
covered = rows_first12 + sum(b[-1] for b in blocks)
print(f"\nRows covered by reported windows: {covered:,} (of {n_total:,})")
if covered != n_total:
    print("Note: remaining rows may have unparsable timestamps or fall outside counted day range.")


### Scenario B

In [None]:
# Cell 1: imports, configuration, backup
import os, shutil, math
from pathlib import Path
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------- USER CONFIG ----------
# CSV that is modified in place by the Scenario B cells.
CSV_PATH = Path(r"C:\Users\ishaa\OneDrive\Desktop\synthetic_data_final\synthetic_battery_inference_scenarioA.csv")
if not CSV_PATH.exists():
    raise FileNotFoundError(f"CSV not found at {CSV_PATH}. Update CSV_PATH if needed.")

# 1-based CSV row at which Scenario B begins.
ROW_B_1BASED = 226_801

# Seeded legacy RandomState so the generated scenario is reproducible.
SEED = 20251122
rng = np.random.RandomState(SEED)

# One-shot backup: copy the CSV next to itself unless a backup already exists.
BACKUP_PATH = CSV_PATH.with_name(CSV_PATH.stem + "_backup_before_scenarioB.csv")
if BACKUP_PATH.exists():
    print("Backup already present:", BACKUP_PATH)
else:
    shutil.copy2(CSV_PATH, BACKUP_PATH)
    print("Backup created:", BACKUP_PATH)

print("Config ready. Seed:", SEED, "Scenario-B start row (1-based):", ROW_B_1BASED)


In [None]:
# Cell 2: load CSV and determine the Scenario-B timestamp window
# Load the full CSV; every column is needed because it is written back later.
df = pd.read_csv(CSV_PATH, low_memory=False)
n_rows = len(df)
print("CSV rows:", n_rows)

if not (1 <= ROW_B_1BASED <= n_rows):
    raise IndexError(f"Provided start row {ROW_B_1BASED} is out of range (1..{n_rows})")

# Convert the 1-based row number to a 0-based position.
start_idx0 = ROW_B_1BASED - 1
row0 = df.iloc[start_idx0]
if "timestamp" not in df.columns:
    raise KeyError("'timestamp' column missing in CSV")

# Helper column with UTC-aware timestamps; reused by the following cells.
df["timestamp_parsed"] = pd.to_datetime(df["timestamp"], utc=True)

# The window starts at the start row's timestamp and spans 7 days.
t0 = df.loc[start_idx0, "timestamp_parsed"]
t_end = t0 + pd.Timedelta(days=7)

print("Scenario B window timestamps:")
print("  start (t0) =", t0)
print("  end (t0+7d)=", t_end)

# Rows to modify: timestamps in the half-open interval [t0, t0+7d).
parsed_ts = df["timestamp_parsed"]
mask_B = (parsed_ts >= t0) & (parsed_ts < t_end)
n_mask = mask_B.sum()
if n_mask == 0:
    raise ValueError("No rows found in the 7-day window starting at row {} timestamp {}".format(ROW_B_1BASED, t0))
print("Rows in Scenario-B window:", n_mask, " (row indices approx", start_idx0, "to", start_idx0 + n_mask - 1, ")")


In [None]:
# Corrected Cell 3: build a smooth 7-day mean curve for Scenario B (timezone-safe, vectorized)
from scipy.ndimage import gaussian_filter1d

# mask_B, df, t0 and t_end come from Cell 2.
rows_idx = np.flatnonzero(mask_B.values)
series_B_ts = df.loc[mask_B, "timestamp_parsed"]  # tz-aware timestamps of the window rows

# Whole-second offset of every window row relative to t0.
secs_offset = (series_B_ts - t0).dt.total_seconds().astype(int).values
if secs_offset.size == 0:
    raise ValueError("No seconds offsets computed for Scenario B rows")

# Length of the continuous 7-day window, in seconds.
window_sec = int((t_end - t0).total_seconds())
print("Scenario-B continuous window length (seconds):", window_sec)

# Share of the window given to each phase (tuneable; normalised below).
frac_map = {
    "baseline_long": 0.35,        # day 1-2 mostly healthy
    "pre_agitation": 0.20,        # small agitation (day 3)
    "event_rise": 0.15,           # ramp up into event (end of day 3)
    "failure_peak": 0.10,         # main transient (short)
    "service_drop": 0.05,         # quick service
    "recovery": 0.15              # rest of days 5-7
}

# Turn the fractions into consecutive inclusive (start, end) second ranges.
total_f = sum(frac_map.values())
cursor = 0
segments = {}
for phase, frac in frac_map.items():
    seg_len = int(round((frac/total_f) * window_sec))
    seg_last = max(cursor + seg_len - 1, cursor)
    segments[phase] = (cursor, seg_last)
    cursor += seg_len
# Stretch (or trim) the final phase so it ends exactly on the last second.
last_phase = list(segments.keys())[-1]
segments[last_phase] = (segments[last_phase][0], window_sec - 1)

print("Segments (sec ranges):")
for phase, (seg_a, seg_b) in segments.items():
    print(f"  {phase}: {seg_a} -> {seg_b} (len {seg_b-seg_a+1})")

# Lower/upper composite_score (error-like) bounds used by each phase.
B = {
    "baseline_low": 0.15,
    "baseline_high": 0.25,
    "pre_low": 0.25,
    "pre_high": 0.45,
    "rise_low": 0.45,
    "rise_high": 0.75,
    "failure_low": 0.75,
    "failure_high": 0.99,
    "service_low": 0.08,
    "service_high": 0.20,
    "recovery_low": 0.12,
    "recovery_high": 0.25,
}

def linear(a,b,t):
    """Linearly interpolate from `a` (at t=0) to `b` (at t=1)."""
    span = b - a
    return a + span*t
def triangular_peak(low, high, rel):
    """Triangular bump: `low` at rel=0 and rel=1, `high` at rel=0.5."""
    # Weight rises linearly 0 -> 1 over the first half, then falls back to 0.
    weight = 1.0 - abs(2.0*rel - 1.0)
    height = high - low
    return low + height * weight

# Build the per-second mean trend for the whole window, one phase at a time.
#
# Changes from the earlier per-second loop:
#   * the service-band target of the "service_drop" phase is drawn ONCE; drawing
#     it again for every second made this part of the "smooth" trend jitter
#     (note: this consumes fewer RNG draws, so later noise values differ too);
#   * each phase is filled with a single vectorized assignment instead of
#     iterating over every second in Python.

# Phase constants that do not depend on the position inside the phase.
failure_mid = (B["failure_low"] + B["failure_high"]) / 2.0
target = linear(B["service_low"], B["service_high"], rng.rand())
start_val = linear(B["service_low"], B["service_high"], 0.5)
end_val = linear(B["recovery_low"], B["recovery_high"], 0.5)


def _baseline_profile(rel):
    """Baseline ramp; also used for phase names without a dedicated profile."""
    return linear(B["baseline_low"], B["baseline_high"], rel)


# Phase name -> function of rel (0..1 position inside the phase) giving the mean score.
segment_profiles = {
    "baseline_long": _baseline_profile,
    "pre_agitation": lambda rel: linear(B["pre_low"], B["pre_high"], rel),
    "event_rise": lambda rel: linear(B["rise_low"], B["rise_high"], rel**1.1),
    "failure_peak": lambda rel: triangular_peak(B["failure_low"], B["failure_high"], rel),
    # drop from the middle of the failure band down to the service band
    "service_drop": lambda rel: linear(failure_mid, target, rel),
    # ramp from mid service band to mid recovery band
    "recovery": lambda rel: linear(start_val, end_val, rel),
}

# Seconds not covered by any phase keep the baseline start value, as before.
trend = np.full(window_sec, _baseline_profile(0.0), dtype=float)

# Fill in reverse order so that, where two ranges overlap, the phase listed
# first wins -- the same first-match rule the per-second search applied.
for name, (a, b) in reversed(list(segments.items())):
    lo, hi = max(a, 0), min(b, window_sec - 1)
    if hi < lo:
        continue
    rel = (np.arange(lo, hi + 1) - a) / max(1, (b - a))
    trend[lo:hi + 1] = segment_profiles.get(name, _baseline_profile)(rel)

# Light Gaussian smoothing (sigma in seconds) rounds the joins between phases.
trend_smooth = gaussian_filter1d(trend, sigma=30)
trend_smooth = np.clip(trend_smooth, 0.0, 1.0)

print("Trend created for Scenario B. Sample: ", trend_smooth[:5], "...", trend_smooth[-5:])


In [None]:
# Cell 4: build the per-second noise that is later added to trend_smooth for the
# Scenario-B window. Uses `rng`, `window_sec`, `trend_smooth` and
# `gaussian_filter1d` from the earlier cells. The order of the rng calls below
# determines the generated values, so do not reorder them.

# Low-frequency noise (LF): white noise passed through a Gaussian low-pass filter
lf_sigma_seconds = 1800   # filter sigma in seconds (~30 minutes); increase for longer wiggles
lf_white = rng.normal(0.0, 1.0, size=window_sec)
lf_noise = gaussian_filter1d(lf_white, sigma=lf_sigma_seconds)
# rescale so the LF component has standard deviation lf_amp (1e-12 guards against /0)
lf_amp = 0.045
lf_noise = lf_noise / (np.std(lf_noise) + 1e-12) * lf_amp

# High-frequency heteroskedastic noise (HF): white noise whose amplitude scales with trend
hf_base = 0.018
hf_white = rng.normal(0.0, 1.0, size=window_sec)
hf_noise = hf_white * hf_base * (0.6 + 0.8 * trend_smooth)  # more jitter during higher trend

# Short positive bursts: every second independently may start one, so bursts can overlap
burst_prob = 0.0012   # per-second probability of starting a burst
burst_amp = 0.10
for t in range(window_sec):
    if rng.rand() < burst_prob:
        width = rng.randint(20, 300)  # RandomState.randint excludes the upper bound: 20..299 s
        start = t
        end = min(window_sec - 1, t + width)  # truncate at the end of the window
        # triangular envelope; each sample also gets its own random gain in [0.5, 1.3)
        for j in range(start, end+1):
            rel = (j - start) / max(1, (end - start))
            burst_val = (1.0 - abs(2*rel - 1.0)) * burst_amp * (0.5 + rng.rand()*0.8)
            lf_noise[j] += burst_val  # bursts are accumulated into the LF channel

# Rare single-second positive glitches at distinct random positions (at least one)
num_extreme = max(1, int(window_sec * 0.0002))  # 0.02% of the seconds
extreme_positions = rng.choice(window_sec, size=num_extreme, replace=False)
for p in extreme_positions:
    lf_noise[p] += rng.uniform(0.15, 0.30)

# Combine both channels
combined_noise = lf_noise + hf_noise
# re-centre to zero mean so the noise does not shift the trend mean
# (bursts and glitches are all positive, so this shifts the rest slightly down)
combined_noise = combined_noise - np.mean(combined_noise)
print("Noise generated. lf_sigma_seconds:", lf_sigma_seconds, "lf_amp:", lf_amp, "hf_base:", hf_base)
print("Noise std:", np.std(combined_noise))


In [None]:
# Cell 5: compose final composite_score for all rows in the B window and assign back to df
# Map timestamp -> seconds offset relative to t0 and fetch final value from trend_smooth + combined_noise

# Per-second target series: smooth trend plus scaled noise, bounded to [0, 1].
noise_gain = 1.0  # tuneable
final_series = np.clip(trend_smooth + noise_gain * combined_noise, 0.0, 1.0)

# Cache the whole-second offset (relative to t0) of every distinct timestamp.
window_ts = df.loc[mask_B, "timestamp_parsed"]
unique_ts = window_ts.unique()
ts_to_sec = {ts: int((pd.Timestamp(ts) - t0).total_seconds()) for ts in unique_ts}

# Walk the window rows in order. The rng calls per row (rand, optional uniform,
# normal) must stay in this order to reproduce the same values for a given seed.
row_indices = df.index[mask_B]
new_cs = df.loc[mask_B, "composite_score"].values.copy()  # placeholder, overwritten below
for i, (idx, ts) in enumerate(zip(row_indices, window_ts)):
    # clamp the offset into the valid range of final_series
    sec = max(0, min(window_sec - 1, ts_to_sec[ts]))
    value = float(final_series[sec])
    # rare upward micro-spike (0.1% of rows), capped at 0.99
    if rng.rand() < 0.001:
        value = min(0.99, value + rng.uniform(0.08, 0.25))
    # small row-level jitter, then keep the score inside [0, 1]
    value = float(np.clip(value + rng.normal(0.0, 0.005), 0.0, 1.0))
    new_cs[i] = value

# Write the new scores back to the window rows only ...
df.loc[mask_B, "composite_score"] = new_cs
# ... and keep composite_health as the complement of the score.
df.loc[mask_B, "composite_health"] = 1.0 - df.loc[mask_B, "composite_score"]

def assign_label(val):
    """Map a composite score to a `(label, severity)` pair.

    Scores above 0.6 are ("anomaly", 2), scores above 0.4 are
    ("suspicious", 1), anything else is ("normal", 0).
    """
    if val > 0.6:
        return "anomaly", 2
    if val > 0.4:
        return "suspicious", 1
    return "normal", 0

# Derive label/severity from the freshly written scores of the window rows.
window_scores = df.loc[mask_B, "composite_score"].values
labels = [assign_label(score) for score in window_scores]
lab, sev = zip(*labels)  # split the (label, severity) pairs into two tuples
df.loc[mask_B, "anomaly_label"] = lab
df.loc[mask_B, "anomaly_severity"] = sev

print("Assigned composite_score for mask (rows):", n_mask)


In [None]:
# Cell 6: write the modified frame back over the existing CSV.
# The helper column is dropped first so the file keeps its original schema.
df_to_write = df.drop(columns=["timestamp_parsed"])

# Write to a sibling temp file, then swap it in with a single os.replace call.
tmp = CSV_PATH.with_suffix(".tmp")
df_to_write.to_csv(tmp, index=False)
os.replace(tmp, CSV_PATH)
print("CSV updated in place at:", CSV_PATH)


In [None]:
# Cell 7: diagnostics for the Scenario B window, read back from the saved CSV.
df_check = pd.read_csv(CSV_PATH, usecols=["timestamp","composite_score"])
df_check["timestamp"] = pd.to_datetime(df_check["timestamp"], utc=True)

# Keep only rows inside [t0, t_end), in time order.
check_ts = df_check["timestamp"]
mask_check = (check_ts >= t0) & (check_ts < t_end)
df_b = df_check.loc[mask_check].sort_values("timestamp").reset_index(drop=True)
print("Rows in B-diagnostics:", len(df_b))

# Per-calendar-day summary statistics of the score.
df_b["date_only"] = df_b["timestamp"].dt.date
perday = (
    df_b.groupby("date_only")["composite_score"]
    .agg(['count','mean','std','min','max'])
    .reset_index()
)
print("\nPer-day composite_score summary for Scenario B window:")
print(perday.to_string(index=False))

# Flag rows above the threshold and number each run of equal flags.
threshold = 0.6
is_anom = (df_b["composite_score"] > threshold).astype(int)
df_b["is_anom"] = is_anom
df_b["seg_change"] = is_anom.diff().fillna(is_anom)
df_b["seg_id"] = (is_anom != is_anom.shift(1)).cumsum()

# Collect one record per anomalous run.
segments = []
for seg_id, grp in df_b.groupby("seg_id"):
    if grp["is_anom"].iloc[0] != 1:
        continue
    start = grp["timestamp"].iloc[0]
    end = grp["timestamp"].iloc[-1]
    dur_s = (end - start).total_seconds()
    segments.append({"seg_id": int(seg_id), "start": str(start), "end": str(end), "duration_s": int(dur_s), "n_points": int(len(grp))})

# Longest runs first; show at most ten.
segments_sorted = sorted(segments, key=lambda seg: seg["duration_s"], reverse=True)
print(f"\nDetected {len(segments)} anomaly segments in B-window (threshold > {threshold}). Top segments:")
for s in segments_sorted[:10]:
    print(f"  seg {s['seg_id']}: {s['start']} -> {s['end']}, dur_s={s['duration_s']}, n={s['n_points']}")


In [None]:
# Cell: Compressed timeline plot for first 12 days (Scenario A) + next 7 days (Scenario B),
# with B-window main-peak marker.
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

CSV_PATH = Path(r"C:\Users\ishaa\OneDrive\Desktop\synthetic_data_final\synthetic_battery_inference_scenarioA.csv")
if not CSV_PATH.exists():
    raise FileNotFoundError(f"CSV not found: {CSV_PATH}")

# --- Load only the two columns the plot needs ---
# NOTE: this rebinds `df`; the full frame loaded by the Scenario-B cells is replaced.
use_cols = ["timestamp", "composite_score"]
df = (
    pd.read_csv(CSV_PATH, usecols=use_cols, low_memory=False)
    .dropna(subset=["timestamp", "composite_score"])
    .reset_index(drop=True)
)
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")
df = df.dropna(subset=["timestamp"]).sort_values("timestamp").reset_index(drop=True)

# --- Scenario A window: 12 days from midnight (UTC) of the first day ---
start_ts = df["timestamp"].min()
start_date = start_ts.normalize()
first12_end = start_date + pd.Timedelta(days=12)  # exclusive

# --- Scenario B start: prefer the timestamp found at row 226,801 of this frame ---
# NOTE(review): rows were dropped/sorted above, so this position matches the CSV
# row number only if nothing was dropped and the file is time-ordered -- confirm.
ROW_B_1BASED = 226_801
t0 = None
if len(df) >= ROW_B_1BASED:
    try:
        t0_candidate = pd.to_datetime(df.loc[ROW_B_1BASED - 1, "timestamp"], utc=True)
        # accept the candidate only when it lies at/after the end of the 12-day window
        if t0_candidate >= first12_end:
            t0 = t0_candidate
    except Exception:
        t0 = None

# Fallback: assume B starts exactly at day 13 (first12_end).
if t0 is None:
    t0 = first12_end
scenarioB_start = t0
scenarioB_end = scenarioB_start + pd.Timedelta(days=7)

print(f"Plot window: {start_date.date()} (start)  ->  {scenarioB_end.date()} (end)")
print("Using Scenario-B start (t0) =", scenarioB_start)

# --- Keep rows from the first day up to (not including) the end of the B window ---
combined_start = start_date
combined_end = scenarioB_end  # exclusive end
all_ts = df["timestamp"]
mask = (all_ts >= combined_start) & (all_ts < combined_end)
df_sel = df.loc[mask].copy().reset_index(drop=True)
if df_sel.empty:
    raise ValueError("No rows found in the requested combined window. Check timestamps/row indices.")

print(f"Rows selected for plotting: {len(df_sel):,} (from {df_sel['timestamp'].min()} to {df_sel['timestamp'].max()})")

# --- Compress timeline: cut out idle gaps longer than GAP_MINUTES ---
GAP_MINUTES = 30
gap_thresh = pd.Timedelta(minutes=GAP_MINUTES)

# A new segment starts wherever the step from the previous row exceeds the threshold.
ts = df_sel["timestamp"]
diffs = ts.diff().fillna(pd.Timedelta(seconds=0))
is_segment_start = (diffs > gap_thresh)
segment_id = is_segment_start.cumsum()

# For every segment, accumulate the total idle time that precedes it; that
# amount is subtracted from all of the segment's timestamps.
shifts = pd.Series(pd.Timedelta(0), index=df_sel.index)
cumulative_shift = pd.Timedelta(0)
prev_end = ts.iloc[0]
for seg_idx, (_, grp) in enumerate(df_sel.groupby(segment_id)):
    seg_times = grp["timestamp"]
    seg_start = seg_times.iloc[0]
    seg_end = seg_times.iloc[-1]
    if seg_idx > 0:
        # the whole gap between the previous segment's end and this start is removed
        gap = seg_start - prev_end
        remove = gap
        cumulative_shift = cumulative_shift + remove
    shifts.loc[grp.index] = cumulative_shift
    prev_end = seg_end

df_sel["shifted_ts"] = df_sel["timestamp"] - shifts.values

# --- Resample to 1-minute resolution, then take a 1-hour rolling mean (on the shifted index) ---
# The bins were previously 3 minutes wide ("3T"), which turned the 60-sample
# rolling window into a 3-hour mean although the plot labels it "1-hour".
# "1min" is used instead of the "T" alias, which newer pandas deprecates.
df_shift = df_sel.set_index("shifted_ts").sort_index()
resampled = df_shift["composite_score"].resample("1min").mean().interpolate(limit_direction='both')
hr_line = resampled.rolling(window=60, min_periods=1).mean()  # 60 x 1 min = 1 hour

# --- Locate the highest score inside the B window (selected by original timestamps) ---
sel_ts = df_sel["timestamp"]
mask_b = (sel_ts >= scenarioB_start) & (sel_ts < scenarioB_end)
if not mask_b.any():
    # nothing to mark; the plotting code checks peak_shifted_ts for None
    peak_shifted_ts = None
    print("No rows inside the B-window within the selected combined window (unexpected).")
else:
    df_b = df_sel.loc[mask_b]
    peak_idx = df_b["composite_score"].idxmax()
    peak_row = df_sel.loc[peak_idx]
    peak_ts = peak_row["timestamp"]
    peak_val = float(peak_row["composite_score"])
    peak_shifted_ts = peak_row["shifted_ts"]
    print(f"Main B-window peak at {peak_ts} (shifted {peak_shifted_ts}) value={peak_val:.4f}")

# --- Plot: raw points plus the rolling-mean line on the compressed time axis ---
fig, ax = plt.subplots(figsize=(16,5))
ax.scatter(df_shift.index, df_shift["composite_score"], s=6, alpha=0.20, color="tab:blue", label="composite_score (points)")
ax.plot(hr_line.index, hr_line.values, linewidth=2.0, color="#D62728", label="1-hour rolling mean")

# Shade the B window between its first and last row on the shifted axis.
if mask_b.any():
    b_masked = df_sel.loc[mask_b]
    b_start_shifted = b_masked["shifted_ts"].iloc[0]
    b_end_shifted = b_masked["shifted_ts"].iloc[-1]
    ax.axvspan(b_start_shifted, b_end_shifted, color="gray", alpha=0.12, label="Scenario B window")

# Mark and annotate the B-window peak, when one was found.
if peak_shifted_ts is not None:
    ax.axvline(peak_shifted_ts, color="black", linestyle="--", linewidth=1.2, label="B-window peak")
    ax.plot([peak_shifted_ts], [peak_val], marker="o", color="black", markersize=6)
    ax.annotate(f"Peak {peak_val:.2f}\n{peak_ts.strftime('%Y-%m-%d %H:%M')}",
                xy=(peak_shifted_ts, peak_val), xytext=(8, 8), textcoords="offset points", fontsize=9,
                bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="gray", alpha=0.8))

ax.set_title("Composite Score — First 12 days (Scenario A) + next 7 days (Scenario B)\nCompressed operating timeline (gaps >30min removed)")
ax.set_xlabel("Compressed operating time (datetime-like, original gaps removed)")
ax.set_ylabel("Composite Score (error-like)")
ax.legend()
ax.grid(alpha=0.25)
fig.tight_layout()
plt.show()


### Scenario C

In [None]:
import pandas as pd
from pathlib import Path

CSV_PATH = Path(r"C:\Users\ishaa\OneDrive\Desktop\synthetic_data_final\synthetic_battery_inference_scenarioA.csv")

# Load the full CSV and add a UTC-aware helper column used to build the C window.
df = pd.read_csv(CSV_PATH)
df['timestamp_parsed'] = pd.to_datetime(df['timestamp'], utc=True)

total_rows = len(df)

print("Total rows:", total_rows)

# SCENARIO A/B/C INDEX RANGES
A_end = 226_800
B_end = 320_400
C_start_idx = 320_400          # 0-based
C_end_idx   = 475_200 - 1      # 0-based inclusive end

print("\nScenario A rows: 1 → 226800")
print("Scenario B rows: 226801 → 320400")
print("Scenario C rows: 320401 → 475200\n")

print("Using 0-based indices:")
print("Scenario C index start:", C_start_idx)
print("Scenario C index end  :", C_end_idx)

# The C window starts at the timestamp of its first row and spans 7 days.
C_start_ts = df.loc[C_start_idx, 'timestamp_parsed']
C_end_ts   = C_start_ts + pd.Timedelta(days=7)

print("\nScenario C time window:")
print("C_start_ts:", C_start_ts)
print("C_end_ts  :", C_end_ts)

# Rows whose timestamp lies in [C_start_ts, C_end_ts)
mask_C = (df['timestamp_parsed'] >= C_start_ts) & (df['timestamp_parsed'] < C_end_ts)

# Rows whose index lies in the canonical index range. The combined mask is
# built first and summed afterwards; previously `.sum()` bound only to the
# second comparison, so an array was printed instead of a row count.
in_index_window = (df.index >= C_start_idx) & (df.index <= C_end_idx)

print("\nRows inside the timestamp window:", mask_C.sum())
print("Rows inside index window        :", in_index_window.sum())

# Combined strict mask: must be within BOTH index + timestamp range
mask_C_strict = mask_C & (df.index.to_series().between(C_start_idx, C_end_idx))
print("Final strict Scenario-C row count:", mask_C_strict.sum())


In [None]:
import numpy as np
from scipy.ndimage import gaussian_filter1d

# One trend sample per second of the 7-day C window.
C_window_secs = int((C_end_ts - C_start_ts).total_seconds())
print("Scenario C window seconds:", C_window_secs)

# Near-healthy baseline that drifts slightly upward over the window.
base_start = 0.16
base_end   = 0.22

# Linear drift -> Gaussian smoothing (sigma = 1800 samples) -> clamp to [0, 1].
trend = np.clip(
    gaussian_filter1d(np.linspace(base_start, base_end, C_window_secs), sigma=1800),
    0.0,
    1.0,
)

print("Trend sample:", trend[:5], "...", trend[-5:])


In [None]:
# NOTE: this rebinds `rng` to a numpy Generator; the Scenario-B cells use the
# legacy RandomState API (rng.rand / rng.randint) under the same name.
rng = np.random.default_rng(42)

BURSTS_PER_DAY = 12               # tuneable
NUM_BURSTS = BURSTS_PER_DAY * 7

# Draw start second, duration and amplitude for every burst (in this order).
burst_starts = rng.integers(0, C_window_secs, size=NUM_BURSTS)
burst_dur    = rng.integers(20, 120, size=NUM_BURSTS)   # upper bound exclusive: 20..119 s
burst_amp    = rng.uniform(0.25, 0.55, size=NUM_BURSTS) # added on top of the trend

# (start, inclusive end, amplitude); the end is truncated at the last window second.
last_sec = C_window_secs - 1
bursts = [
    (begin, min(begin + length, last_sec), height)
    for begin, length, height in zip(burst_starts, burst_dur, burst_amp)
]

print("Generated bursts:", len(bursts))


In [None]:
# Low-frequency component: Gaussian-filtered white noise rescaled to std 0.04.
lf = gaussian_filter1d(rng.normal(0, 1, C_window_secs), sigma=3600)
lf = lf / lf.std() * 0.04   # amplitude

# High-frequency component: independent per-second Gaussian noise.
hf = rng.normal(0, 0.02, C_window_secs)

# Sum of both components, re-centred to zero mean.
noise = lf + hf
noise = noise - noise.mean()

# Overlay every burst as a triangular bump (zero at both ends, `height` mid-way).
noise_burst = noise.copy()
for first, last, height in bursts:
    n_samples = last - first + 1
    position = np.linspace(0,1,n_samples)
    bump = 1 - np.abs(2*position - 1)   # triangular
    noise_burst[first:last+1] += bump * height

# Trend plus noise and bursts, clamped to the valid score range.
final_series = np.clip(trend + noise_burst, 0.0, 1.0)

print("Final series sample:", final_series[:5], "...", final_series[-5:])


In [None]:
# Whole-second offset of every strict Scenario-C row relative to the window start.
offset_secs = ((df.loc[mask_C_strict, 'timestamp_parsed'] - C_start_ts)
                 .dt.total_seconds()
                 .astype(int))

# Look up the per-second series value for each row.
values_C = final_series[offset_secs.values]

# Apply small per-row jitter and keep the result inside [0, 1].
values_C = np.clip(values_C + rng.normal(0,0.005,len(values_C)), 0, 1)

# Update the score and its complement.
df.loc[mask_C_strict, 'composite_score'] = values_C
df.loc[mask_C_strict, 'composite_health'] = 1 - values_C

# Keep the derived label columns in step with the new scores, as the Scenario-B
# cell does; otherwise these rows keep labels that belong to the old scores.
# Thresholds mirror the Scenario-B rule: > 0.6 anomaly (2), > 0.4 suspicious (1),
# else normal (0). Columns are only touched if they already exist.
above_thresholds = [values_C > 0.6, values_C > 0.4]
if 'anomaly_label' in df.columns:
    df.loc[mask_C_strict, 'anomaly_label'] = np.select(
        above_thresholds, ["anomaly", "suspicious"], default="normal"
    )
if 'anomaly_severity' in df.columns:
    df.loc[mask_C_strict, 'anomaly_severity'] = np.select(above_thresholds, [2, 1], default=0)

print("Updated Scenario-C rows:", mask_C_strict.sum())


In [None]:
import os

# Write to a sibling temp file first, then swap it in with os.replace.
tmp = CSV_PATH.with_suffix(".tmp")
# Drop the helper column before saving (as the Scenario-B write does) so the
# CSV does not gain an extra `timestamp_parsed` column; errors="ignore" keeps
# this safe if the column is not present.
df.drop(columns=["timestamp_parsed"], errors="ignore").to_csv(tmp, index=False)
os.replace(tmp, CSV_PATH)

print("CSV updated successfully:", CSV_PATH)


In [None]:
# Corrected Cell 15: diagnostics and compressed timeline plot for A (first 12 days) + B + C
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

CSV_PATH = r"C:\Users\ishaa\OneDrive\Desktop\synthetic_data_final\synthetic_battery_inference_scenarioA.csv"

# Re-load only timestamp + score, drop incomplete rows, parse and sort by time.
use_cols = ["timestamp", "composite_score"]
df_plot = (
    pd.read_csv(CSV_PATH, usecols=use_cols)
    .dropna(subset=["timestamp", "composite_score"])
    .reset_index(drop=True)
)
df_plot["timestamp"] = pd.to_datetime(df_plot["timestamp"], utc=True)
df_plot = df_plot.sort_values("timestamp").reset_index(drop=True)

# Scenario A window: 12 days from midnight (UTC) of the first day.
start_ts = df_plot["timestamp"].min()
start_date = start_ts.normalize()
first12_end = start_date + pd.Timedelta(days=12)

# Scenario B window: reuse t0 / t_end if they are still defined in the kernel,
# otherwise fall back to the first timestamp at/after the end of the A window.
# NOTE(review): t0 and t_end may have been set by different earlier cells, so
# the reused pair depends on which cells ran before this one -- confirm.
try:
    B_start = t0   # earlier scenario B start (if available)
    B_end   = t_end
except NameError:
    after_first12 = df_plot["timestamp"] >= first12_end
    B_start = df_plot.loc[after_first12, "timestamp"].min()
    B_end   = B_start + pd.Timedelta(days=7)

# Scenario C: canonical rows 320401..475200 (1-based); the start timestamp is
# read at 0-based position 320400 of the cleaned, time-sorted frame.
total_rows = len(df_plot)
idx_c_start_0 = 320400
idx_c_end_0 = 475200 - 1

if idx_c_start_0 >= total_rows:
    raise IndexError(f"CSV has only {total_rows} rows but expected index {idx_c_start_0} for C start.")
C_start = df_plot.loc[idx_c_start_0, "timestamp"]

C_end = C_start + pd.Timedelta(days=7)

print("Plot windows:\n A: {} -> {}\n B: {} -> {}\n C: {} -> {}".format(start_date, first12_end, B_start, B_end, C_start, C_end))

# Keep everything from the first day up to (not including) the end of the C window.
combined_start = start_date
combined_end = C_end
plot_ts = df_plot["timestamp"]
mask_combined = (plot_ts >= combined_start) & (plot_ts < combined_end)
df_sel = df_plot.loc[mask_combined].copy().reset_index(drop=True)
if df_sel.empty:
    raise ValueError("No rows found in the combined plotting window. Check indices/timestamps.")

print("Rows in combined plotting window:", len(df_sel))

# Compress the timeline by cutting out idle gaps longer than 30 minutes.
GAP_MINUTES = 30
gap_thresh = pd.Timedelta(minutes=GAP_MINUTES)

# A new segment starts wherever the step from the previous row exceeds the threshold.
ts = df_sel["timestamp"]
diffs = ts.diff().fillna(pd.Timedelta(seconds=0))
is_segment_start = (diffs > gap_thresh)
segment_id = is_segment_start.cumsum()

# Each segment is moved back by the total idle time that precedes it.
shifts = pd.Series(pd.Timedelta(0), index=df_sel.index)
cumulative_shift = pd.Timedelta(0)
prev_end = ts.iloc[0]
for seg_idx, (_, grp) in enumerate(df_sel.groupby(segment_id)):
    seg_times = grp["timestamp"]
    seg_start = seg_times.iloc[0]
    seg_end = seg_times.iloc[-1]
    if seg_idx > 0:
        # the whole gap between the previous segment's end and this start is removed
        gap = seg_start - prev_end
        remove = gap
        cumulative_shift = cumulative_shift + remove
    shifts.loc[grp.index] = cumulative_shift
    prev_end = seg_end

df_sel["shifted_ts"] = df_sel["timestamp"] - shifts.values

# 1-minute resample on the compressed axis, then a 60-sample (1-hour) rolling mean.
# "1min" replaces the "1T" alias, which newer pandas versions deprecate.
df_shift = df_sel.set_index("shifted_ts").sort_index()
resampled = df_shift["composite_score"].resample("1min").mean().interpolate(limit_direction='both')
hr_line = resampled.rolling(window=60, min_periods=1).mean()

# Start the figure: raw points plus the rolling-mean line.
plt.figure(figsize=(16,5))
plt.scatter(df_shift.index, df_shift["composite_score"], s=6, alpha=0.18, color="tab:blue", label="composite_score (points)")
plt.plot(hr_line.index, hr_line.values, linewidth=2.0, color="#D62728", label="1-hour rolling mean")

# compute shifted boundaries for B and C windows using timestamp-based selection (robust)
def shifted_span_for_window(window_start, window_end, df_sel):
    rows = df_sel[(df_sel["timestamp"] >= window_start) & (df_sel["timestamp"] < window_end)]
    if rows.empty:
        return None, None
    return rows["shifted_ts"].iloc[0], rows["shifted_ts"].iloc[-1]

# Shifted-axis boundaries of the B and C windows.
b_start_shifted, b_end_shifted = shifted_span_for_window(B_start, B_end, df_sel)
c_start_shifted, c_end_shifted = shifted_span_for_window(C_start, C_end, df_sel)

# Shade each window that actually contains rows.
window_spans = (
    (b_start_shifted, b_end_shifted, "gray", "Scenario B window"),
    (c_start_shifted, c_end_shifted, "orange", "Scenario C window"),
)
for span_start, span_end, span_color, span_label in window_spans:
    if span_start is not None:
        plt.axvspan(span_start, span_end, color=span_color, alpha=0.12, label=span_label)

# Mark and annotate the three highest scores inside the C window (if any).
if c_start_shifted is not None:
    rows_c = df_sel[(df_sel["timestamp"] >= C_start) & (df_sel["timestamp"] < C_end)]
    if not rows_c.empty:
        top3 = rows_c.nlargest(3, "composite_score")
        for r in top3.itertuples(index=False):
            shifted = r.shifted_ts
            val = r.composite_score
            plt.plot(shifted, val, marker="o", color="black")
            plt.annotate(f"{val:.2f}\n{r.timestamp.strftime('%Y-%m-%d %H:%M')}",
                         xy=(shifted, val), xytext=(6,6), textcoords="offset points", fontsize=8,
                         bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="gray", alpha=0.8))

# Titles, labels and final rendering.
plt.title("Composite Score — A (first 12 days) + B (7d) + C (7d) — compressed timeline")
plt.xlabel("Compressed operating time (gaps >30min removed)")
plt.ylabel("Composite Score")
plt.legend()
plt.grid(alpha=0.25)
plt.tight_layout()
plt.show()

# Diagnostics for the C window: per-day summary, then anomalous runs.
in_c_window = (df_sel["timestamp"] >= C_start) & (df_sel["timestamp"] < C_end)
df_c = df_sel[in_c_window].copy()
print("\nC-window per-day composite summary (if any):")
if df_c.empty:
    print("No rows in C-window for diagnostics.")
else:
    df_c["date_only"] = df_c["timestamp"].dt.date
    perday = (
        df_c.groupby("date_only")["composite_score"]
        .agg(['count','mean','std','min','max'])
        .reset_index()
    )
    print(perday.to_string(index=False))

# Runs of consecutive rows with a score above the threshold (0.6).
if df_c.empty:
    print("No anomaly segments in C-window.")
else:
    thr = 0.6
    df_c["is_anom"] = (df_c["composite_score"] > thr).astype(int)
    # a new run id starts whenever the flag differs from the previous row
    df_c["seg_id"] = (df_c["is_anom"] != df_c["is_anom"].shift(1)).cumsum()
    segs = []
    for seg_id, grp in df_c.groupby("seg_id"):
        if grp["is_anom"].iloc[0] != 1:
            continue
        start = grp["timestamp"].iloc[0]
        end = grp["timestamp"].iloc[-1]
        segs.append({"seg_id":int(seg_id), "start":str(start), "end":str(end), "dur_s":int((end-start).total_seconds()), "n":len(grp)})
    # longest runs first; show at most ten
    segs_sorted = sorted(segs, key=lambda seg: seg["dur_s"], reverse=True)
    print(f"\nDetected {len(segs_sorted)} anomaly segments in C-window (threshold>{thr}). Top 10:")
    for s in segs_sorted[:10]:
        print(f"  seg {s['seg_id']}: start={s['start']} end={s['end']} dur_s={s['dur_s']} n={s['n']}")


### Scenario D

In [None]:
# Cell 16: Verify Scenario D canonical index window and derive timestamps (strict index enforcement)
import pandas as pd
from pathlib import Path

CSV_PATH = Path(r"C:\Users\ishaa\OneDrive\Desktop\synthetic_data_final\synthetic_battery_inference_scenarioA.csv")
if not CSV_PATH.exists():
    raise FileNotFoundError(f"CSV not found: {CSV_PATH}")

# Only the timestamp column is read; a UTC-aware parsed copy is kept next to the
# raw strings (unparsable values become NaT because of errors="coerce").
df_meta = pd.read_csv(CSV_PATH, usecols=["timestamp"], low_memory=False).assign(
    timestamp_parsed=lambda frame: pd.to_datetime(frame["timestamp"], utc=True, errors="coerce")
)

total_rows = df_meta.shape[0]
print("Total rows in CSV:", total_rows)

# Canonical 1-based row ranges for the scenarios.
A_end_1, B_end_1 = 226_800, 320_400
D_start_1, D_end_1 = 475_201, 604_800

# Same D bounds expressed as 0-based row labels.
D_start_idx0, D_end_idx0 = D_start_1 - 1, D_end_1 - 1

if not D_end_idx0 < total_rows:
    raise IndexError(f"CSV has {total_rows} rows but expected D_end index {D_end_idx0} to exist.")

# The window start is the timestamp of the first D row; the window end is
# exactly seven days later and is treated as exclusive below.
D_start_ts = df_meta.at[D_start_idx0, "timestamp_parsed"]
D_end_ts_by_index = df_meta.at[D_end_idx0, "timestamp_parsed"]
D_end_ts = D_start_ts + pd.Timedelta(days=7)

print("Scenario D index range (1-based):", D_start_1, "->", D_end_1)
print("Scenario D index range (0-based):", D_start_idx0, "->", D_end_idx0)
print("Derived D_start_ts:", D_start_ts)
print("Derived D_end_ts (7 days after start):", D_end_ts)
print("Timestamp at D_end index (sanity):", D_end_ts_by_index)

# Three row masks: by time window, by row-label range (both ends inclusive),
# and their intersection, which is the one intended for updates.
row_labels = df_meta.index.to_series()
parsed_ts = df_meta["timestamp_parsed"]
df_meta["in_D_time"] = parsed_ts.ge(D_start_ts) & parsed_ts.lt(D_end_ts)
df_meta["in_D_index"] = (row_labels >= D_start_idx0) & (row_labels <= D_end_idx0)
df_meta["in_D_strict"] = df_meta["in_D_time"] & df_meta["in_D_index"]

n_time, n_index, n_strict = (
    int(df_meta[flag_col].sum()) for flag_col in ("in_D_time", "in_D_index", "in_D_strict")
)

print(f"Rows in D time window: {n_time:,}, rows in index window: {n_index:,}, strict intersection: {n_strict:,}")

if not n_strict:
    raise RuntimeError("No rows in strict Scenario D slice — check canonical indices/timestamps.")


In [None]:
# Cell 17: build a very gradual rising 7-day trend for Scenario D
import numpy as np
from scipy.ndimage import gaussian_filter1d

# Length of the Scenario D window in whole seconds; the trend below has one
# sample per second of that window.
d_window_span = D_end_ts - D_start_ts
window_sec = int(d_window_span.total_seconds())
print("Scenario D window seconds:", window_sec)

# Endpoints of the slow upward drift (tunable).
start_val = 0.28
end_val   = 0.50  # end value chosen so growth is visible but slow across the window

# Straight ramp -> Gaussian smoothing (sigma is in samples, i.e. seconds:
# 7200 = 2 hours) -> clamp into [0, 1].
trend = np.clip(
    gaussian_filter1d(np.linspace(start_val, end_val, num=window_sec), sigma=7200),
    0.0,
    1.0,
)

print("Trend sample (head/tail):", trend[:5], "...", trend[-5:])


In [None]:
# Cell 18: generate LF + HF noise and optionally a couple of tiny bursts (very rare)
import numpy as np
rng = np.random.default_rng(20251123)

# Noise parameters, kept small so the gradual drift stays visible.
lf_sigma_seconds = 7200      # Gaussian sigma for the slow component (7200 samples = 2 hours)
lf_amp = 0.02                # target standard deviation of the slow component
hf_base = 0.008              # standard deviation of the per-second jitter
row_jitter_sd = 0.003        # standard deviation of the per-row jitter added later

# Slow component: white noise, smoothed, then rescaled so its std equals lf_amp
# (the 1e-12 guards against division by zero).
lf_white = rng.normal(0.0, 1.0, window_sec)
lf_noise = gaussian_filter1d(lf_white, sigma=lf_sigma_seconds)
lf_noise = lf_noise / (lf_noise.std() + 1e-12) * lf_amp

# Fast component: independent normal jitter per second.
hf_noise = rng.normal(loc=0.0, scale=hf_base, size=window_sec)

# Sum of both components, re-centred to zero mean.
combined_noise = lf_noise + hf_noise
combined_noise -= combined_noise.mean()

# Between 0 and 2 short triangular bursts (the upper bound 3 is exclusive).
# NOTE: the order of the rng draws below determines the generated values.
num_bursts = rng.integers(0, 3)
bursts = []
for _ in range(num_bursts):
    burst_start = int(rng.integers(0, window_sec - 60))
    burst_len = int(rng.integers(20, 61))  # 20..60 seconds
    burst_end = min(window_sec - 1, burst_start + burst_len)
    burst_amp = float(rng.uniform(0.03, 0.12))
    bursts.append((burst_start, burst_end, burst_amp))
    # Triangle rising from 0 to 1 at the midpoint and back to 0.
    ramp = np.linspace(0, 1, burst_end - burst_start + 1)
    combined_noise[burst_start:burst_end + 1] += (1 - np.abs(2 * ramp - 1.0)) * burst_amp

print("LF std:", float(np.std(lf_noise)), "HF std:", float(np.std(hf_noise)), "num_bursts:", num_bursts)
print("Example bursts (if any):", bursts)


In [None]:
# Cell 19: compose final series and map to strict Scenario D rows (assign composite_score, health, labels)
import pandas as pd
import numpy as np

# Per-second target series: trend plus noise, clamped into [0, 1].
final_series = (trend + combined_noise).clip(0.0, 1.0)

# The full CSV is loaded because rows are modified and later written back.
df = pd.read_csv(CSV_PATH, low_memory=False)
df["timestamp_parsed"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")

# Rebuild the strict mask on this frame: row label inside the canonical D range
# AND timestamp inside [D_start_ts, D_end_ts).
row_labels = df.index.to_series()
mask_index = (row_labels >= D_start_idx0) & (row_labels <= D_end_idx0)
mask_time = df["timestamp_parsed"].ge(D_start_ts) & df["timestamp_parsed"].lt(D_end_ts)
mask_strict = mask_time & mask_index

n_to_update = int(mask_strict.sum())
print("Rows to update (strict):", n_to_update)

if not n_to_update:
    raise RuntimeError("No rows selected for Scenario D update. Aborting to avoid accidental edits.")

# Whole seconds elapsed since the window start, clipped so every offset is a
# valid index into final_series.
elapsed = df.loc[mask_strict, "timestamp_parsed"] - D_start_ts
offsets = elapsed.dt.total_seconds().astype(int).clip(0, window_sec - 1).values

# Look up the series value for each row and add a small independent jitter.
vals = final_series[offsets] + rng.normal(0.0, row_jitter_sd, size=offsets.shape)
vals = np.clip(vals, 0.0, 1.0)

# Write back only the strict rows; health is the complement of the score.
df.loc[mask_strict, "composite_score"] = vals
df.loc[mask_strict, "composite_health"] = 1.0 - df.loc[mask_strict, "composite_score"]

# Recompute labels/severity for strict rows
def label_and_sev(v):
    """Map a composite score to a ``(label, severity)`` pair.

    Scores strictly above 0.6 give ``("anomaly", 2)``, scores strictly above
    0.4 give ``("suspicious", 1)``, and everything else gives ``("normal", 0)``.
    """
    # Bands are checked from the highest threshold down; the first hit wins.
    for lower_bound, label, severity in ((0.6, "anomaly", 2), (0.4, "suspicious", 1)):
        if v > lower_bound:
            return label, severity
    return "normal", 0

# Classify every strict-row score, then split the (label, severity) pairs into
# two parallel sequences for column assignment.
labs = list(map(label_and_sev, df.loc[mask_strict, "composite_score"].values))
if labs:
    lab_col, sev_col = zip(*labs)
else:
    lab_col, sev_col = [], []
df.loc[mask_strict, "anomaly_label"] = lab_col
df.loc[mask_strict, "anomaly_severity"] = sev_col

print("Assigned composite_score + labels for Scenario D rows.")


In [None]:
# Cell 20: atomic write back to CSV (safe replace)
import os

# Write to a sibling ".tmp" file first, then swap it over the original with
# os.replace so the CSV is never left half-written.
tmp = CSV_PATH.with_suffix(".tmp")
# The parsed-timestamp helper column is not written back to the CSV.
df_to_write = df.drop(columns=["timestamp_parsed"]) if "timestamp_parsed" in df.columns else df
df_to_write.to_csv(tmp, index=False)
os.replace(tmp, CSV_PATH)
print("CSV updated in place:", CSV_PATH)


In [None]:
# Cell 21: Diagnostics + compressed timeline plot for A (first 12d) + B + C + D (showing D gradual rise)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Reload only the columns needed for plotting and parse timestamps as UTC.
use_cols = ["timestamp", "composite_score"]
df_raw = pd.read_csv(CSV_PATH, usecols=use_cols)
df_raw["timestamp"] = pd.to_datetime(df_raw["timestamp"], utc=True)

# The canonical scenario anchors are row positions in the CSV as written, so
# they are read from the raw frame BEFORE any row is dropped or re-ordered.
# Looking them up after dropna/sort_values/reset_index would silently pick a
# different row whenever a row is removed or moved.
C_START_IDX0 = 320400  # 0-based first row of Scenario C
C_start = df_raw.loc[C_START_IDX0, "timestamp"]
D_start = df_raw.loc[D_start_idx0, "timestamp"]  # D_start_idx0 comes from the Scenario D setup cell
if pd.isna(C_start) or pd.isna(D_start):
    raise ValueError("Canonical C/D start row has no timestamp; cannot derive plot windows.")
C_end = C_start + pd.Timedelta(days=7)
D_end = D_start + pd.Timedelta(days=7)

# Plot frame: rows with both a timestamp and a score, in time order.
df_plot = (
    df_raw.dropna(subset=["timestamp", "composite_score"])
    .sort_values("timestamp")
    .reset_index(drop=True)
)

# Combined window runs from midnight of the first day (A start) to D end.
start_date = df_plot["timestamp"].min().normalize()
A_first12_end = start_date + pd.Timedelta(days=12)
# B starts at the first row on or after day 12 and spans 7 days.
B_start = df_plot[df_plot["timestamp"] >= A_first12_end]["timestamp"].min()
B_end = B_start + pd.Timedelta(days=7)

print("Window dates: A start:", start_date, "-> D end:", D_end)

# Select the plotting window [start_date, D_end): D_end itself is exclusive.
mask_window = (df_plot["timestamp"] >= start_date) & (df_plot["timestamp"] < D_end)
df_sel = df_plot.loc[mask_window].copy().reset_index(drop=True)
print("Rows plotted:", len(df_sel))

# Compress the timeline by removing every gap longer than 30 minutes.
GAP_MINUTES = 30
gap_thresh = pd.Timedelta(minutes=GAP_MINUTES)
ts = df_sel["timestamp"]
# Time since the previous row (0 for the first row).
diffs = ts.diff().fillna(pd.Timedelta(seconds=0))
# A row starts a new segment when the gap before it exceeds the threshold.
is_segment_start = (diffs > gap_thresh)
segment_id = is_segment_start.cumsum()

# The gap in front of a segment is exactly the diff at its first row, so the
# shift for any row is the running sum of those gaps. This vectorized form
# replaces the per-segment loop and also copes with an empty selection.
shifts = diffs.where(is_segment_start, pd.Timedelta(0)).cumsum()

df_sel["shifted_ts"] = df_sel["timestamp"] - shifts.values

# Compute a 1-minute resample and a 1-hour rolling mean on the compressed axis.
# The bin width must be one minute for the 60-sample window below to span one
# hour, as the legend label claims. ("min" is the current spelling of the
# deprecated "T" minute alias.)
df_shift = df_sel.set_index("shifted_ts").sort_index()
resampled = df_shift["composite_score"].resample("1min").mean().interpolate(limit_direction='both')
hr_line = resampled.rolling(window=60, min_periods=1).mean()

# Plot the raw points and the smoothed line.
plt.figure(figsize=(18,6))
plt.scatter(df_shift.index, df_shift["composite_score"], s=5, alpha=0.20, label="composite_score (points)")
plt.plot(hr_line.index, hr_line.values, linewidth=2.2, color="#D62728", label="1-hour rolling mean")

# compute shifted spans for B, C, D for shading
def shifted_bounds(win_start, win_end, df_sel):
    """Return the first and last ``shifted_ts`` of rows inside a time window.

    Rows are selected where ``win_start <= timestamp < win_end``. The result is
    ``(first_shifted_ts, last_shifted_ts)`` in the frame's row order, or
    ``(None, None)`` when no row falls inside the window.
    """
    inside = df_sel["timestamp"].ge(win_start) & df_sel["timestamp"].lt(win_end)
    if not inside.any():
        return None, None
    shifted = df_sel.loc[inside, "shifted_ts"]
    return shifted.iloc[0], shifted.iloc[-1]

b_s, b_e = shifted_bounds(B_start, B_end, df_sel)
c_s, c_e = shifted_bounds(C_start, C_end, df_sel)
d_s, d_e = shifted_bounds(D_start, D_end, df_sel)

# Shade each scenario window that actually contains rows (B, then C, then D).
for span_lo, span_hi, span_color, span_label in (
    (b_s, b_e, "gray", "Scenario B"),
    (c_s, c_e, "orange", "Scenario C"),
    (d_s, d_e, "green", "Scenario D (gradual)"),
):
    if span_lo is None:
        continue  # no rows fell inside this window
    plt.axvspan(span_lo, span_hi, color=span_color, alpha=0.12, label=span_label)

# Title, labels, legend and grid are applied to the current axes.
current_axes = plt.gca()
current_axes.set_title("Composite Score — A + B + C + D (compressed timeline). Scenario D shows gradual upward trend.")
current_axes.set_xlabel("Compressed operating time (gaps removed)")
current_axes.set_ylabel("Composite Score")
current_axes.legend()
current_axes.grid(alpha=0.25)
plt.tight_layout()
plt.show()

# Diagnostics for the half-open Scenario D window [D_start, D_end): a per-day
# summary followed by the longest runs of rows above the anomaly threshold.
in_d_window = df_sel["timestamp"].ge(D_start) & df_sel["timestamp"].lt(D_end)
df_d = df_sel.loc[in_d_window].copy()
if df_d.empty:
    print("No rows found in D-window for diagnostics.")
else:
    df_d["date_only"] = df_d["timestamp"].dt.date
    perday = (
        df_d.groupby("date_only")["composite_score"]
        .agg(["count", "mean", "std", "min", "max"])
        .reset_index()
    )
    print("\nPer-day composite summary for Scenario D:")
    print(perday.to_string(index=False))

    thr = 0.6
    # 1 where the score exceeds the threshold, otherwise 0.
    df_d["is_anom"] = df_d["composite_score"].gt(thr).astype(int)
    # The run id increments every time the flag differs from the previous row.
    df_d["seg_id"] = df_d["is_anom"].ne(df_d["is_anom"].shift(1)).cumsum()
    segs = []
    for seg_id, grp in df_d.groupby("seg_id"):
        if grp["is_anom"].iloc[0] != 1:
            continue  # this run is below the threshold
        start = grp["timestamp"].iloc[0]
        end = grp["timestamp"].iloc[-1]
        segs.append(dict(
            seg_id=int(seg_id),
            start=str(start),
            end=str(end),
            dur_s=int((end - start).total_seconds()),
            n=len(grp),
        ))
    segs_sorted = sorted(segs, key=lambda rec: rec["dur_s"], reverse=True)
    print(f"\nDetected {len(segs_sorted)} anomaly segments in D-window (threshold>{thr}). Top segments:")
    for rec in segs_sorted[:10]:
        print(f"  seg {rec['seg_id']}: start={rec['start']}, dur_s={rec['dur_s']}, n={rec['n']}")


### Scenario E

In [None]:
# Cell 1 — Load CSV + Compute Scenario E window (strict index + timestamp)

import pandas as pd
from pathlib import Path

CSV_PATH = Path(r"C:\Users\ishaa\OneDrive\Desktop\synthetic_data_final\synthetic_battery_inference_scenarioA.csv")
if not CSV_PATH.exists():
    raise FileNotFoundError(CSV_PATH)

# Only the timestamp column is read; a UTC-aware parsed copy is kept next to the
# raw strings (unparsable values become NaT because of errors="coerce").
df_meta = pd.read_csv(CSV_PATH, usecols=["timestamp"]).assign(
    timestamp_parsed=lambda frame: pd.to_datetime(frame["timestamp"], utc=True, errors="coerce")
)

total_rows = df_meta.shape[0]
print("Total rows:", total_rows)

# Scenario E begins at this 1-based row; the 0-based label is one less.
E_start_1 = 604801
E_start_idx0 = E_start_1 - 1

if not E_start_idx0 < total_rows:
    raise RuntimeError("Start index exceeds file length.")

# The window opens at the first E row's timestamp and closes (exclusive)
# exactly seven days later.
E_start_ts = df_meta.at[E_start_idx0, "timestamp_parsed"]
E_end_ts = E_start_ts + pd.Timedelta(days=7)

print("Scenario E start index:", E_start_1)
print("Scenario E start timestamp:", E_start_ts)
print("Scenario E end timestamp (7 days after):", E_end_ts)

# Row masks: from the E start row through the last row, inside the time window,
# and the intersection of the two.
row_labels = df_meta.index.to_series()
parsed_ts = df_meta["timestamp_parsed"]
mask_index = (row_labels >= E_start_idx0) & (row_labels <= total_rows - 1)
mask_time = parsed_ts.ge(E_start_ts) & parsed_ts.lt(E_end_ts)
mask_E_strict = mask_index & mask_time

n_time, n_index, n_strict = (int(flags.sum()) for flags in (mask_time, mask_index, mask_E_strict))

print(f"Rows in time window: {n_time:,}")
print(f"Rows in index window: {n_index:,}")
print("Rows in strict Scenario-E slice:", n_strict)


In [None]:
# Cell 2 — Build 7-day healthy flat trend for Scenario E

import numpy as np
from scipy.ndimage import gaussian_filter1d

# Window length in whole seconds, taken from the two window timestamps; the
# trend below has one sample per second.
e_window_span = E_end_ts - E_start_ts
E_window_secs = int(e_window_span.total_seconds())
print("Scenario E window seconds:", E_window_secs)

# Nearly flat healthy baseline: 0.18 at the start, 0.22 at the end.
base_start = 0.18
base_end   = 0.22

# Straight ramp -> Gaussian smoothing (sigma is in samples, i.e. seconds:
# 5400 = 1.5 hours) -> clamp into [0, 1].
trend_E = np.clip(
    gaussian_filter1d(np.linspace(base_start, base_end, num=E_window_secs), sigma=5400),
    0,
    1,
)

print("Trend sample:", trend_E[:5], "...", trend_E[-5:])


In [None]:
# Cell 3 — LF noise + HF jitter + rare micro-blips for Scenario E

# Scenario E gets its own random stream. Re-using the bare seed 20251123 (the
# Scenario D seed) makes the first two draws below, which have the same sizes
# as Scenario D's, come from the identical standard-normal sequence, so E's
# LF/HF noise would be a rescaled copy of D's. Seeding with
# [base seed, scenario ordinal] keeps the run reproducible but independent.
rng = np.random.default_rng([20251123, 5])  # 5 = fifth scenario (E)

# Very gentle noise for the healthy scenario.
lf_sigma = 8800         # Gaussian sigma for the slow component, in samples (seconds)
lf_amp   = 0.015        # target standard deviation of the slow component

hf_sd = 0.007           # standard deviation of the per-second jitter
row_sd = 0.003          # standard deviation of the per-row jitter added later

# Slow component: white noise, smoothed, then rescaled so its std equals lf_amp
# (the 1e-9 guards against division by zero).
lf_white = rng.normal(0, 1, E_window_secs)
lf_noise = gaussian_filter1d(lf_white, sigma=lf_sigma)
lf_noise = lf_noise / (np.std(lf_noise) + 1e-9) * lf_amp

# Fast component: independent normal jitter per second.
hf_noise = rng.normal(0, hf_sd, E_window_secs)

# Sum of both components, re-centred to zero mean.
combined_noise_E = lf_noise + hf_noise
combined_noise_E -= combined_noise_E.mean()

# Micro-blips: 1 to 3 short triangular spikes (the upper bound 4 is exclusive).
num_blips = rng.integers(1, 4)
blips = []
for _ in range(num_blips):
    s = int(rng.integers(0, E_window_secs - 60))
    d = int(rng.integers(10, 40))           # duration 10..39 seconds
    e = min(E_window_secs - 1, s + d)       # keep the end inside the window
    amp = float(rng.uniform(0.04, 0.12))
    blips.append((s, e, amp))

    # Triangle rising from 0 to 1 at the midpoint and back to 0.
    rel = np.linspace(0, 1, e - s + 1)
    tri = 1 - np.abs(2 * rel - 1)
    combined_noise_E[s:e+1] += tri * amp

print("LF std:", np.std(lf_noise))
print("HF std:", np.std(hf_noise))
print("Micro-blips:", blips)


In [None]:
# Cell 4 — Compose final series and update CSV rows for Scenario E

import numpy as np
import pandas as pd

# Per-second target series: trend plus noise, clamped into [0, 1].
final_E = (trend_E + combined_noise_E).clip(0, 1)

# The full CSV is loaded because rows are modified and later written back.
df = pd.read_csv(CSV_PATH, low_memory=False)
df["timestamp_parsed"] = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")

# Rebuild the strict mask on this frame: row label from the E start row through
# total_rows - 1, AND timestamp inside [E_start_ts, E_end_ts).
row_labels = df.index.to_series()
mask_index = (row_labels >= E_start_idx0) & (row_labels <= total_rows - 1)
mask_time = df["timestamp_parsed"].ge(E_start_ts) & df["timestamp_parsed"].lt(E_end_ts)
mask_E_strict = mask_index & mask_time

n_update = int(mask_E_strict.sum())
print("Updating Scenario E rows:", n_update)

if not n_update:
    raise RuntimeError("No Scenario E rows found!")

# Whole seconds elapsed since the window start, clipped so every offset is a
# valid index into final_E.
elapsed = df.loc[mask_E_strict, "timestamp_parsed"] - E_start_ts
offsets = elapsed.dt.total_seconds().astype(int)
offsets = offsets.clip(0, E_window_secs - 1).values

# Look up the series value for each row and add a small independent jitter.
vals = final_E[offsets] + rng.normal(0, row_sd, size=len(offsets))
vals = np.clip(vals, 0, 1)

# Write back only the strict rows; health is the complement of the score.
df.loc[mask_E_strict, "composite_score"] = vals
df.loc[mask_E_strict, "composite_health"] = 1 - vals

# Label logic
def classify(v):
    """Map a composite score to a ``(label, severity)`` pair.

    Severity is the number of thresholds (0.4 and 0.6) that ``v`` strictly
    exceeds: 0 -> "normal", 1 -> "suspicious", 2 -> "anomaly".
    """
    severity = int(v > 0.4) + int(v > 0.6)
    return ("normal", "suspicious", "anomaly")[severity], severity

# Classify every updated score, then split the (label, severity) pairs into two
# parallel tuples for column assignment.
labels = list(map(classify, vals))
lab, sev = zip(*labels)

df.loc[mask_E_strict, "anomaly_label"] = lab
df.loc[mask_E_strict, "anomaly_severity"] = sev

print("Scenario E composite_score/labels updated.")


In [None]:
# Cell 5 — Safe atomic write

import os

# Write to a sibling ".tmp" file first, then swap it over the original with
# os.replace so the CSV is never left half-written.
tmp = CSV_PATH.with_suffix(".tmp")
# The parsed-timestamp helper column is not written back to the CSV.
frame_without_helper = df.drop(columns=["timestamp_parsed"])
frame_without_helper.to_csv(tmp, index=False)
os.replace(tmp, CSV_PATH)

print("CSV updated in place:", CSV_PATH)


In [None]:
# Cell 6 — Compressed timeline plot including Scenario E

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load timestamp + score, parse timestamps as UTC and put rows in time order.
dfp = (
    pd.read_csv(CSV_PATH, usecols=["timestamp", "composite_score"])
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"], utc=True))
    .sort_values("timestamp")
    .reset_index(drop=True)
)

# Plot window: from the earliest timestamp up to (not including) the end of the
# Scenario E window.
start_ts, end_ts = dfp["timestamp"].min(), E_end_ts

mask = dfp["timestamp"].ge(start_ts) & dfp["timestamp"].lt(end_ts)
dfp = dfp[mask].copy()

# Compress the timeline by removing every gap longer than 30 minutes.
ts = dfp["timestamp"]
# Time since the previous row (0 for the first row).
diffs = ts.diff().fillna(pd.Timedelta(seconds=0))
# A row starts a new segment when the gap before it exceeds 30 minutes.
seg_start = diffs > pd.Timedelta(minutes=30)
seg_id = seg_start.cumsum()

# The gap in front of a segment is exactly the diff at its first row, so the
# shift for any row is the running sum of those gaps. This vectorized form
# replaces the per-segment loop and also copes with an empty selection.
shifts = diffs.where(seg_start, pd.Timedelta(0)).cumsum()

dfp["shifted_ts"] = dfp["timestamp"] - shifts.values
dfp = dfp.set_index("shifted_ts")

# Resample the compressed series into 0.31-minute (18.6 s) bins, fill empty
# bins by interpolation, then smooth with a 60-bin rolling mean. With this bin
# width the rolling window spans about 18.6 minutes, not one hour.
# "min" replaces the deprecated "T" minute alias; the bin width is unchanged.
res = dfp["composite_score"].resample("0.31min").mean().interpolate()
hr = res.rolling(60, min_periods=1).mean()

# Raw points plus the smoothed line.
plt.figure(figsize=(18,6))
plt.scatter(dfp.index, dfp["composite_score"], s=4, alpha=0.18)
plt.plot(hr.index, hr.values, linewidth=2.0, color="orange")

plt.title("Composite Score — Scenarios A + B + C + D + E (Compressed Timeline)")
plt.xlabel("Compressed time")
plt.ylabel("Composite Score")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
