# Detection with NCD
This notebook demonstrates compression-based lightning detection using **Normalised Compression Distance** (NCD). We also compare it against a simple amplitude-threshold baseline.

In [None]:
import json, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
from pandas import Series
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from tqdm import tqdm
import seaborn as sns
from leela_ml.datamodules_npy import StrikeDataset
from leela_ml.ncd import ncd_adjacent, ncd_first


## 1. Generate synthetic data

In [None]:
from leela_ml.signal_sim.simulator import simulate

# Run the simulator with a fixed seed; it is handed the prefix below and the
# notebook then reads "<prefix>_LON.npy" and "<prefix>_meta.json".
# NOTE(review): the first positional argument is presumably the duration in
# minutes - confirm against the simulator signature.
out_prefix = Path('data/demo')
simulate(1, str(out_prefix), seed=0)

# Build both file names from the same prefix (POSIX form, so the strings are
# 'data/demo_LON.npy' and 'data/demo_meta.json' on every platform).
_prefix = out_prefix.as_posix()
npy = f'{_prefix}_LON.npy'
meta = f'{_prefix}_meta.json'


## 2. Load dataset

In [None]:
# Build the windowed dataset and pull out the arrays the detectors need.
ds = StrikeDataset(npy, meta, chunk_size=512, overlap=0.9)

# NOTE(review): `_windows` is a private attribute of StrikeDataset; prefer a
# public accessor if one exists.
win = ds._windows.astype(np.float32, copy=False)   # per-window samples
lab = ds.labels.astype(bool)                       # per-window ground truth
fs, hop = ds.fs, ds.hop

print("windows", ds.n_win, "positives", int(lab.sum()))


## 3. NCD computation

In [None]:
# NCD score per window as returned by ncd_adjacent.
err = ncd_adjacent(win, per_win_norm=True)

# Length (in windows) of the rolling statistics: 0.01 * fs / hop, at least 1.
win_len = max(1, int(0.01 * fs / hop))

# Adaptive threshold = rolling median + 6 x rolling MAD (centred windows).
_roll = Series(err).rolling(win_len, center=True, min_periods=1)
_med = _roll.median()
_mad = _roll.apply(lambda v: np.median(np.abs(v - np.median(v))), raw=True)
thr = _med + 6 * _mad
mask = err > thr.values

# Window-level scores against the ground-truth labels.
P, R, F, _ = precision_recall_fscore_support(lab, mask, average='binary')
tn, fp, fn, tp = confusion_matrix(lab, mask).ravel()
metrics_ncd = dict(
    P=float(P), R=float(R), F1=float(F),
    TP=int(tp), FP=int(fp), FN=int(fn), TN=int(tn),
)


In [None]:
# NCD of every window against the window at index 0 (ncd_first).
err_first = ncd_first(win, baseline_idx=0, per_win_norm=True)

# Same adaptive rule as for the adjacent-window score:
# rolling median + 6 x rolling MAD over `win_len` windows.
_roll_first = Series(err_first).rolling(win_len, center=True, min_periods=1)
_med_first = _roll_first.median()
_mad_first = _roll_first.apply(lambda v: np.median(np.abs(v - np.median(v))), raw=True)
thr_first = _med_first + 6 * _mad_first
mask_first = err_first > thr_first.values

P1, R1, F1, _ = precision_recall_fscore_support(lab, mask_first, average='binary')
tn, fp, fn, tp = confusion_matrix(lab, mask_first).ravel()
metrics_first = dict(
    P=float(P1), R=float(R1), F1=float(F1),
    TP=int(tp), FP=int(fp), FN=int(fn), TN=int(tn),
)


## 4. Simple amplitude threshold baseline

In [None]:
# Baseline detector: RMS amplitude of each window.
amp = np.sqrt((win**2).mean(axis=1))

# Threshold it with the same rolling median + 6 x MAD rule as the NCD scores.
_roll_amp = Series(amp).rolling(win_len, center=True, min_periods=1)
_med_amp = _roll_amp.median()
_mad_amp = _roll_amp.apply(lambda v: np.median(np.abs(v - np.median(v))), raw=True)
thr_amp = _med_amp + 6 * _mad_amp
mask_amp = amp > thr_amp.values

Pa, Ra, Fa, _ = precision_recall_fscore_support(lab, mask_amp, average='binary')
tn, fp, fn, tp = confusion_matrix(lab, mask_amp).ravel()
metrics_amp = dict(
    P=float(Pa), R=float(Ra), F1=float(Fa),
    TP=int(tp), FP=int(fp), FN=int(fn), TN=int(tn),
)


## 5. Compare

In [None]:
# Report every detector computed above. The first-window NCD variant
# (`metrics_first`) was calculated but previously never shown.
print('NCD metrics', metrics_ncd)
print('NCD (first-window baseline) metrics', metrics_first)
print('Amplitude metrics', metrics_amp)


### Plot NCD score and threshold

In [None]:
# Overlay the adjacent-window NCD score and its adaptive threshold.
plt.figure(figsize=(15, 4))
plt.plot(err, lw=0.4, label='NCD')
plt.plot(thr, '--', lw=0.8, label='threshold')
plt.legend()
plt.title('NCD curve')


In [None]:
import numpy as np, zlib, pandas as pd
from tqdm import tqdm
from sklearn.ensemble import IsolationForest   # kept for comparison
from scipy.signal import welch
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from sklearn.preprocessing import RobustScaler

# NOTE: this cell rebinds `fs`, `hop`, `win`, `labels`, `thr`, `P`, `R`, `F`
# (among others); re-run the earlier cells before reusing their values.

# --- 1. simulate 60 s @ 100 kHz with 20 flashes ------------------------------
fs, dur, n_flashes = 100_000, 60, 20
N = fs * dur
flash_len = int(0.003 * fs)              # 3 ms
np.random.seed(42)
signal = 0.2 * np.random.randn(N).astype(np.float32)
labels = np.zeros(N, bool)
starts = np.sort(np.random.choice(N - flash_len, n_flashes, replace=False))

# Every flash uses the same waveform (decaying 4 kHz cosine), so build it once.
t = np.arange(flash_len) / fs
_burst = np.exp(-t / 0.001) * np.cos(2 * np.pi * 4e3 * t)
for idx in starts:
    signal[idx:idx + flash_len] += _burst
    labels[idx:idx + flash_len] = True

# --- 2. windowing -------------------------------------------------------------
win, hop = 1024, 256                     # 75 % overlap
n_win = (N - win) // hop + 1
# A window is positive when any of its samples lies inside a flash.
win_lab = np.array([labels[s:s + win].any() for s in range(0, n_win * hop, hop)])

# --- 3. STA / LTA ratio per window --------------------------------------------
abs_sig = np.abs(signal)
_sta_len, _lta_len = int(0.002 * fs), int(0.05 * fs)   # 2 ms / 50 ms kernels
sta = np.convolve(abs_sig, np.ones(_sta_len) / _sta_len, mode='same')
lta = np.convolve(abs_sig, np.ones(_lta_len) / _lta_len, mode='same') + 1e-6
sta_lta = sta / lta
# Per window, keep the maximum STA/LTA value found inside it.
ratio_win = np.array([sta_lta[s:s + win].max() for s in range(0, n_win * hop, hop)])

# --- 4. threshold: k standard deviations above the mean ratio -----------------
k = 6                                    # tuned once; still unsupervised
thr = ratio_win.mean() + k * ratio_win.std()
pred_win = ratio_win > thr               # boolean per window

# --- 5. window-level metrics ---------------------------------------------------
P, R, F, _ = precision_recall_fscore_support(win_lab, pred_win, average='binary')
tn, fp, fn, tp = confusion_matrix(win_lab, pred_win).ravel()
window_metrics = dict(
    P=float(P), R=float(R), F1=float(F),
    TP=int(tp), FP=int(fp), FN=int(fn), TN=int(tn),
)
# Author-reported result (not re-verified here):
# {'P': 0.92, 'R': 0.92, 'F1': 0.92, TP=93, FP=8, FN=8, TN=23 325}

# ─── 6. event‑level scoring (merge consecutive detections) ──────────────────
def windows_to_events(flags):
    """Collapse a per-window flag sequence into runs of consecutive hits.

    Args:
        flags: iterable of truthy/falsy values, one per window.

    Returns:
        List of ``(first_index, last_index)`` tuples (inclusive), one per
        maximal run of truthy flags, in order of appearance.
    """
    events = []
    run_start = None
    last_pos = -1
    for pos, active in enumerate(flags):
        last_pos = pos
        if active:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            # The run ended on the previous index.
            events.append((run_start, pos - 1))
            run_start = None
    if run_start is not None:
        # Sequence ended while still inside a run.
        events.append((run_start, last_pos))
    return events

# Detected events: runs of consecutive flagged windows.
det_evt  = windows_to_events(pred_win)
# Ground-truth events as (first_window, last_window) index bounds per flash.
# NOTE(review): these bounds are coarse - windows are `win` samples long, so
# windows starting more than one hop before `idx` can also contain the flash.
true_evt = [(max(0,(idx-hop)//hop), min(n_win-1,(idx+flash_len)//hop))
            for idx in starts]

# A true event is found when any detection overlaps it; a detection is a
# false positive when it overlaps no true event.
tp_e=sum(any(not(de<gs or ds>ge) for ds,de in det_evt) for gs,ge in true_evt)
fn_e=len(true_evt)-tp_e
fp_e=sum(not any(not(de<gs or ds>ge) for gs,ge in true_evt) for ds,de in det_evt)

# Guard every ratio: with no detections (or no events) the denominators are 0
# and the unguarded divisions would raise ZeroDivisionError.
event_metrics = dict(P=tp_e/(tp_e+fp_e) if tp_e+fp_e else 0.0,
                     R=tp_e/(tp_e+fn_e) if tp_e+fn_e else 0.0,
                     F1=2*tp_e/max(1,tp_e*2+fp_e+fn_e),
                     TP=tp_e, FP=fp_e, FN=fn_e)
# Author-reported result (not re-verified here):
# {'P': 0.91, 'R': 1.00, 'F1': 0.95, TP=20, FP=2, FN=0}


In [None]:
# Show the event-level metrics dict as the cell output.
event_metrics

In [None]:
# If you don't already have them:
# %pip install numpy pandas scikit-learn tqdm zstandard --quiet

import json, zlib, datetime
from pathlib import Path
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from leela_ml.signal_sim.simulator import simulate          # ← your code


In [None]:
DATA_ROOT   = Path("data/demo_run")
OUT_PREFIX  = DATA_ROOT / "demo"                             # => demo_LON.npy / demo_meta.json

# Files the simulator is expected to write next to the prefix.
_lon_path  = OUT_PREFIX.with_name(OUT_PREFIX.name + "_LON.npy")
_meta_path = OUT_PREFIX.with_name(OUT_PREFIX.name + "_meta.json")

# Only simulate when the trace is not on disk yet.
if not _lon_path.exists():
    simulate(minutes=1, out_prefix=str(OUT_PREFIX), seed=42)
else:
    print("Using already‑generated files")

# Read the metadata through a context manager so the handle is closed
# (the bare `json.load(open(...))` left it open).
with open(_meta_path, encoding="utf-8") as _fh:
    meta = json.load(_fh)
FS    = meta["fs"]
trace = np.load(_lon_path)                                   # 1-D float32 array
print(f"Loaded {trace.shape[0]/FS:,.1f} s of data @ {FS:,} Hz")


In [None]:
from math import radians, sin, cos, asin, sqrt


def hav_km(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in km between two lat/lon points in degrees."""
    R=6371
    dlat, dlon = map(radians, (lat2-lat1, lon2-lon1))
    a = sin(dlat/2)**2 + cos(radians(lat1))*cos(radians(lat2))*sin(dlon/2)**2
    return 2*R*asin(sqrt(a))


# derive sample-level boolean array "labels" (True = flash present)
labels = np.zeros_like(trace, dtype=bool)

# Loop-invariant values, hoisted out of the per-event loop: this station's
# entry in meta["stations"] and the burst length in samples.
st  = next(s for s in meta["stations"] if s["id"] == "LON")
dur = int(0.04 * FS)                                  # simulator uses 40 ms bursts

for ev in meta["events"]:
    # same distance / delay calc as simulator
    dist_km = hav_km(ev["lat"], ev["lon"], st["lat"], st["lon"])
    delay   = dist_km / 3e5                           # C ~ 3e5 km/s
    i0      = int((ev["t"] + delay) * FS)             # arrival sample index
    labels[i0 : i0+dur] = True

n_pos = labels.sum()
print(f"Ground truth: {n_pos:,} positive samples "
      f"({n_pos/len(labels)*100:.3f} %)")


In [None]:
# ---------- windowing ----------
WIN, HOP = 1024, 256
n_win    = (len(trace) - WIN) // HOP + 1
abs_sig  = np.abs(trace)

# STA/LTA ratio: short (2 ms) over long (50 ms) moving average of |trace|.
# The small constant keeps the division finite where the long average is 0.
_sta_len = int(0.002 * FS)
_lta_len = int(0.05 * FS)
sta = np.convolve(abs_sig, np.ones(_sta_len) / _sta_len, mode='same')
lta = np.convolve(abs_sig, np.ones(_lta_len) / _lta_len, mode='same') + 1e-6
sta_lta = sta / lta

def comp_len(arr: np.ndarray) -> int:
    """Return the zlib (level 3) compressed size in bytes of `arr`.

    The samples are scaled by 32767 and cast to int16 before compression.
    """
    quantised = (arr * 32767).astype(np.int16)
    packed = zlib.compress(quantised.tobytes(), 3)
    return len(packed)

# Four features per window; only columns 0 and 1 are fed to the Isolation
# Forest below, columns 2 and 3 are computed but unused in this cell.
features = np.zeros((n_win, 4), np.float32)
for i in tqdm(range(n_win), desc="Extracting features", ncols=72):
    s = i*HOP
    w = trace[s:s+WIN]
    features[i,0] = sta_lta[s:s+WIN].max()        # STA/LTA peak
    features[i,1] = np.sqrt(np.mean(w**2))        # RMS
    features[i,2] = np.log(np.var(w)+1e-7)        # log-variance
    features[i,3] = comp_len(w)                   # compressed size (entropy proxy)

# A window is positive when any of its samples is labelled.
win_truth = np.array([labels[i*HOP:i*HOP+WIN].any() for i in range(n_win)])

# ---------- Isolation Forest ----------
# NOTE(review): the contamination rate is derived from the ground-truth
# positive rate, so this detector is not strictly unsupervised.
contamination = max(1/n_win, win_truth.mean()*1.2)
iso = IsolationForest(n_estimators=200, contamination=contamination, random_state=0, n_jobs=1)
X   = RobustScaler().fit_transform(features[:, :2])   # first 2 features -> fast
iso.fit(X)
mask_iso = iso.predict(X) == -1                      # -1 = anomaly

# ---------- STA/LTA guard ----------
# Keep a window only when the forest flags it AND its STA/LTA peak exceeds
# mean + 5 standard deviations.
sta_thr   = features[:,0].mean() + 5*features[:,0].std()
mask_fin  = mask_iso & (features[:,0] > sta_thr)     # AND => high precision

# ---------- metrics ----------
P,R,F,_     = precision_recall_fscore_support(win_truth, mask_fin, average='binary')
tn,fp,fn,tp = confusion_matrix(win_truth, mask_fin).ravel()
print(f"WINDOW P={P:.3f}  R={R:.3f}  F1={F:.3f}  (TP={tp}, FP={fp}, FN={fn})")

def group_runs(flags):
    """Group consecutive truthy flags into inclusive index ranges.

    Args:
        flags: iterable of truthy/falsy values.

    Returns:
        List of ``(start, end)`` tuples, one per maximal run of truthy
        values, with both indices inclusive.
    """
    runs = []
    opened_at = None
    final = -1
    for final, flag in enumerate(flags):
        if flag and opened_at is None:
            opened_at = final
        elif not flag and opened_at is not None:
            runs.append((opened_at, final - 1))
            opened_at = None
    if opened_at is not None:
        # Still inside a run when the input ended.
        runs.append((opened_at, final))
    return runs

# Detected flashes: runs of consecutive flagged windows.
det_evt  = group_runs(mask_fin)
# NOTE(review): these bounds use the emission time ev["t"] without the
# propagation delay that the sample labels include, and are coarse by design.
true_evt = [(int((ev["t"]*FS - HOP)//HOP),  # coarse bounds per event
             int(((ev["t"]+0.04)*FS)//HOP)) for ev in meta["events"]]

# Overlap-based matching: any overlap between a detection and a true span counts.
TPe = sum(any(not(de<gs or ds>ge) for ds,de in det_evt) for gs,ge in true_evt)
FNe = len(true_evt)-TPe
FPe = sum(not any(not(de<gs or ds>ge) for gs,ge in true_evt) for ds,de in det_evt)

# Guard the ratios: with no detections and/or no events the denominators are
# zero and the unguarded f-string would raise ZeroDivisionError.
_Pe = TPe/(TPe+FPe) if TPe+FPe else 0.0
_Re = TPe/(TPe+FNe) if TPe+FNe else 0.0
_Fe = 2*TPe/(2*TPe+FPe+FNe) if TPe+FPe+FNe else 0.0
print(f"FLASH  P={_Pe:.3f}  R={_Re:.3f} "
      f"F1={_Fe:.3f}  (TP={TPe}, FP={FPe}, FN={FNe})")


In [None]:
"""
ROBUST UNSUPERVISED  DETECTOR  – 3 min / 100 kHz / single station
---------------------------------------------------------------------------

Improvements vs. previous cell
* UNION of STA/LTA and Isolation‑Forest → recall ↑
* Extra crest‑factor gate to tame false positives
* ≥1‑sample overlap counts as a detected flash (fixes all‑zero issue)
* Fixed random seed for reproducibility
"""

# ─── Imports ──────────────────────────────────────────────────────────
import numpy as np, zlib, math, warnings, datetime
from math import radians, sin, cos, asin, sqrt
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

warnings.filterwarnings("ignore")

# --- Parameters ---------------------------------------------------------------
# FS: sampling rate (Hz); MIN: simulated duration in minutes;
# WIN / HOP: window length and hop in samples.
# NOTE(review): the cell header and the split comment further down describe a
# 3-minute trace (0-90 / 90-135 / 135-180 s), but MIN = 20 simulates
# 20 minutes, so the "test" slice runs from 135 s to the end - confirm.
FS, MIN, WIN, HOP = 100_000, 20, 1024, 256
SEED = 424242                      # fixed => reproducible
CONTAM = 0.02                      # contamination passed to IsolationForest (2 %)

# Feature-fusion thresholds (author note: tuned once, robust across seeds)
GRID_IF_PERC = 88                  # IF gate: score above this percentile of validation scores
STA_K        = 3.5                 # STA/LTA gate: peak > mean + STA_K * std
CF_MIN       = 6.0                 # crest factor must exceed this

# --- 1. Simulator (shortened version) -------------------------------------------
# Single receiving station; lat/lon in degrees.
STATIONS=[dict(id="LON",lat=51.5072,lon=-0.1276)]
def hav_km(a, b, c, d):
    """Haversine distance in km between (lat=a, lon=b) and (lat=c, lon=d).

    All four arguments are in degrees; the Earth radius used is 6371 km.
    """
    earth_radius_km = 6371
    half_dlat = radians(c - a) / 2
    half_dlon = radians(d - b) / 2
    hav = (math.sin(half_dlat)**2
           + math.cos(radians(a)) * math.cos(radians(c)) * math.sin(half_dlon)**2)
    return 2 * earth_radius_km * asin(math.sqrt(hav))

def make_noise(rng, N, t):
    """Background noise: Gaussian noise plus four low-frequency sinusoids.

    Args:
        rng: generator exposing ``normal(mean, std, size)``.
        N: number of samples to draw.
        t: time axis the sinusoids are evaluated on (must broadcast with N).

    Returns:
        float32 array holding the sum of the noise and the sinusoids.
    """
    # (frequency, amplitude) of each additive sinusoid.
    tones = ((50, 0.002), (62, 0.001), (38, 0.001), (25, 0.0015))
    noise = rng.normal(0, 0.003, N)
    for freq, amp in tones:
        noise += amp * np.sin(2 * np.pi * freq * t)
    return noise.astype("f4")

def simulate(seed=0):
    """Generate a synthetic single-station trace with injected bursts.

    NOTE: this definition shadows the `simulate` imported from
    `leela_ml.signal_sim.simulator` in earlier cells and has a different
    signature; re-import before re-running those cells.

    Args:
        seed: seed passed to ``np.random.default_rng``.

    Returns:
        Tuple ``(trace, events)``: the "LON" waveform (FS*60*MIN samples, as
        produced by ``make_noise`` plus bursts) and a list of dicts with keys
        ``t``, ``lat`` and ``lon``, one per generated event.
    """
    rng=np.random.default_rng(seed)
    N=FS*60*MIN; t=np.arange(N)/FS
    # One noise trace per station.
    waves={s["id"]:make_noise(rng,N,t) for s in STATIONS}
    events=[]
    # Cluster base times, evenly spread over the trace (3 per minute).
    base=np.linspace(10,60*MIN-10,3*MIN)
    # Per cluster: (name, distance range in km, range for the number of flashes).
    specs=[("near",(20,50),(8,12)),("mid",(100,200),(5,9)),("far",(400,600),(3,6))]*MIN
    for base_t,(name,d_rng,nf_rng) in zip(base,specs):
        for _ in range(rng.integers(*nf_rng)):
            et=base_t+rng.uniform(0,2)                    # event time within 2 s of the base
            d,bearing=rng.uniform(*d_rng),rng.uniform(0,2*np.pi)
            # Place the event d km from (50 N, 0 E) along `bearing` (about 111 km per degree).
            lat=50+(d/111)*cos(bearing)
            lon= 0+(d/111)*sin(bearing)/cos(radians(lat))
            amp,freq=rng.uniform(0.5,1)/(1+d/50),rng.uniform(3e3,9e3)
            # The event is recorded before the bounds check below, so the list
            # may contain events that start past the end of the trace.
            events.append(dict(t=float(et),lat=float(lat),lon=float(lon)))
            for st in STATIONS:
                dist=hav_km(lat,lon,st["lat"],st["lon"]); delay=dist/3e5   # propagation at 3e5 km/s
                i0=int((et+delay)*FS); dur=int(0.04*FS)                    # arrival index, 40 ms burst
                if i0>=N: continue
                subt=np.arange(dur)/FS
                # Sine burst with a 3 ms exponential decay, attenuated with distance.
                burst=amp*np.sin(2*np.pi*freq*subt)*np.exp(-subt/0.003)/(1+dist/50)
                waves[st["id"]][i0:i0+dur]+=burst
    return waves["LON"],events

sig, evts = simulate(SEED)
N = len(sig)

# Sample-level truth: mark 40 ms from each event's arrival at the station
# (emission time plus propagation delay at 3e5 km/s).
truth = np.zeros(N, bool)
_station = STATIONS[0]
_burst_len = int(0.04 * FS)
for e in evts:
    delay = hav_km(e["lat"], e["lon"], _station["lat"], _station["lon"]) / 3e5
    i0 = int((e["t"] + delay) * FS)
    truth[i0:i0 + _burst_len] = True

# --- 2. Features (STA/LTA etc.) -------------------------------------------------
# Short (2 ms) over long (50 ms) moving average of the rectified signal.
abs_sig = np.abs(sig)
_sta_len, _lta_len = int(0.002 * FS), int(0.05 * FS)
sta = np.convolve(abs_sig, np.ones(_sta_len) / _sta_len, mode='same')
lta = np.convolve(abs_sig, np.ones(_lta_len) / _lta_len, mode='same') + 1e-9
sta_lta = sta / lta

def crest(x):
    """Crest factor of `x`: peak absolute value divided by RMS (+1e-9 to avoid 0/0)."""
    peak = np.max(np.abs(x))
    rms = np.sqrt(np.mean(x**2))
    return peak / (rms + 1e-9)
def comp_len(x):
    """zlib (level 3) compressed byte length of `x` after scaling to int16."""
    as_int16 = (x * 32767).astype(np.int16)
    return len(zlib.compress(as_int16.tobytes(), 3))

nwin = (N - WIN) // HOP + 1
# Columns: [STA/LTA peak, RMS, log-variance, crest factor, compressed length]
feat = np.zeros((nwin, 5), np.float32)
for i in tqdm(range(nwin), desc="feat", ncols=70):
    s = i * HOP
    w = sig[s:s + WIN]
    sta_peak = sta_lta[s:s + WIN].max()
    rms = np.sqrt(np.mean(w**2))
    log_var = math.log(np.var(w) + 1e-7)
    feat[i] = [sta_peak, rms, log_var, crest(w), comp_len(w)]

# A window is positive when any of its samples is marked in `truth`.
win_truth = np.array([truth[s:s + WIN].any() for s in range(0, nwin * HOP, HOP)])


# --- 3. Train 0-90 s / Validate 90-135 s / Test 135 s-end ------------------------
def idx_sec(t):
    """Index of the last window that ends at or before `t` seconds."""
    return int((t * FS - WIN) // HOP)


tr = slice(0, idx_sec(90))
vl = slice(idx_sec(90), idx_sec(135))
te = slice(idx_sec(135), nwin)

# Scaler and forest are fitted on the training slice only, then every window
# is scored (negated decision function => higher means more anomalous).
sc = RobustScaler().fit(feat[tr])
iso = IsolationForest(n_estimators=300, contamination=CONTAM, random_state=SEED)
iso.fit(sc.transform(feat[tr]))
score = -iso.decision_function(sc.transform(feat))

# decision thresholds
# NOTE(review): the STA/LTA mean and std are taken over all windows, test
# slice included.
sta_mu = feat[:, 0].mean()
sta_sd = feat[:, 0].std()
if_score_gate = score > np.percentile(score[vl], GRID_IF_PERC)
sta_gate = feat[:, 0] > sta_mu + STA_K * sta_sd
cf_gate = feat[:, 3] > CF_MIN

# Logically: STA/LTA gate alone, or Isolation-Forest gate together with the
# crest-factor gate.
mask = (sta_gate | if_score_gate) & (sta_gate | cf_gate)

# ─── 4. Metrics ───────────────────────────────────────────────────────
def win_m(flag,truth):
    """Window-level precision/recall/F1 and confusion counts.

    Args:
        flag: boolean predictions, one per window.
        truth: boolean ground truth, same length as `flag`.

    Returns:
        Dict with P, R, F1 (rounded to 3 decimals) and integer TP, FP, FN, TN.
    """
    P,R,F,_=precision_recall_fscore_support(truth,flag,average='binary')
    # Pin the label set: without it, a slice that contains a single class
    # yields a 1x1 matrix and the 4-way unpacking raises ValueError.
    tn,fp,fn,tp=confusion_matrix(truth,flag,labels=[False,True]).ravel()
    return dict(P=round(P,3),R=round(R,3),F1=round(F,3),
                TP=int(tp),FP=int(fp),FN=int(fn),TN=int(tn))

# Window-level scores for each split.
print("WINDOW metrics")
for name, sl in (("train", tr), ("val", vl), ("test", te)):
    print(f" {name:<5}", win_m(mask[sl], win_truth[sl]))

# event scorer (≥1 win overlap)
def runs(flags):
    """Return inclusive ``(start, end)`` index pairs of consecutive truthy flags."""
    out = []
    begin = None
    for pos, hit in enumerate(flags):
        if hit:
            if begin is None:
                begin = pos
            continue
        if begin is not None:
            out.append((begin, pos - 1))
            begin = None
    if begin is not None:
        # Input ended inside a run; it extends to the last index.
        out.append((begin, pos))
    return out

def flash_m(flag, truth):
    """Event-level precision/recall/F1 using overlap matching.

    Both inputs are per-window boolean sequences; each is collapsed into runs
    with `runs`. A true run counts as found when any detected run overlaps it,
    and a detected run is a false positive when it overlaps no true run.

    Returns:
        Dict with P, R, F1 (rounded to 3 decimals) and the TP, FP, FN counts.
    """
    det = runs(flag)
    runs_truth = runs(truth)

    def overlaps(d, t):
        return not (d[1] < t[0] or d[0] > t[1])

    tp = sum(any(overlaps(d, t) for d in det) for t in runs_truth)
    fn = len(runs_truth) - tp
    fp = sum(not any(overlaps(d, t) for t in runs_truth) for d in det)

    P = tp / (tp + fp) if tp + fp else 0
    R = tp / (tp + fn) if tp + fn else 0
    F = 2 * tp / (2 * tp + fp + fn) if tp + fp + fn else 0
    return dict(P=round(P, 3), R=round(R, 3), F1=round(F, 3), TP=tp, FP=fp, FN=fn)

# Event-level scores for each split.
print("\nFLASH metrics")
for name, sl in (("train", tr), ("val", vl), ("test", te)):
    print(f" {name:<5}", flash_m(mask[sl], win_truth[sl]))


In [None]:
"""
Flash Detector – Extended Isolation‑Forest (isotree, robust call)
===========================================================================

* Tested with isotree‑0.3.x, 0.4.x and 0.5.x.
* Uses 5‑D features + STA/LTA & crest guard + validation grid search.
* Typical seed 424242 results on 3‑min trace:

      best grid on val: IF>82p  STA>(μ+3.0σ)  CF>4.0
      TRAIN FLASH  P≈0.97  R≈0.93  F1≈0.95
      VAL   FLASH  P≈1.00  R≈0.93  F1≈0.96
      TEST  FLASH  P≈0.96  R≈0.93  F1≈0.94
"""

# ───────── imports ──────────────────────────────────────────────────
import os, numpy as np, zlib, math, warnings, inspect
from math import radians, sin, cos, asin, sqrt
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from isotree import IsolationForest                    # C++/OpenMP EIF
warnings.filterwarnings("ignore")

# --------- parameters -----------------------------------------------------------
# FS: sampling rate (Hz); MINUTES: simulated duration; WIN / HOP: window
# length and hop in samples.
# NOTE(review): the cell header mentions a 3-minute trace, but MINUTES = 10 and
# the splits below (300 s / 450 s) match a 10-minute trace - confirm.
FS, MINUTES, WIN, HOP = 100_000, 10, 1024, 256
SEED       = 424242
NTREES     = 400                          # trees in the isolation forest
SAMPLE_SZ  = 256                          # sub-sample size per tree
GRID_IF    = np.arange(70, 96, 2)         # candidate score percentiles (70..94)
GRID_K     = np.arange(2.0, 6.1, 0.5)     # candidate K for STA/LTA > mean + K*std
GRID_CF    = np.arange(3.0, 7.1, 0.5)     # candidate crest-factor minimums

# ───────── helpers ─────────────────────────────────────────────────
def hav_km(a, b, c, d):
    """Great-circle (haversine) distance in km; inputs are lat/lon pairs in degrees."""
    radius_km = 6371
    dlat = radians(c - a)
    dlon = radians(d - b)
    inner = sin(dlat/2)**2 + cos(radians(a))*cos(radians(c))*sin(dlon/2)**2
    return 2 * radius_km * asin(math.sqrt(inner))
def crest(x):
    """Peak-to-RMS ratio of `x`; 1e-9 is added to the RMS to avoid dividing by zero."""
    magnitude = np.abs(x)
    root_mean_sq = np.sqrt(np.mean(x**2))
    return np.max(magnitude) / (root_mean_sq + 1e-9)
def entropy(x):
    """Entropy proxy: zlib (level 3) compressed size of `x` scaled to int16."""
    raw = (x * 32767).astype(np.int16).tobytes()
    return len(zlib.compress(raw, 3))
def spans(mask):
    """Return ``(first, last)`` inclusive index pairs for each run of truthy values."""
    found = []
    first = None
    idx_last = -1
    for idx_last, value in enumerate(mask):
        if value:
            if first is None:
                first = idx_last
        elif first is not None:
            found.append((first, idx_last - 1))
            first = None
    if first is not None:
        # Run reaches the end of the mask.
        found.append((first, idx_last))
    return found

# --------- synthetic trace (MINUTES long) --------------------------------------
# The order of the rng calls below defines the generated data; keep it as is.
rng=np.random.default_rng(SEED)
ST = dict(lat=51.5072, lon=-0.1276)            # receiving station (degrees)
N=FS*60*MINUTES; t=np.arange(N)/FS
# Background: Gaussian noise plus four low-frequency sinusoids.
sig=rng.normal(0,0.003,N).astype("f4")
for f,a in [(50,0.002),(62,0.001),(38,0.001),(25,0.0015)]:
    sig += a*np.sin(2*np.pi*f*t)
events=[]
# Base times evenly spread over the trace; at each, three distance bands
# (distance range in km, upper bound for the flash count).
for base in np.linspace(10,60*MINUTES-10,3*MINUTES):
    for d_rng,nf in [((20,50),10),((100,200),6),((400,600),4)]:
        for _ in range(rng.integers(int(nf*0.8),nf)):
            et=base+rng.uniform(0,2)                       # event time within 2 s of base
            d,b = rng.uniform(*d_rng), rng.uniform(0,2*np.pi)
            # Position d km from (50 N, 0 E) along bearing b (about 111 km per degree).
            lat = 50+(d/111)*cos(b)
            lon = 0 +(d/111)*sin(b)/cos(radians(lat))
            amp,freq = rng.uniform(.5,1)/(1+d/50), rng.uniform(3e3,9e3)
            dist=hav_km(lat,lon,ST["lat"],ST["lon"]); delay=dist/3e5   # propagation at 3e5 km/s
            i0  = int((et+delay)*FS); dur=int(.04*FS)                  # arrival index, 40 ms burst
            if i0>=N: continue
            # Sine burst with a 3 ms exponential decay, attenuated with distance.
            burst=amp*np.sin(2*np.pi*freq*np.arange(dur)/FS)*\
                  np.exp(-np.arange(dur)/FS/.003)/(1+dist/50)
            sig[i0:i0+dur]+=burst; events.append((et,lat,lon))

# Sample-level truth: 40 ms from each event's arrival at the station.
truth=np.zeros(N,bool)
for et,lat,lon in events:
    i0=int((et+hav_km(lat,lon,ST["lat"],ST["lon"])/3e5)*FS)
    truth[i0:i0+int(.04*FS)]=True

# --------- feature extraction (5-D) ----------------------------------------------
# STA/LTA: 2 ms over 50 ms moving average of the rectified signal.
abs_sig = np.abs(sig)
_sta_len, _lta_len = int(.002*FS), int(.05*FS)
sta = np.convolve(abs_sig, np.ones(_sta_len)/_sta_len, 'same')
lta = np.convolve(abs_sig, np.ones(_lta_len)/_lta_len, 'same') + 1e-9
sta_lta = sta / lta

nwin = (N - WIN)//HOP + 1
# Columns: [STA/LTA peak, RMS, log-variance, crest factor, compressed length]
feat = np.zeros((nwin, 5), np.float32)
for i in tqdm(range(nwin), desc="features", ncols=70):
    s = i*HOP
    w = sig[s:s+WIN]
    sta_peak = sta_lta[s:s+WIN].max()
    rms = np.sqrt(np.mean(w**2))
    log_var = math.log(np.var(w) + 1e-7)
    feat[i] = [sta_peak, rms, log_var, crest(w), entropy(w)]

# A window is positive when any of its samples is marked in `truth`.
win_truth = np.array([truth[s:s+WIN].any() for s in range(0, nwin*HOP, HOP)])


# --------- splits: train 0-300 s / validation 300-450 s / test 450 s-end --------
def idx(s):
    """Index of the last window that ends at or before `s` seconds."""
    return int(((s*FS) - WIN)//HOP)


TR = slice(0, idx(300))
VL = slice(idx(300), idx(450))
TE = slice(idx(450), nwin)

# --------- Extended Isolation-Forest (isotree) -----------------------------------
# Scaler and forest are fitted on the training slice only; `X` holds the
# scaled features of every window for scoring.
sc  = RobustScaler().fit(feat[TR])
Xtr = sc.transform(feat[TR]); X = sc.transform(feat)
eif = IsolationForest(
        ntrees       = NTREES,
        sample_size  = SAMPLE_SZ,
        ndim         = 2,                 # random hyper-planes => EIF
        nthreads     = os.cpu_count(),    # value reported by os.cpu_count()
        random_seed  = SEED)
eif.fit(Xtr)

# --- robust score extraction (handles all API variants) --------------
def get_scores(model, X):
    """Return anomaly scores from `model.predict`, whatever its keyword is called.

    The signature of ``model.predict`` is inspected and the first matching
    keyword among ``output``, ``output_type`` and ``type`` is set to
    ``"score"``. ``output`` is the keyword this notebook itself uses with
    isotree elsewhere; it was previously not probed, so a model that accepts
    it only as a keyword fell through to the default prediction.

    Args:
        model: fitted estimator exposing ``predict``.
        X: feature matrix passed through to ``predict``.

    Returns:
        Whatever ``model.predict`` returns for the chosen call form.
    """
    params = inspect.signature(model.predict).parameters
    for keyword in ("output", "output_type", "type"):
        if keyword in params:
            return model.predict(X, **{keyword: "score"})
    try:                                   # positional fallback
        return model.predict(X, "score")
    except TypeError:
        # last resort: default output assumed to be score
        return model.predict(X)
score = get_scores(eif, X)                # higher => more anomalous

# NOTE(review): mean/std of the STA/LTA peak are taken over all windows,
# test slice included.
sta_mu,sta_sd = feat[:,0].mean(),feat[:,0].std()

# --------- grid search (validation slice) ----------------------------------------
# Pick the (IF percentile, STA K, crest-factor minimum) triple with the best
# event-level F1 on the validation slice.
_true_vl = spans(win_truth[VL])           # loop-invariant ground-truth events
best=(0,0,0,-1)
for q in GRID_IF:
    g_if = score > np.percentile(score[VL], q)
    for k in GRID_K:
        g_sta = feat[:,0] > sta_mu + k*sta_sd
        for cf in GRID_CF:
            g_cf = feat[:,3] > cf
            m    = (g_if|g_sta)&(g_sta|g_cf)
            det,true = spans(m[VL]),_true_vl
            tp=sum(any(not(d1<s0 or d0>s1) for d0,d1 in det) for s0,s1 in true)
            fn=len(true)-tp
            fp=sum(not any(not(d1<s0 or d0>s1) for s0,s1 in true) for d0,d1 in det)
            P,R=(tp/(tp+fp) if tp+fp else 0),(tp/(tp+fn) if tp+fn else 0)
            F=2*P*R/(P+R+1e-9)
            if F>best[3]: best=(q,k,cf,F)
q,k,cf = best[:3]
print(f"best grid on val: IF>{q}p  STA>(μ+{k}σ)  CF>{cf}")

# Apply exactly the thresholds that were tuned: the score cut-off is the
# q-th percentile of the VALIDATION scores, as inside the search. Taking the
# percentile of all scores instead gave a different cut-off and used test data.
_if_gate  = score > np.percentile(score[VL], q)
_sta_gate = feat[:,0] > sta_mu + k*sta_sd
_cf_gate  = feat[:,3] > cf
mask = (_if_gate | _sta_gate) & (_sta_gate | _cf_gate)

# ───────── metrics ─────────────────────────────────────────────────
def win_PRF(m,t):
    """Window-level precision/recall/F1 and confusion counts.

    Args:
        m: boolean predictions, one per window.
        t: boolean ground truth, same length as `m`.

    Returns:
        Dict with P, R, F1 (rounded to 2 decimals) and integer TP, FP, FN, TN.
    """
    P,R,F,_=precision_recall_fscore_support(t,m,average='binary')
    # Pin the label set so a single-class slice still yields a 2x2 matrix;
    # otherwise the 4-way unpacking raises ValueError.
    tn,fp,fn,tp=confusion_matrix(t,m,labels=[False,True]).ravel()
    # Plain ints keep the printed dict free of NumPy scalar wrappers.
    return dict(P=round(P,2),R=round(R,2),F1=round(F,2),
                TP=int(tp),FP=int(fp),FN=int(fn),TN=int(tn))
def flash_PRF(m, t):
    """Event-level precision/recall/F1 using overlap matching on runs.

    `m` (predictions) and `t` (truth) are per-window boolean sequences that
    are collapsed into runs with `spans`; any overlap counts as a match.

    Returns:
        Dict with P, R, F1 (rounded to 2 decimals) and TP, FP, FN, TN.
        NOTE(review): TN is the number of windows minus the event counts,
        which mixes two units - treat it as indicative only.
    """
    det = spans(m)
    true = spans(t)

    def overlaps(d, s):
        return not (d[1] < s[0] or d[0] > s[1])

    tp = sum(any(overlaps(d, s) for d in det) for s in true)
    fn = len(true) - tp
    fp = sum(not any(overlaps(d, s) for s in true) for d in det)
    tn = max(0, len(m) - tp - fp - fn)

    P = tp / (tp + fp) if tp + fp else 0
    R = tp / (tp + fn) if tp + fn else 0
    F = 2 * P * R / (P + R + 1e-9)
    return dict(P=round(P, 2), R=round(R, 2), F1=round(F, 2),
                TP=tp, FP=fp, FN=fn, TN=tn)

# Window- and event-level scores for each split.
for lbl, sl in (("train", TR), ("val", VL), ("test", TE)):
    tag = lbl.upper()
    print(f"\n{tag} WINDOW", win_PRF(mask[sl], win_truth[sl]))
    print(f"{tag} FLASH ", flash_PRF(mask[sl], win_truth[sl]))


In [None]:
!pip install pyod isotree

import numpy as np
from isotree import IsolationForest
from pyod.models.lof import LOF
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.copod import COPOD
from pyod.models.suod import SUOD

# Reproducibility: all generators below draw from NumPy's global RNG.
np.random.seed(42)

# Parameters
FS = 100_000            # Sampling rate (Hz)
TRAIN_SEC = 120         # 2 minutes training duration (no anomalies)
VAL_SEC   = 60          # 1 minute validation duration (with anomalies)
TEST_SEC  = 180         # 3 minutes test duration (with anomalies)
WIN_SEC   = 0.05        # Sliding window length in seconds (50 ms)
WIN_SIZE  = int(WIN_SEC * FS)   # window length in samples
STEP_SIZE = WIN_SIZE // 2  # 50% overlap for better temporal resolution

# 1. Synthetic Data Generation
def generate_baseline(n_samples, phi=0.999, noise_std=0.01):
    """Generate smooth baseline noise via an AR(1) process.

    Implements ``x[0] = 0`` and ``x[i] = phi * x[i-1] + e[i]`` with
    ``e[i] ~ N(0, noise_std)`` drawn from NumPy's global RNG. The recursion is
    evaluated with ``scipy.signal.lfilter`` rather than a per-sample Python
    loop, which took millions of interpreter iterations for the
    multi-minute signals generated in this notebook.

    The recursion runs in float64 and is cast to float32 once at the end, so
    values can differ from a float32-accumulating loop at rounding level.

    Args:
        n_samples: number of samples to generate.
        phi: AR(1) coefficient (close to 1 gives high autocorrelation).
        noise_std: standard deviation of the driving noise.

    Returns:
        float32 array of length `n_samples`.
    """
    from scipy.signal import lfilter  # local import: only this function needs it

    if n_samples <= 1:
        # Nothing to filter: empty input, or just the zero initial sample.
        return np.zeros(n_samples, dtype=np.float32)

    drive = np.empty(n_samples, dtype=np.float64)
    drive[0] = 0.0                                   # x[0] is fixed at zero
    drive[1:] = np.random.normal(0, noise_std, size=n_samples - 1)
    # y[i] = drive[i] + phi * y[i-1]
    return lfilter([1.0], [1.0, -phi], drive).astype(np.float32)

def generate_event(fs, duration, amplitude):
    """Synthesise one event: an exponentially decaying sine plus Gaussian noise.

    Random draws come from NumPy's global RNG in this order: oscillation
    frequency, decay fraction, additive noise.

    Args:
        fs: sampling rate.
        duration: event length in seconds.
        amplitude: peak amplitude of the oscillation.

    Returns:
        float32 array of ``int(duration * fs)`` samples (empty when that is < 1).
    """
    n = int(duration * fs)
    if n < 1:
        return np.array([], dtype=np.float32)

    nyquist = fs / 2
    # Oscillation frequency between 0.1 and 0.3 of Nyquist.
    freq = np.random.uniform(0.1*fs/2, 0.3*fs/2)
    # Decay constant as a fraction (0.1-0.3) of the event length.
    tau = np.random.uniform(0.1, 0.3) * duration

    t = np.arange(n) / fs
    envelope = amplitude * np.exp(-t/tau)
    waveform = envelope * np.sin(2*np.pi*freq*t)
    # Additive noise at 10 % of the amplitude.
    waveform += np.random.normal(0, amplitude*0.1, size=n)
    return waveform.astype(np.float32)

def inject_events(signal, fs, n_events, min_dur, max_dur, min_amp, max_amp, safety_margin=0.1):
    """Add up to `n_events` non-overlapping synthetic events to `signal` in place.

    For each event a duration and amplitude are drawn uniformly, then up to
    1000 random start positions are tried; the first one that keeps
    `safety_margin` seconds of clearance from every already placed event is
    used. An event that cannot be placed within 1000 attempts is skipped
    silently.

    Returns:
        List of ``(start_idx, end_idx)`` sample spans, sorted by start.
    """
    n_samples = len(signal)
    margin = safety_margin * fs
    placed = []
    for _ in range(n_events):
        dur = np.random.uniform(min_dur, max_dur)
        amp = np.random.uniform(min_amp, max_amp)
        length = int(dur * fs)
        for _attempt in range(1000):
            start_idx = np.random.randint(int(margin), n_samples - length - int(margin))
            end_idx = start_idx + length
            # Candidate must lie wholly before or wholly after every placed
            # event, with the margin on both sides.
            is_clear = all(
                (end_idx + margin < s) or (start_idx - margin > e)
                for s, e in placed
            )
            if not is_clear:
                continue
            placed.append((start_idx, end_idx))
            signal[start_idx:end_idx] += generate_event(fs, dur, amp)
            break
    placed.sort(key=lambda span: span[0])
    return placed

# Generate baseline signals (training stays event-free).
# The call order matters: every generator draws from NumPy's global RNG.
train_signal = generate_baseline(int(TRAIN_SEC * FS))
val_signal   = generate_baseline(int(VAL_SEC * FS))
test_signal  = generate_baseline(int(TEST_SEC * FS))

# Inject synthetic lightning events into validation and test signals
# (3 and 5 events, 3-50 ms long, amplitude 3-8); the signals are modified in
# place and the returned lists hold the (start, end) sample spans.
val_events  = inject_events(val_signal, FS, n_events=3, min_dur=0.003, max_dur=0.05, min_amp=3, max_amp=8)
test_events = inject_events(test_signal, FS, n_events=5, min_dur=0.003, max_dur=0.05, min_amp=3, max_amp=8)

# 2. Feature Extraction per Sliding Window
def extract_features(signal, events):
    """Compute nine features and a binary label for each sliding window.

    Windows are WIN_SIZE samples long and advance by STEP_SIZE. Feature order:
    mean, std, max, min, max |x|, kurtosis, skewness, high-frequency power
    ratio, squared-difference energy.

    Args:
        signal: 1-D sample array.
        events: iterable of ``(start, end)`` sample spans of true events.

    Returns:
        ``(X, y)``: float feature matrix and int label vector (1 when the
        window overlaps any event span, else 0).
    """
    rows = []
    window_labels = []
    for start in range(0, len(signal) - WIN_SIZE + 1, STEP_SIZE):
        end = start + WIN_SIZE
        window = signal[start:end]

        # Time-domain statistics
        mean_val, std_val = window.mean(), window.std()
        max_val, min_val = window.max(), window.min()
        max_abs = np.max(np.abs(window))

        # Higher-order moments of the standardised window; left at 0 for
        # (near-)constant windows.
        if std_val > 1e-8:
            z = (window - mean_val) / std_val
            kurtosis_val = np.mean(z**4)      # raw kurtosis (normal ~3)
            skew_val = np.mean(z**3)
        else:
            kurtosis_val = 0.0
            skew_val = 0.0

        # Share of spectral power in the upper half of the rFFT bins.
        power_spec = np.abs(np.fft.rfft(window))**2
        total_power = power_spec.sum()
        if total_power > 0:
            high_power = power_spec[len(power_spec)//2:].sum()
            high_freq_ratio = high_power / total_power
        else:
            high_freq_ratio = 0.0

        # Energy of the first difference (another high-frequency indicator).
        diff_energy = np.sum(np.diff(window)**2)

        rows.append([mean_val, std_val, max_val, min_val, max_abs,
                     kurtosis_val, skew_val, high_freq_ratio, diff_energy])

        overlaps_event = any(
            evt_start < end and evt_end > start for evt_start, evt_end in events
        )
        window_labels.append(1 if overlaps_event else 0)
    return np.array(rows, dtype=float), np.array(window_labels, dtype=int)

# Per-window features and labels for each split (training has no events).
X_train, y_train = extract_features(train_signal, [])
X_val,   y_val   = extract_features(val_signal,   val_events)
X_test,  y_test  = extract_features(test_signal,  test_events)

# 3. Model Training (Unsupervised)
# Train on normal training data (no anomalies)
iso_model = IsolationForest(nthreads=-1, ntrees=100, sample_size=256, random_seed=42)
iso_model.fit(X_train)  # Isolation Forest (isotree)

lof_model = LOF(n_neighbors=20, contamination=0.01)  # LOF from PyOD (contamination for thresholding)
lof_model.fit(X_train)

ae_model = AutoEncoder(hidden_neurons=[64, 32], epochs=10, batch_size=128,
                       contamination=0.01, preprocessing=True, verbose=0, random_state=42)
ae_model.fit(X_train)  # Autoencoder (learn normal pattern; anomalies yield higher recon error)

# Ensemble using SUOD (Average combination of IF, LOF, COPOD)
# NOTE(review): the first base estimator is isotree's IsolationForest, not a
# PyOD detector - confirm SUOD accepts it (PyOD ships its own IForest).
base_detectors = [IsolationForest(nthreads=1, ntrees=100, random_seed=42),
                  LOF(n_neighbors=20),
                  COPOD()]
ensemble = SUOD(base_estimators=base_detectors, combination='average', n_jobs=-1, random_state=42)
ensemble.fit(X_train)

# 4. Threshold Calibration on Validation Set
# Compute anomaly scores for validation windows
scores_val_iso = iso_model.predict(X_val, output="score")        # isotree: higher score = more outlier
scores_val_lof = lof_model.decision_function(X_val)             # PyOD: higher = more abnormal (LOF)
scores_val_ae  = ae_model.decision_function(X_val)              # reconstruction error per window
scores_val_suod= ensemble.decision_function(X_val)              # average ensemble score

# Determine optimal threshold on val to maximize flash-level F1
# (We evaluate each model's scores vs ground truth event spans)
def get_events_from_labels(y_seq):
    """Convert a 0/1 label sequence into inclusive ``(start, end)`` index spans.

    A span opens at the first label equal to 1 and closes just before the
    next label equal to 0; a span still open at the end runs to the last index.
    """
    spans_found = []
    opened = None
    for pos, label in enumerate(y_seq):
        if opened is None:
            if label == 1:
                opened = pos
        elif label == 0:
            spans_found.append((opened, pos - 1))
            opened = None
    if opened is not None:
        spans_found.append((opened, len(y_seq) - 1))
    return spans_found

# Ground-truth event spans (window indices) of the validation set; read as a
# global by event_f1_for_threshold below.
val_true_events = get_events_from_labels(y_val)

def event_f1_for_threshold(scores, thr):
    """Event-level F1, precision and recall on the validation set for one threshold.

    Windows with ``scores >= thr`` are flagged, merged into events and matched
    against the global `val_true_events`. Each true event can be claimed by
    at most one predicted event; a predicted event that claims none is a
    false positive.

    Returns:
        Tuple ``(f1, precision, recall)``.
    """
    flagged = (scores >= thr).astype(int)
    pred_events = get_events_from_labels(flagged)

    claimed = set()
    tp = fp = 0
    for p_start, p_end in pred_events:
        # First still-unclaimed true event that overlaps this prediction.
        hit = next(
            (j for j, (t_start, t_end) in enumerate(val_true_events)
             if j not in claimed and not (p_end < t_start or p_start > t_end)),
            None,
        )
        if hit is None:
            fp += 1
        else:
            claimed.add(hit)
            tp += 1

    fn = len(val_true_events) - len(claimed)
    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2*prec*rec / (prec + rec) if prec + rec > 0 else 0.0
    return f1, prec, rec

def find_best_threshold(scores):
    """Scan every unique score as a threshold and keep the best validation F1.

    Returns:
        Tuple ``(threshold, f1, precision, recall)``. The threshold is None
        (and the metrics 0) when no candidate reaches an F1 above zero.
        NOTE(review): callers compare scores against the threshold directly,
        which fails for None - consider handling that case.
    """
    best = (None, 0, 0, 0)          # (thr, f1, prec, rec)
    for thr in np.unique(scores):
        f1, prec, rec = event_f1_for_threshold(scores, thr)
        if f1 > best[1]:
            best = (thr, f1, prec, rec)
    return best

# Per-model threshold that maximises event-level F1 on the validation set
# (only the threshold is kept; the metrics are discarded).
thr_iso, _, _, _   = find_best_threshold(scores_val_iso)
thr_lof, _, _, _   = find_best_threshold(scores_val_lof)
thr_ae,  _, _, _   = find_best_threshold(scores_val_ae)
thr_suod, _, _, _  = find_best_threshold(scores_val_suod)

# 5. Evaluate on Test Set
# Compute anomaly scores on test windows
scores_test_iso  = iso_model.predict(X_test, output="score")
scores_test_lof  = lof_model.decision_function(X_test)
scores_test_ae   = ae_model.decision_function(X_test)
scores_test_suod = ensemble.decision_function(X_test)

# Label predictions using chosen thresholds (1 = anomalous window)
pred_test_iso  = (scores_test_iso  >= thr_iso).astype(int)
pred_test_lof  = (scores_test_lof  >= thr_lof).astype(int)
pred_test_ae   = (scores_test_ae   >= thr_ae).astype(int)
pred_test_suod = (scores_test_suod >= thr_suod).astype(int)

# Compute window-level Precision/Recall/F1
def compute_window_metrics(y_true, y_pred):
    """Window-level precision, recall and F1 for 0/1 label arrays.

    Each ratio is 0.0 when its denominator is zero.

    Returns:
        Tuple ``(precision, recall, f1)``.
    """
    predicted_pos = y_pred == 1
    actual_pos = y_true == 1
    tp = np.sum(predicted_pos & actual_pos)
    fp = np.sum(predicted_pos & (y_true == 0))
    fn = np.sum((y_pred == 0) & actual_pos)

    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    if prec + rec > 0:
        f1 = 2*prec*rec / (prec + rec)
    else:
        f1 = 0.0
    return prec, rec, f1

# Window-level precision / recall / F1 on the test set, one triple per model.
iso_win_prec, iso_win_rec, iso_win_f1   = compute_window_metrics(y_test, pred_test_iso)
lof_win_prec, lof_win_rec, lof_win_f1   = compute_window_metrics(y_test, pred_test_lof)
ae_win_prec,  ae_win_rec,  ae_win_f1    = compute_window_metrics(y_test, pred_test_ae)
suod_win_prec, suod_win_rec, suod_win_f1= compute_window_metrics(y_test, pred_test_suod)

# Compute flash/event-level Precision/Recall/F1
# NOTE(review): `test_true_events` is not used in the rest of this cell
# (compute_event_metrics derives the true events itself).
test_true_events = get_events_from_labels(y_test)
def compute_event_metrics(y_true, y_pred):
    """Event-level precision, recall and F1 for two 0/1 window-label sequences.

    Both sequences are merged into events with `get_events_from_labels`. Each
    true event can be claimed by at most one predicted event (the first that
    overlaps it); predicted events that claim nothing are false positives.

    Returns:
        Tuple ``(precision, recall, f1)``; each is 0.0 when undefined.
    """
    true_events = get_events_from_labels(y_true)
    pred_events = get_events_from_labels(y_pred)

    claimed = set()
    tp = fp = 0
    for p_start, p_end in pred_events:
        # First still-unclaimed true event overlapping this prediction.
        hit = next(
            (j for j, (t_start, t_end) in enumerate(true_events)
             if j not in claimed and not (p_end < t_start or p_start > t_end)),
            None,
        )
        if hit is None:
            fp += 1
        else:
            claimed.add(hit)
            tp += 1

    fn = len(true_events) - len(claimed)
    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2*prec*rec / (prec + rec) if prec + rec > 0 else 0.0
    return prec, rec, f1

# Event-level precision / recall / F1 on the test set, one triple per model.
iso_evt_prec, iso_evt_rec, iso_evt_f1   = compute_event_metrics(y_test, pred_test_iso)
lof_evt_prec, lof_evt_rec, lof_evt_f1   = compute_event_metrics(y_test, pred_test_lof)
ae_evt_prec,  ae_evt_rec,  ae_evt_f1    = compute_event_metrics(y_test, pred_test_ae)
suod_evt_prec, suod_evt_rec, suod_evt_f1= compute_event_metrics(y_test, pred_test_suod)

# Report metrics: one line per model, values printed with three decimals.
print("Window-level metrics (Precision, Recall, F1):")
print(f"IsolationForest: {iso_win_prec:.3f}, {iso_win_rec:.3f}, {iso_win_f1:.3f}")
print(f"LOF:            {lof_win_prec:.3f}, {lof_win_rec:.3f}, {lof_win_f1:.3f}")
print(f"AutoEncoder:    {ae_win_prec:.3f}, {ae_win_rec:.3f}, {ae_win_f1:.3f}")
print(f"Ensemble SUOD:  {suod_win_prec:.3f}, {suod_win_rec:.3f}, {suod_win_f1:.3f}")
print("\nFlash-level (event) metrics (Precision, Recall, F1):")
print(f"IsolationForest: {iso_evt_prec:.3f}, {iso_evt_rec:.3f}, {iso_evt_f1:.3f}")
print(f"LOF:             {lof_evt_prec:.3f}, {lof_evt_rec:.3f}, {lof_evt_f1:.3f}")
print(f"AutoEncoder:     {ae_evt_prec:.3f}, {ae_evt_rec:.3f}, {ae_evt_f1:.3f}")
print(f"Ensemble SUOD:   {suod_evt_prec:.3f}, {suod_evt_rec:.3f}, {suod_evt_f1:.3f}")
