In [None]:
# Mount Google Drive (Colab-only API) so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import numpy.lib.stride_tricks as st

# --------------------------------------------------
# Paths and basic configuration
# --------------------------------------------------
# Parquet file with the preprocessed observations (on the mounted Drive).
PATH = "/content/drive/MyDrive/Datamining-TSC-Project/new_processed_data.parquet"

# Shared settings for feature and label construction.
CFG = dict(
    time_col="time",
    window=36,            # sliding window length (hours)
    trend_h=12,           # recent hours for trend checks
    ma_hours=[3, 6, 12],  # moving average windows
)

# --------------------------------------------------
# Angle utility functions
# --------------------------------------------------
def wrap360(x):
    """Wrap angles given in degrees into the half-open range [0, 360).

    Works on anything that supports ``%`` and ``+`` with floats (Python
    scalars or NumPy arrays); array inputs are wrapped element-wise.
    """
    full_turn = 360.0
    # Reduce once, shift by a full turn, then reduce again.
    shifted = x % full_turn + full_turn
    return shifted % full_turn

def angle_diff_deg(a, b):
    """Return the signed difference ``a - b`` in degrees, folded into [-180, 180).

    Inputs may be scalars or NumPy arrays (combined element-wise).
    """
    half_turn = 180.0
    delta = a - b
    # Shift by half a turn, wrap to [0, 360), then shift back.
    return (delta + half_turn) % 360.0 - half_turn

def wave_dir_convert(old_wave_dir):
    """Map a wave direction ``d`` (degrees) to ``(270 - d)`` wrapped into [0, 360).

    The notebook uses this to express wave direction in the same convention
    as the wind direction before differencing the two.
    """
    mirrored = 270.0 - old_wave_dir
    return wrap360(mirrored)

# --------------------------------------------------
# Moving average feature generator
# --------------------------------------------------
def add_moving_averages(df, cols, ma_hours):
    """Append trailing rolling-mean columns to ``df`` and return it.

    For every ``col`` in ``cols`` and every window ``h`` in ``ma_hours`` a
    column named ``f"{col}_ma{h}"`` is written into ``df`` (the frame is
    modified in place). ``min_periods=h`` means the first ``h - 1`` rows of
    each new column are NaN.
    """
    for col in cols:
        source = df[col]
        for h in ma_hours:
            rolled = source.rolling(window=h, min_periods=h)
            df[f"{col}_ma{h}"] = rolled.mean()
    return df

# --------------------------------------------------
# Load and clean data
# --------------------------------------------------
# Read the parquet file, parse timestamps, order rows chronologically, keep one
# row per timestamp, then select the modelling columns and give them short names.
df = (
    pd.read_parquet(PATH)
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .sort_values("time")
    .drop_duplicates("time")
    .reset_index(drop=True)
    .loc[
        :,
        [
            "time",
            "Wind speed",
            "Wind Direction",
            "Wave Period",
            "Wave Direction",
            "Wave Height",
            "Wave Power",
            "Pressure",
            "temperature",
            "Surge Height",
            "Total Water Level",
            "Wave Steepness",
        ],
    ]
    .rename(
        columns={
            "Wind speed": "ws",
            "Wind Direction": "wd",
            "Wave Period": "tp",
            "Wave Direction": "wdir",
            "Wave Height": "hs",
            "Wave Power": "pwr",
            "Pressure": "mslp",
            "temperature": "temp",
            "Surge Height": "surge",
            "Total Water Level": "twl",
            "Wave Steepness": "steep",
        }
    )
)

# --------------------------------------------------
# Wind-wave direction alignment features
# --------------------------------------------------
wd = df["wd"].to_numpy(np.float32)
# Wave direction passed through wave_dir_convert so it can be differenced with wd
wdir = wave_dir_convert(df["wdir"].to_numpy(np.float32))

# Signed wind-minus-wave angle, in degrees and in radians
dwd_deg = angle_diff_deg(wd, wdir).astype(np.float32)
dwd_rad = np.deg2rad(dwd_deg).astype(np.float32)

# The sin/cos encoding of the difference replaces the raw direction columns
df["dwd_sin"] = np.sin(dwd_rad).astype(np.float32)
df["dwd_cos"] = np.cos(dwd_rad).astype(np.float32)
df = df.drop(columns=["wd", "wdir"])

# --------------------------------------------------
# Add moving average features
# --------------------------------------------------
ma_cols = ["hs", "ws", "pwr", "mslp", "temp", "surge", "twl", "steep", "tp"]
df = add_moving_averages(df, ma_cols, CFG["ma_hours"])

# --------------------------------------------------
# Sliding window statistics
# --------------------------------------------------
W = CFG["window"]
H = CFG["trend_h"]

hs   = df["hs"].to_numpy(np.float32)
pwr  = df["pwr"].to_numpy(np.float32)
mslp = df["mslp"].to_numpy(np.float32)
ws   = df["ws"].to_numpy(np.float32)
dwd_cos = df["dwd_cos"].to_numpy(np.float32)

# Create rolling windows: row i is a view of samples i .. i+W-1
hs_w   = st.sliding_window_view(hs,   W)
pwr_w  = st.sliding_window_view(pwr,  W)
mslp_w = st.sliding_window_view(mslp, W)
ws_w   = st.sliding_window_view(ws,   W)
dwd_cos_w = st.sliding_window_view(dwd_cos, W)

# Window-based severity metrics (mean + 2*std)
hs_metric36  = hs_w.mean(axis=1)  + 2.0 * hs_w.std(axis=1)
pwr_metric36 = pwr_w.mean(axis=1) + 2.0 * pwr_w.std(axis=1)

# --------------------------------------------------
# Train / validation / test split by time
# --------------------------------------------------
SPLIT_VAL_START  = np.datetime64("2015-01-01")
SPLIT_TEST_START = np.datetime64("2020-01-01")

# Timestamp of the first and of the last sample of every window.
start_times = df["time"].iloc[:len(hs_metric36)].to_numpy()
end_times   = df["time"].iloc[W - 1:].to_numpy()

# A window belongs to a split only if it lies entirely inside it. Assigning by
# start time alone would put windows that begin in the last W-1 samples before
# a boundary into the earlier split even though they extend past the boundary
# and share samples with the first windows of the later split.
train_mask = end_times < SPLIT_VAL_START
val_mask   = (start_times >= SPLIT_VAL_START) & (end_times < SPLIT_TEST_START)
test_mask  = start_times >= SPLIT_TEST_START

# --------------------------------------------------
# Percentile-based severity thresholds (train only)
# --------------------------------------------------
hs_p75, hs_p92, hs_p98, hs_p995 = np.percentile(
    hs_metric36[train_mask], [75, 92, 98, 99.5]
)
pwr_p75, pwr_p92, pwr_p98, pwr_p995 = np.percentile(
    pwr_metric36[train_mask], [75, 92, 98, 99.5]
)

# Map continuous values to severity classes
def severity(x, p75, p92, p98, p995):
    """Bin the values of array ``x`` into severity classes 0-4 (``int8``).

    Class 1 is ``[p75, p92)``, class 2 ``[p92, p98)``, class 3 ``[p98, p995)``
    and class 4 ``>= p995``. Everything else, including NaN, is class 0.
    """
    # np.select takes the first matching condition, so the conditions are
    # listed from the highest class down.
    conditions = [
        x >= p995,
        (x >= p98) & (x < p995),
        (x >= p92) & (x < p98),
        (x >= p75) & (x < p92),
    ]
    return np.select(conditions, [4, 3, 2, 1], default=0).astype(np.int8)

sev_hs  = severity(hs_metric36,  hs_p75,  hs_p92,  hs_p98,  hs_p995)
sev_pwr = severity(pwr_metric36, pwr_p75, pwr_p92, pwr_p98, pwr_p995)

# Base severity: the worse of the wave-height and wave-power classes
base = np.maximum(sev_hs, sev_pwr)

# --------------------------------------------------
# Trend-based reinforcement rules (train only)
# --------------------------------------------------
# Hourly thresholds are taken from rows before 2015-01-01.
train_hours_mask = df["time"] < "2015-01-01"

hs_th    = np.percentile(hs[train_hours_mask],      92)
pwr_th   = np.percentile(pwr[train_hours_mask],     92)
ws_th    = np.percentile(ws[train_hours_mask],      92)
mslp_th  = np.percentile(mslp[train_hours_mask],    20)
align_th = np.percentile(dwd_cos[train_hours_mask], 75)

# Per window: how many of the five conditions hold for enough of the last H samples
cnt = np.sum(
    [
        (hs_w[:, -H:]      >= hs_th).sum(axis=1)    >= 6,
        (pwr_w[:, -H:]     >= pwr_th).sum(axis=1)   >= 6,
        (ws_w[:, -H:]      >= ws_th).sum(axis=1)    >= 6,
        (mslp_w[:, -H:]    <= mslp_th).sum(axis=1)  >= 6,
        (dwd_cos_w[:, -H:] >= align_th).sum(axis=1) >= 4,
    ],
    axis=0,
)

# Final labels: `cnt` is computed above but not applied to the labels here
y = base.copy()

# --------------------------------------------------
# Sanity checks
# --------------------------------------------------
print("Columns:", df.columns.tolist())
print("Class dist:")
for name, m in {"train": train_mask, "val": val_mask, "test": test_mask}.items():
    shares = pd.Series(y[m]).value_counts(normalize=True).sort_index()
    print(name, shares.to_dict())


Columns: ['time', 'ws', 'tp', 'hs', 'pwr', 'mslp', 'temp', 'surge', 'twl', 'steep', 'dwd_sin', 'dwd_cos', 'hs_ma3', 'hs_ma6', 'hs_ma12', 'ws_ma3', 'ws_ma6', 'ws_ma12', 'pwr_ma3', 'pwr_ma6', 'pwr_ma12', 'mslp_ma3', 'mslp_ma6', 'mslp_ma12', 'temp_ma3', 'temp_ma6', 'temp_ma12', 'surge_ma3', 'surge_ma6', 'surge_ma12', 'twl_ma3', 'twl_ma6', 'twl_ma12', 'steep_ma3', 'steep_ma6', 'steep_ma12', 'tp_ma3', 'tp_ma6', 'tp_ma12']
Class dist:
train {0: 0.725734690152414, 1: 0.1796986705606766, 2: 0.07084131909585957, 3: 0.017933740987496578, 4: 0.005791579203553284}
val {0: 0.7295774005111354, 1: 0.17819003285870755, 2: 0.06863818912011684, 3: 0.015950164293537787, 4: 0.007644213216502373}
test {0: 0.735375345217173, 1: 0.18250291009517722, 2: 0.060712573893593226, 3: 0.016889964165886836, 4: 0.0045192066281697215}


CNN WITH TEMPERATURE

In [None]:
import numpy as np
import pandas as pd
import numpy.lib.stride_tricks as st

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# --------------------------------------------------
# Device setup
# --------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# --------------------------------------------------
# Reproducibility
# --------------------------------------------------
# Seed the NumPy and PyTorch global RNGs so weight initialisation, dropout and
# DataLoader shuffling start from the same state on every run.
# NOTE(review): GPU kernels can still be non-deterministic; see the PyTorch
# reproducibility notes if bit-exact reruns are required.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# --------------------------------------------------
# Feature configuration
# --------------------------------------------------
COLS = ["ws", "tp", "mslp", "surge", "twl", "steep", "temp", "dwd_sin", "dwd_cos"]
W = int(W)  # window length (defined by the label-construction cell)

# --------------------------------------------------
# Training hyperparameters
# --------------------------------------------------
BATCH_TRAIN = 256
BATCH_EVAL  = 1024
EPOCHS      = 50
PATIENCE    = 8

LR           = 5e-4
WEIGHT_DECAY = 1e-3
DROPOUT      = 0.25

PIN_MEMORY  = (device == "cuda")

# --------------------------------------------------
# Build sliding window input tensor
# --------------------------------------------------
arrs = [df[c].to_numpy(np.float32) for c in COLS]
X_list = [st.sliding_window_view(a, window_shape=W) for a in arrs]

# Shape: (samples, window, features). The stacked array is already float32, so
# copy=False avoids duplicating the whole tensor.
X = np.stack(X_list, axis=-1).astype(np.float32, copy=False)

# Align labels with windows
y = np.asarray(y)
y = y[:len(X)]

train_mask = np.asarray(train_mask)[:len(X)]
val_mask   = np.asarray(val_mask)[:len(X)]
test_mask  = np.asarray(test_mask)[:len(X)]

# Split data
X_train, y_train = X[train_mask], y[train_mask]
X_val,   y_val   = X[val_mask],   y[val_mask]
X_test,  y_test  = X[test_mask],  y[test_mask]

print("Shapes:",
      "\n  X_train:", X_train.shape,
      "\n  X_val:  ", X_val.shape,
      "\n  X_test: ", X_test.shape)
print("y_train unique:", np.unique(y_train), "dtype:", y_train.dtype)

# --------------------------------------------------
# Normalize using TRAIN statistics only (leakage-free)
# --------------------------------------------------
# Per-feature mean/std over all training windows and time steps. The sums run
# over millions of float32 values, so they are accumulated in float64 (NumPy
# documents float32 accumulation in mean/std as potentially inaccurate). The
# loop works one feature at a time to keep the float64 temporaries small.
n_feat = X_train.shape[-1]
flat_train = X_train.reshape(-1, n_feat)
mu  = np.empty((1, 1, n_feat), dtype=np.float32)
std = np.empty((1, 1, n_feat), dtype=np.float32)
for j in range(n_feat):
    feat = flat_train[:, j]
    mu[0, 0, j]  = feat.mean(dtype=np.float64)
    std[0, 0, j] = feat.std(dtype=np.float64) + 1e-6  # epsilon guards constant features

X_train = (X_train - mu) / std
X_val   = (X_val   - mu) / std
X_test  = (X_test  - mu) / std

# --------------------------------------------------
# PyTorch dataset (channels-first for Conv1D)
# --------------------------------------------------
class WindowDataset(Dataset):
    """In-memory dataset of fixed-length windows for Conv1d models.

    ``X_np`` is indexed ``(sample, time, feature)`` and is stored as a
    contiguous tensor with the last two axes swapped, i.e. channels-first.
    ``y_np`` holds one integer label per sample and is stored as ``int64``.
    """

    def __init__(self, X_np, y_np):
        windows = torch.from_numpy(X_np)
        self.X = windows.transpose(1, 2).contiguous()
        self.y = torch.from_numpy(y_np.astype(np.int64))

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = WindowDataset(X_train, y_train)
val_ds   = WindowDataset(X_val,   y_val)
test_ds  = WindowDataset(X_test,  y_test)

# Data loaders: only the training loader shuffles
loader_kw = {"num_workers": 2, "pin_memory": PIN_MEMORY}
train_loader = DataLoader(train_ds, batch_size=BATCH_TRAIN, shuffle=True,  **loader_kw)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_EVAL,  shuffle=False, **loader_kw)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_EVAL,  shuffle=False, **loader_kw)

# --------------------------------------------------
# Class weights (balanced + softened)
# --------------------------------------------------
classes = np.unique(y_train)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# The square root compresses the spread between rare and common classes
cw_soft = np.sqrt(cw)

n_classes = int(classes.max()) + 1

# Labels missing from y_train keep weight 1.0
class_weight = torch.ones(n_classes, dtype=torch.float32)
class_weight[torch.from_numpy(classes.astype(np.int64))] = torch.from_numpy(cw_soft.astype(np.float32))
class_weight = class_weight.to(device)

print("Class weights (balanced):", dict(zip(classes.tolist(), cw.tolist())))
print("Class weights (sqrt-soft):", dict(zip(classes.tolist(), cw_soft.tolist())))

# --------------------------------------------------
# CNN building blocks
# --------------------------------------------------
class ConvBlock(nn.Module):
    """Residual block: two Conv1d -> BatchNorm1d -> GELU stages plus a skip path.

    Args:
        in_ch: number of input channels.
        out_ch: number of output channels.
        k: kernel size of both convolutions; must be a positive odd integer so
            that ``padding=k // 2`` keeps the sequence length unchanged.
        p: dropout probability applied between the two convolutions.

    Raises:
        ValueError: if ``k`` is not a positive odd integer. With an even ``k``
            each convolution would lengthen the sequence by one, and the
            residual addition in ``forward`` would fail or broadcast.
    """

    def __init__(self, in_ch, out_ch, k=5, p=0.2):
        super().__init__()
        if k < 1 or k % 2 == 0:
            raise ValueError(f"ConvBlock requires a positive odd kernel size, got k={k}")
        self.net = nn.Sequential(
            nn.Conv1d(in_ch, out_ch, kernel_size=k, padding=k//2),
            nn.BatchNorm1d(out_ch),
            nn.GELU(),
            nn.Dropout(p),
            nn.Conv1d(out_ch, out_ch, kernel_size=k, padding=k//2),
            nn.BatchNorm1d(out_ch),
            nn.GELU(),
        )
        # A 1x1 convolution matches channel counts on the skip path when they differ.
        self.skip = nn.Conv1d(in_ch, out_ch, kernel_size=1) if in_ch != out_ch else nn.Identity()

    def forward(self, x):
        return self.net(x) + self.skip(x)

class StormCNN(nn.Module):
    """1D CNN classifier for channels-first multivariate windows.

    A convolutional stem feeds three ``ConvBlock`` stages (``base`` ->
    ``base`` -> ``2*base`` -> ``2*base`` channels). The time axis is then
    average-pooled to length 1 and a two-layer head produces ``n_classes``
    logits.
    """

    def __init__(self, in_ch, n_classes, base=64, dropout=0.25):
        super().__init__()
        wide = base * 2

        self.stem = nn.Sequential(
            nn.Conv1d(in_ch, base, kernel_size=3, padding=1),
            nn.BatchNorm1d(base),
            nn.GELU(),
        )
        self.b1 = ConvBlock(base, base, k=5, p=dropout)
        self.b2 = ConvBlock(base, wide, k=5, p=dropout)
        self.b3 = ConvBlock(wide, wide, k=3, p=dropout)

        self.pool = nn.AdaptiveAvgPool1d(1)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(wide, base),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(base, n_classes),
        )

    def forward(self, x):
        # Apply the stages strictly in order; the result is the head's logits.
        for stage in (self.stem, self.b1, self.b2, self.b3, self.pool, self.head):
            x = stage(x)
        return x

# --------------------------------------------------
# Model, loss, optimizer
# --------------------------------------------------
# One input channel per feature column stacked into the window tensor.
in_ch = len(COLS)
model = StormCNN(in_ch=in_ch, n_classes=n_classes, base=64, dropout=DROPOUT).to(device)

# Cross-entropy with per-class weights taken from the `class_weight` tensor.
criterion = nn.CrossEntropyLoss(weight=class_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# mode="max" because the training loop steps this scheduler with validation
# macro-F1; the learning rate is multiplied by 0.5 once that metric has not
# improved for longer than patience=3 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="max", factor=0.5, patience=3
)

# --------------------------------------------------
# Evaluation helpers
# --------------------------------------------------
def _predict(model, loader):
    """Run ``model`` over ``loader`` and return ``(y_true, y_pred)`` as NumPy arrays.

    Puts the model in eval mode (it is left in eval mode afterwards) and
    disables gradient tracking. Predictions are the argmax over dim 1 of the
    model output.
    """
    model.eval()
    targets, preds = [], []
    with torch.no_grad():
        for xb, yb in loader:
            scores = model(xb.to(device, non_blocking=True))
            preds.append(scores.argmax(dim=1).cpu().numpy())
            targets.append(yb.numpy())
    return np.concatenate(targets), np.concatenate(preds)

@torch.no_grad()
def eval_f1_and_dist(model, loader):
    """Return ``(macro_f1, histogram)`` for ``model`` on ``loader``.

    ``histogram`` maps each predicted class id to the number of samples
    assigned to it. Classes that were never predicted are absent.
    """
    y_true, y_pred = _predict(model, loader)
    labels, counts = np.unique(y_pred, return_counts=True)
    histogram = {int(label): int(count) for label, count in zip(labels, counts)}
    return f1_score(y_true, y_pred, average="macro"), histogram

# --------------------------------------------------
# One training epoch
# --------------------------------------------------
def train_one_epoch(model, loader):
    """Train ``model`` for one pass over ``loader`` and return the mean loss per sample.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.
    Gradients are clipped to a global norm of 1.0 before every step.
    """
    model.train()
    weighted_loss_sum = 0.0
    for inputs, targets in loader:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Weight by batch size so a smaller final batch is averaged correctly.
        weighted_loss_sum += batch_loss.item() * inputs.size(0)
    return weighted_loss_sum / len(loader.dataset)

# --------------------------------------------------
# Training loop with early stopping
# --------------------------------------------------
best_f1, best_state, bad = -1.0, None, 0

for ep in range(1, EPOCHS + 1):
    loss = train_one_epoch(model, train_loader)
    val_f1, pred_dist = eval_f1_and_dist(model, val_loader)
    scheduler.step(val_f1)

    lr_now = optimizer.param_groups[0]["lr"]
    print(f"Epoch {ep:02d} | loss={loss:.4f} | val_f1_macro={val_f1:.4f} | lr={lr_now:.2e} | pred_dist={pred_dist}")

    if val_f1 > best_f1 + 1e-4:
        # New best: snapshot the weights on CPU and reset the patience counter.
        best_f1 = val_f1
        best_state = {
            key: tensor.detach().cpu().clone()
            for key, tensor in model.state_dict().items()
        }
        bad = 0
        continue

    bad += 1
    if bad >= PATIENCE:
        print("Early stopping.")
        break

# Restore the best snapshot
if best_state is not None:
    model.load_state_dict(best_state)

# --------------------------------------------------
# Final reports
# --------------------------------------------------
def report_split(name, loader):
    """Print macro-F1, the classification report and the confusion matrix for one split.

    Evaluates the module-level ``model`` on ``loader``.
    """
    y_true, y_pred = _predict(model, loader)
    print(f"\n{name}")
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    print("F1-macro:", macro_f1)
    print(classification_report(y_true, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))

for split_name, split_loader in (
    ("TRAIN", train_loader),
    ("VAL",   val_loader),
    ("TEST",  test_loader),
):
    report_split(split_name, split_loader)


Device: cuda
Shapes: 
  X_train: (262968, 36, 9) 
  X_val:   (43824, 36, 9) 
  X_test:  (43813, 36, 9)
y_train unique: [0 1 2 3 4] dtype: int8
Class weights (balanced): {0: 0.2755828027980822, 1: 1.1129742884350862, 2: 2.823211122443502, 3: 11.152162849872774, 4: 34.53289560078792}
Class weights (sqrt-soft): {0: 0.5249598106503794, 1: 1.054975965809215, 2: 1.6802413881474, 3: 3.3394854169276997, 4: 5.876469654544973}
Epoch 01 | loss=0.2644 | val_f1_macro=0.8244 | lr=5.00e-04 | pred_dist={0: 31684, 1: 8452, 2: 2891, 3: 638, 4: 159}
Epoch 02 | loss=0.1730 | val_f1_macro=0.8781 | lr=5.00e-04 | pred_dist={0: 31621, 1: 7546, 2: 3694, 3: 699, 4: 264}
Epoch 03 | loss=0.1478 | val_f1_macro=0.8797 | lr=5.00e-04 | pred_dist={0: 31325, 1: 8389, 2: 2807, 3: 994, 4: 309}
Epoch 04 | loss=0.1328 | val_f1_macro=0.9054 | lr=5.00e-04 | pred_dist={0: 32227, 1: 7476, 2: 2925, 3: 919, 4: 277}
Epoch 05 | loss=0.1220 | val_f1_macro=0.8903 | lr=5.00e-04 | pred_dist={0: 31634, 1: 7939, 2: 3296, 3: 642, 4: 313}

CNN WITH MOVING AVERAGES (MAs)

In [None]:
import numpy as np
import pandas as pd
import numpy.lib.stride_tricks as st

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# --------------------------------------------------
# Device selection
# --------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# --------------------------------------------------
# Reproducibility
# --------------------------------------------------
# Seed the NumPy and PyTorch global RNGs so weight initialisation, dropout and
# DataLoader shuffling start from the same state on every run.
# NOTE(review): GPU kernels can still be non-deterministic; see the PyTorch
# reproducibility notes if bit-exact reruns are required.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# --------------------------------------------------
# Feature setup
# --------------------------------------------------
BASE_COLS = ["ws", "tp", "mslp", "surge", "twl", "steep", "temp", "dwd_sin", "dwd_cos"]
MA_HOURS = [3, 6, 12]

# Moving average features (only for selected base vars)
MA_BASE = ["ws", "tp", "mslp", "surge", "twl", "steep", "temp"]
MA_COLS = [f"{c}_ma{h}" for c in MA_BASE for h in MA_HOURS]

# Final feature list
COLS = BASE_COLS + MA_COLS
print("Using cols:", COLS)

# Sliding window length (defined by the label-construction cell)
W = int(W)

# --------------------------------------------------
# Training hyperparameters
# --------------------------------------------------
BATCH_TRAIN = 256
BATCH_EVAL  = 1024
EPOCHS      = 50
PATIENCE    = 8

LR           = 5e-4
WEIGHT_DECAY = 1e-3
DROPOUT      = 0.25

PIN_MEMORY  = (device == "cuda")

# --------------------------------------------------
# Build sliding-window input tensor
# --------------------------------------------------
arrs = [df[c].to_numpy(np.float32) for c in COLS]
X_list = [st.sliding_window_view(a, window_shape=W) for a in arrs]

# Shape: (num_windows, window_length, num_features). The stacked array is
# already float32, so copy=False avoids duplicating the whole tensor.
X = np.stack(X_list, axis=-1).astype(np.float32, copy=False)

# Align labels with windows
y = np.asarray(y)
y = y[:len(X)]

train_mask = np.asarray(train_mask)[:len(X)]
val_mask   = np.asarray(val_mask)[:len(X)]
test_mask  = np.asarray(test_mask)[:len(X)]

# Split data
X_train, y_train = X[train_mask], y[train_mask]
X_val,   y_val   = X[val_mask],   y[val_mask]
X_test,  y_test  = X[test_mask],  y[test_mask]

print("Shapes:",
      "\n  X_train:", X_train.shape,
      "\n  X_val:  ", X_val.shape,
      "\n  X_test: ", X_test.shape)
print("y_train unique:", np.unique(y_train), "dtype:", y_train.dtype)

# --------------------------------------------------
# Drop windows containing NaN or inf
# --------------------------------------------------
def _finite_mask(Xnp):
    """Return a boolean vector that is True for windows whose values are all finite."""
    return np.isfinite(Xnp).all(axis=(1,2))

tr_ok = _finite_mask(X_train)
va_ok = _finite_mask(X_val)
te_ok = _finite_mask(X_test)

d_tr = int((~tr_ok).sum())
d_va = int((~va_ok).sum())
d_te = int((~te_ok).sum())
if (d_tr + d_va + d_te) > 0:
    print(f"Dropping windows due to NaN/inf: train={d_tr}, val={d_va}, test={d_te}")

X_train, y_train = X_train[tr_ok], y_train[tr_ok]
X_val,   y_val   = X_val[va_ok],   y_val[va_ok]
X_test,  y_test  = X_test[te_ok],  y_test[te_ok]

# --------------------------------------------------
# Normalize using TRAIN statistics only
# --------------------------------------------------
# Per-feature mean/std over all training windows and time steps. The sums run
# over millions of float32 values, so they are accumulated in float64 (NumPy
# documents float32 accumulation in mean/std as potentially inaccurate). The
# loop works one feature at a time to keep the float64 temporaries small.
n_feat = X_train.shape[-1]
flat_train = X_train.reshape(-1, n_feat)
mu  = np.empty((1, 1, n_feat), dtype=np.float32)
std = np.empty((1, 1, n_feat), dtype=np.float32)
for j in range(n_feat):
    feat = flat_train[:, j]
    mu[0, 0, j]  = feat.mean(dtype=np.float64)
    std[0, 0, j] = feat.std(dtype=np.float64) + 1e-6  # epsilon guards constant features

X_train = (X_train - mu) / std
X_val   = (X_val   - mu) / std
X_test  = (X_test  - mu) / std

# --------------------------------------------------
# PyTorch Dataset (Conv1D expects channels-first)
# --------------------------------------------------
class WindowDataset(Dataset):
    """In-memory dataset of fixed-length windows for Conv1d models.

    ``X_np`` is indexed ``(sample, time, feature)`` and is stored as a
    contiguous tensor with the last two axes swapped, i.e. channels-first.
    ``y_np`` holds one integer label per sample and is stored as ``int64``.
    """

    def __init__(self, X_np, y_np):
        windows = torch.from_numpy(X_np)
        self.X = windows.transpose(1, 2).contiguous()
        self.y = torch.from_numpy(y_np.astype(np.int64))

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = WindowDataset(X_train, y_train)
val_ds   = WindowDataset(X_val,   y_val)
test_ds  = WindowDataset(X_test,  y_test)

# Data loaders: only the training loader shuffles
loader_kw = {"num_workers": 2, "pin_memory": PIN_MEMORY}
train_loader = DataLoader(train_ds, batch_size=BATCH_TRAIN, shuffle=True,  **loader_kw)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_EVAL,  shuffle=False, **loader_kw)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_EVAL,  shuffle=False, **loader_kw)

# --------------------------------------------------
# Class weights (balanced + softened)
# --------------------------------------------------
classes = np.unique(y_train)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
# The square root compresses the spread between rare and common classes
cw_soft = np.sqrt(cw)

n_classes = int(classes.max()) + 1
# Labels missing from y_train keep weight 1.0
class_weight = torch.ones(n_classes, dtype=torch.float32)
class_weight[torch.from_numpy(classes.astype(np.int64))] = torch.from_numpy(cw_soft.astype(np.float32))
class_weight = class_weight.to(device)

print("Class weights (balanced):", dict(zip(classes.tolist(), cw.tolist())))
print("Class weights (sqrt-soft):", dict(zip(classes.tolist(), cw_soft.tolist())))

# --------------------------------------------------
# CNN building blocks
# --------------------------------------------------
class ConvBlock(nn.Module):
    """Residual block: two Conv1d -> BatchNorm1d -> GELU stages plus a skip path.

    Args:
        in_ch: number of input channels.
        out_ch: number of output channels.
        k: kernel size of both convolutions; must be a positive odd integer so
            that ``padding=k // 2`` keeps the sequence length unchanged.
        p: dropout probability applied between the two convolutions.

    Raises:
        ValueError: if ``k`` is not a positive odd integer. With an even ``k``
            each convolution would lengthen the sequence by one, and the
            residual addition in ``forward`` would fail or broadcast.
    """

    def __init__(self, in_ch, out_ch, k=5, p=0.2):
        super().__init__()
        if k < 1 or k % 2 == 0:
            raise ValueError(f"ConvBlock requires a positive odd kernel size, got k={k}")
        self.net = nn.Sequential(
            nn.Conv1d(in_ch, out_ch, kernel_size=k, padding=k//2),
            nn.BatchNorm1d(out_ch),
            nn.GELU(),
            nn.Dropout(p),
            nn.Conv1d(out_ch, out_ch, kernel_size=k, padding=k//2),
            nn.BatchNorm1d(out_ch),
            nn.GELU(),
        )
        # A 1x1 convolution matches channel counts on the skip path when they differ.
        self.skip = nn.Conv1d(in_ch, out_ch, kernel_size=1) if in_ch != out_ch else nn.Identity()

    def forward(self, x):
        return self.net(x) + self.skip(x)

class StormCNN(nn.Module):
    """1D CNN classifier for channels-first multivariate windows.

    A convolutional stem feeds three ``ConvBlock`` stages (``base`` ->
    ``base`` -> ``2*base`` -> ``2*base`` channels). The time axis is then
    average-pooled to length 1 and a two-layer head produces ``n_classes``
    logits.
    """

    def __init__(self, in_ch, n_classes, base=64, dropout=0.25):
        super().__init__()
        wide = base * 2

        self.stem = nn.Sequential(
            nn.Conv1d(in_ch, base, kernel_size=3, padding=1),
            nn.BatchNorm1d(base),
            nn.GELU(),
        )
        self.b1 = ConvBlock(base, base, k=5, p=dropout)
        self.b2 = ConvBlock(base, wide, k=5, p=dropout)
        self.b3 = ConvBlock(wide, wide, k=3, p=dropout)

        self.pool = nn.AdaptiveAvgPool1d(1)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(wide, base),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(base, n_classes),
        )

    def forward(self, x):
        # Apply the stages strictly in order; the result is the head's logits.
        for stage in (self.stem, self.b1, self.b2, self.b3, self.pool, self.head):
            x = stage(x)
        return x

# --------------------------------------------------
# Model, loss, optimizer
# --------------------------------------------------
# One input channel per feature column stacked into the window tensor.
in_ch = len(COLS)
model = StormCNN(in_ch=in_ch, n_classes=n_classes, base=64, dropout=DROPOUT).to(device)

# Cross-entropy with per-class weights taken from the `class_weight` tensor.
criterion = nn.CrossEntropyLoss(weight=class_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# mode="max" because the training loop steps this scheduler with validation
# macro-F1; the learning rate is multiplied by 0.5 once that metric has not
# improved for longer than patience=3 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="max", factor=0.5, patience=3
)

# --------------------------------------------------
# Evaluation helpers
# --------------------------------------------------
def _predict(model, loader):
    """Run ``model`` over ``loader`` and return ``(y_true, y_pred)`` as NumPy arrays.

    Puts the model in eval mode (it is left in eval mode afterwards) and
    disables gradient tracking. Predictions are the argmax over dim 1 of the
    model output.
    """
    model.eval()
    targets, preds = [], []
    with torch.no_grad():
        for xb, yb in loader:
            scores = model(xb.to(device, non_blocking=True))
            preds.append(scores.argmax(dim=1).cpu().numpy())
            targets.append(yb.numpy())
    return np.concatenate(targets), np.concatenate(preds)

@torch.no_grad()
def eval_f1_and_dist(model, loader):
    """Return ``(macro_f1, histogram)`` for ``model`` on ``loader``.

    ``histogram`` maps each predicted class id to the number of samples
    assigned to it. Classes that were never predicted are absent.
    """
    y_true, y_pred = _predict(model, loader)
    labels, counts = np.unique(y_pred, return_counts=True)
    histogram = {int(label): int(count) for label, count in zip(labels, counts)}
    return f1_score(y_true, y_pred, average="macro"), histogram

# --------------------------------------------------
# Single training epoch
# --------------------------------------------------
def train_one_epoch(model, loader):
    """Train ``model`` for one pass over ``loader`` and return the mean loss per sample.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.
    Gradients are clipped to a global norm of 1.0 before every step.
    """
    model.train()
    weighted_loss_sum = 0.0
    for inputs, targets in loader:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Weight by batch size so a smaller final batch is averaged correctly.
        weighted_loss_sum += batch_loss.item() * inputs.size(0)
    return weighted_loss_sum / len(loader.dataset)

# --------------------------------------------------
# Training loop with early stopping
# --------------------------------------------------
best_f1, best_state, bad = -1.0, None, 0

for ep in range(1, EPOCHS + 1):
    loss = train_one_epoch(model, train_loader)
    val_f1, pred_dist = eval_f1_and_dist(model, val_loader)
    scheduler.step(val_f1)

    lr_now = optimizer.param_groups[0]["lr"]
    print(f"Epoch {ep:02d} | loss={loss:.4f} | val_f1_macro={val_f1:.4f} | lr={lr_now:.2e} | pred_dist={pred_dist}")

    if val_f1 > best_f1 + 1e-4:
        # New best: snapshot the weights on CPU and reset the patience counter.
        best_f1 = val_f1
        best_state = {
            key: tensor.detach().cpu().clone()
            for key, tensor in model.state_dict().items()
        }
        bad = 0
        continue

    bad += 1
    if bad >= PATIENCE:
        print("Early stopping.")
        break

# Restore the best snapshot
if best_state is not None:
    model.load_state_dict(best_state)

# --------------------------------------------------
# Final evaluation
# --------------------------------------------------
def report_split(name, loader):
    """Print macro-F1, the classification report and the confusion matrix for one split.

    Evaluates the module-level ``model`` on ``loader``.
    """
    y_true, y_pred = _predict(model, loader)
    print(f"\n{name}")
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    print("F1-macro:", macro_f1)
    print(classification_report(y_true, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))

for split_name, split_loader in (
    ("TRAIN", train_loader),
    ("VAL",   val_loader),
    ("TEST",  test_loader),
):
    report_split(split_name, split_loader)


Device: cuda
Using cols: ['ws', 'tp', 'mslp', 'surge', 'twl', 'steep', 'temp', 'dwd_sin', 'dwd_cos', 'ws_ma3', 'ws_ma6', 'ws_ma12', 'tp_ma3', 'tp_ma6', 'tp_ma12', 'mslp_ma3', 'mslp_ma6', 'mslp_ma12', 'surge_ma3', 'surge_ma6', 'surge_ma12', 'twl_ma3', 'twl_ma6', 'twl_ma12', 'steep_ma3', 'steep_ma6', 'steep_ma12', 'temp_ma3', 'temp_ma6', 'temp_ma12']
Shapes: 
  X_train: (262968, 36, 30) 
  X_val:   (43824, 36, 30) 
  X_test:  (43813, 36, 30)
y_train unique: [0 1 2 3 4] dtype: int8
Dropping windows due to NaN/inf: train=11, val=0, test=0
Class weights (balanced): {0: 0.27557127511855173, 1: 1.1131868597070527, 2: 2.8230930270009127, 3: 11.151696352841391, 4: 34.53145108338805}
Class weights (sqrt-soft): {0: 0.5249488309526479, 1: 1.055076707972957, 2: 1.6802062453761184, 3: 3.339415570551439, 4: 5.876346746354239}
Epoch 01 | loss=0.2770 | val_f1_macro=0.8900 | lr=5.00e-04 | pred_dist={0: 31726, 1: 7106, 2: 3979, 3: 657, 4: 356}
Epoch 02 | loss=0.1839 | val_f1_macro=0.8234 | lr=5.00e-04 | 