In [None]:
# Colab only: expose Google Drive on the local filesystem so the dataset
# referenced further down can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import numpy as np
import pandas as pd
import numpy.lib.stride_tricks as st

# --------------------------------------------------
# Paths and basic configuration
# --------------------------------------------------
# Location of the pre-processed dataset on the mounted drive.
PATH = "/content/drive/MyDrive/Datamining-TSC-Project/new_processed_data.parquet"

# Feature-engineering settings. Durations are in hours, per the notebook's
# own annotations (assumes one row per hour -- TODO confirm).
# NOTE(review): "time_col" is not read by the loading code visible in this
# file, which refers to the "time" column directly.
CFG = {
    # Name of the timestamp column.
    "time_col": "time",
    # Sliding window length.
    "window": 36,
    # Number of most recent samples inspected by the trend checks.
    "trend_h": 12,
    # Moving-average window lengths.
    "ma_hours": [3, 6, 12],
}

# --------------------------------------------------
# Angle utility functions
# --------------------------------------------------
def wrap360(x):
    """Wrap an angle in degrees (scalar or array) into [0, 360)."""
    full_turn = 360.0
    once = x % full_turn
    # The second pass folds a first-pass result that rounded up to exactly
    # 360.0 (possible for tiny negative inputs) back to 0.
    return (once + full_turn) % full_turn

def angle_diff_deg(a, b):
    """Return the signed difference ``a - b`` in degrees, wrapped to [-180, 180)."""
    half_turn = 180.0
    raw = a - b
    # Shift by half a turn, wrap into [0, 360), then shift back.
    return (raw + half_turn) % 360.0 - half_turn

def wave_dir_convert(old_wave_dir):
    """Map a wave direction ``d`` (degrees) to ``270 - d``, wrapped to [0, 360).

    The notebook uses this to bring wave direction into the same convention
    as wind direction before the two are compared.
    """
    mirrored = 270.0 - old_wave_dir
    return wrap360(mirrored)

# --------------------------------------------------
# Moving average feature generator
# --------------------------------------------------
def add_moving_averages(df, cols, ma_hours):
    """Append trailing rolling-mean columns to ``df`` in place and return it.

    For every column in ``cols`` and every window length ``h`` in
    ``ma_hours`` a column named ``"<col>_ma<h>"`` is written. A full window
    is required (``min_periods`` equals the window length), so the first
    ``h - 1`` rows of each new column are NaN.
    """
    for source_name in cols:
        series = df[source_name]
        for span in ma_hours:
            roller = series.rolling(window=span, min_periods=span)
            df[f"{source_name}_ma{span}"] = roller.mean()
    return df

# --------------------------------------------------
# Load and clean data
# --------------------------------------------------
# Read the parquet file, parse the timestamps, then order rows
# chronologically with one row per timestamp (first occurrence kept) and a
# fresh 0..n-1 index.
df = (
    pd.read_parquet(PATH)
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .sort_values(by="time")
    .drop_duplicates(subset="time")
    .reset_index(drop=True)
)

# Keep only relevant columns
# Raw column name -> short name used by the rest of the notebook. The same
# mapping drives both the column selection and the renaming, so the two
# cannot drift apart.
COLUMN_ALIASES = {
    "Wind speed": "ws",
    "Wind Direction": "wd",
    "Wave Period": "tp",
    "Wave Direction": "wdir",
    "Wave Height": "hs",
    "Wave Power": "pwr",
    "Pressure": "mslp",
    "temperature": "temp",
    "Surge Height": "surge",
    "Total Water Level": "twl",
    "Wave Steepness": "steep",
}

# Keep the timestamp plus the mapped columns (in mapping order); copy so that
# later column assignments act on an independent frame.
df = df[["time", *COLUMN_ALIASES]].copy()
df.columns = ["time", *COLUMN_ALIASES.values()]

# --------------------------------------------------
# Wind-wave direction alignment features
# --------------------------------------------------
# Wave direction goes through wave_dir_convert before the comparison; the
# converted values are only needed as an array because the raw direction
# columns are removed at the end of this section.
wd = df["wd"].to_numpy(np.float32)
wdir = wave_dir_convert(df["wdir"].to_numpy(np.float32))

# Signed wind-minus-wave angle, in degrees and in radians
dwd_deg = angle_diff_deg(wd, wdir).astype(np.float32)
dwd_rad = np.deg2rad(dwd_deg).astype(np.float32)

# Store the angle as its sine and cosine so the feature has no jump at the
# +/-180 degree wrap-around
df = df.assign(
    dwd_sin=np.sin(dwd_rad).astype(np.float32),
    dwd_cos=np.cos(dwd_rad).astype(np.float32),
)

# The raw directions are not kept as features
df = df.drop(columns=["wd", "wdir"])

# --------------------------------------------------
# Add moving average features
# --------------------------------------------------
# Columns that receive one rolling-mean column per CFG["ma_hours"] entry.
ma_cols = [
    "hs",
    "ws",
    "pwr",
    "mslp",
    "temp",
    "surge",
    "twl",
    "steep",
    "tp",
]
df = add_moving_averages(df, cols=ma_cols, ma_hours=CFG["ma_hours"])

# --------------------------------------------------
# Sliding window statistics
# --------------------------------------------------
W = CFG["window"]
H = CFG["trend_h"]

# Pull the series used for labelling out of the frame as float32 arrays.
hs, pwr, mslp, ws, dwd_cos = (
    df[column].to_numpy(np.float32)
    for column in ("hs", "pwr", "mslp", "ws", "dwd_cos")
)

# Row i of each *_w array is a view of the W consecutive samples that start
# at sample i.
hs_w, pwr_w, mslp_w, ws_w, dwd_cos_w = (
    st.sliding_window_view(series, W)
    for series in (hs, pwr, mslp, ws, dwd_cos)
)


def _mean_plus_two_std(windows):
    # Per-window severity score: mean + 2 * standard deviation (numpy's
    # default ddof=0).
    return windows.mean(axis=1) + 2.0 * windows.std(axis=1)


hs_metric36 = _mean_plus_two_std(hs_w)
pwr_metric36 = _mean_plus_two_std(pwr_w)

# --------------------------------------------------
# Train / validation / test split by time
# --------------------------------------------------
# Window i covers rows i .. i + W - 1, so it has both a first and a last
# timestamp. A window is assigned to a split only when it lies entirely
# inside that split's period. Windows that straddle a boundary are left out
# of every split; otherwise a training window starting just before the
# boundary would contain validation-period samples and overlap validation
# windows (likewise for validation/test).
TRAIN_END = np.datetime64("2015-01-01")
VAL_END = np.datetime64("2020-01-01")

n_windows = len(hs_metric36)
start_times = df["time"].iloc[:n_windows].to_numpy()
end_times = df["time"].iloc[W - 1:W - 1 + n_windows].to_numpy()

train_mask = end_times < TRAIN_END
val_mask = (start_times >= TRAIN_END) & (end_times < VAL_END)
test_mask = start_times >= VAL_END

# --------------------------------------------------
# Percentile-based severity thresholds (train only)
# --------------------------------------------------
# Percentile levels that separate severity classes 0|1|2|3|4. The cut points
# are estimated from the training windows only.
_SEVERITY_PCTS = [75, 92, 98, 99.5]

hs_p75, hs_p92, hs_p98, hs_p995 = np.percentile(
    hs_metric36[train_mask], _SEVERITY_PCTS
)
pwr_p75, pwr_p92, pwr_p98, pwr_p995 = np.percentile(
    pwr_metric36[train_mask], _SEVERITY_PCTS
)

# Map continuous values to severity classes
def severity(x, p75, p92, p98, p995):
    """Bin the array ``x`` into severity classes 0-4 using four cut points.

    Class k (1-3) is the half-open band between consecutive cut points,
    class 4 is everything at or above ``p995``, and values matching no band
    (including NaN) are class 0. The result is an int8 array.
    """
    # Listed from most to least severe; np.select takes the first match.
    in_band = [
        x >= p995,
        (x >= p98) & (x < p995),
        (x >= p92) & (x < p98),
        (x >= p75) & (x < p92),
    ]
    return np.select(in_band, [4, 3, 2, 1], default=0).astype(np.int8)

sev_hs = severity(hs_metric36, hs_p75, hs_p92, hs_p98, hs_p995)
sev_pwr = severity(pwr_metric36, pwr_p75, pwr_p92, pwr_p98, pwr_p995)

# Base severity: a window is as severe as the worse of its two scores.
base = np.maximum(sev_hs, sev_pwr)

# --------------------------------------------------
# Trend indicators over the last H samples of each window
# (thresholds estimated from training-period rows only)
# --------------------------------------------------
train_hours_mask = df["time"] < "2015-01-01"

hs_th, pwr_th, ws_th = (
    np.percentile(series[train_hours_mask], 92) for series in (hs, pwr, ws)
)
mslp_th = np.percentile(mslp[train_hours_mask], 20)
align_th = np.percentile(dwd_cos[train_hours_mask], 75)

# Each flag is True when its condition held for at least the stated number
# of the last H samples of the window; cnt is how many flags are raised.
trend_flags = (
    (hs_w[:, -H:] >= hs_th).sum(axis=1) >= 6,
    (pwr_w[:, -H:] >= pwr_th).sum(axis=1) >= 6,
    (ws_w[:, -H:] >= ws_th).sum(axis=1) >= 6,
    (mslp_w[:, -H:] <= mslp_th).sum(axis=1) >= 6,
    (dwd_cos_w[:, -H:] >= align_th).sum(axis=1) >= 4,
)
cnt = sum(flag.astype(int) for flag in trend_flags)

# Final labels are the base severity; cnt is computed for later use and is
# not folded into y here.
y = base.copy()

# --------------------------------------------------
# Sanity checks
# --------------------------------------------------
print("Columns:", df.columns.tolist())
print("Class dist:")
# Relative frequency of each label within every split, ordered by label.
split_masks = {"train": train_mask, "val": val_mask, "test": test_mask}
for name, m in split_masks.items():
    class_share = pd.Series(y[m]).value_counts(normalize=True).sort_index()
    print(name, class_share.to_dict())


CNN WITHOUT TEMPERATURE AND MOVING AVERAGES (MAs)

In [None]:
import numpy as np
import pandas as pd
import numpy.lib.stride_tricks as st

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# --------------------------------------------------
# Device setup (GPU if available)
# --------------------------------------------------
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# --------------------------------------------------
# Feature list and window length
# --------------------------------------------------
# Input channels for this model variant: no temperature and no
# moving-average columns.
COLS = ["ws", "tp", "mslp", "dwd_sin", "dwd_cos"]
W = int(W)  # window length carried over from the feature-engineering cell

# --------------------------------------------------
# Training hyperparameters
# --------------------------------------------------
# Batching and schedule
BATCH_TRAIN = 256
BATCH_EVAL = 1024
EPOCHS = 50
PATIENCE = 8  # epochs without validation improvement before stopping

# Optimisation and regularisation
LR = 5e-4
WEIGHT_DECAY = 1e-3
DROPOUT = 0.25

# Pinned host memory is only requested when batches are copied to a GPU.
PIN_MEMORY = device == "cuda"

# --------------------------------------------------
# Build sliding windows: (num_windows, W, num_features)
# --------------------------------------------------
arrs = [df[feature].to_numpy(np.float32) for feature in COLS]
X_list = [st.sliding_window_view(series, window_shape=W) for series in arrs]
# Stack the per-feature (num_windows, W) views along a new last axis.
X = np.stack(X_list, axis=-1).astype(np.float32)

# Trim labels and split masks to the number of windows in X.
n_win = len(X)
y = np.asarray(y)[:n_win]
train_mask, val_mask, test_mask = (
    np.asarray(mask)[:n_win] for mask in (train_mask, val_mask, test_mask)
)

# Partition windows and labels with the same masks.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (X[mask], y[mask]) for mask in (train_mask, val_mask, test_mask)
)

print(
    "Shapes:",
    "\n  X_train:", X_train.shape,
    "\n  X_val:  ", X_val.shape,
    "\n  X_test: ", X_test.shape,
)
print("y_train unique:", np.unique(y_train), "dtype:", y_train.dtype)

# --------------------------------------------------
# Normalize using TRAIN statistics only (no leakage)
# --------------------------------------------------
# Per-feature mean/std pooled over all training windows and time steps.
# keepdims leaves shape (1, 1, num_features) so they broadcast over (N, W, F).
_pooled_axes = (0, 1)
mu = X_train.mean(axis=_pooled_axes, keepdims=True)
std = X_train.std(axis=_pooled_axes, keepdims=True) + 1e-6  # avoid divide-by-zero

# The same training statistics are applied to every split.
X_train, X_val, X_test = (
    (split - mu) / std for split in (X_train, X_val, X_test)
)

# --------------------------------------------------
# Dataset wrapper (Conv1D expects channels-first: [B, C, T])
# --------------------------------------------------
class WindowDataset(Dataset):
    """In-memory dataset of fixed-length windows stored channels-first.

    ``X_np`` is indexed as (sample, time, feature) and is rearranged to
    (sample, feature, time), the layout ``nn.Conv1d`` consumes. ``y_np``
    holds one integer label per sample and is stored as int64.
    """

    def __init__(self, X_np, y_np):
        windows = torch.from_numpy(X_np)
        # Swap the time and feature axes, then materialise the new layout.
        self.X = windows.transpose(1, 2).contiguous()
        self.y = torch.tensor(y_np, dtype=torch.long)

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

train_ds, val_ds, test_ds = (
    WindowDataset(features, labels)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Options shared by all loaders; only the training loader reshuffles.
_loader_opts = {"num_workers": 2, "pin_memory": PIN_MEMORY}
train_loader = DataLoader(train_ds, batch_size=BATCH_TRAIN, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_ds, batch_size=BATCH_EVAL, shuffle=False, **_loader_opts)
test_loader = DataLoader(test_ds, batch_size=BATCH_EVAL, shuffle=False, **_loader_opts)

# --------------------------------------------------
# Class weights for imbalanced data (softened a bit with sqrt)
# --------------------------------------------------
classes = np.unique(y_train)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# The square root keeps the ordering of the weights but pulls them toward 1.
cw_soft = np.sqrt(cw)

# The weight vector is sized by the largest label seen in training; labels
# that never occur in y_train keep the neutral weight 1.
n_classes = int(classes.max()) + 1

class_weight = torch.ones(n_classes, dtype=torch.float32)
class_weight[torch.as_tensor(classes, dtype=torch.long)] = torch.as_tensor(
    cw_soft, dtype=torch.float32
)
class_weight = class_weight.to(device)

balanced_by_class = {int(label): float(weight) for label, weight in zip(classes, cw)}
soft_by_class = {int(label): float(weight) for label, weight in zip(classes, cw_soft)}
print("Class weights (balanced):", balanced_by_class)
print("Class weights (sqrt-soft):", soft_by_class)

# --------------------------------------------------
# Small CNN building block with a residual (skip) path
# --------------------------------------------------
class ConvBlock(nn.Module):
    """Two Conv1d -> BatchNorm1d -> GELU stages plus a shortcut connection.

    Dropout with probability ``p`` sits between the two stages. The shortcut
    is the identity when ``in_ch == out_ch`` and a kernel-size-1 convolution
    otherwise, so it can be added to the main branch.
    """

    def __init__(self, in_ch, out_ch, k=5, p=0.2):
        super().__init__()
        same_pad = k // 2  # keeps the sequence length unchanged for odd k
        layers = [
            nn.Conv1d(in_ch, out_ch, kernel_size=k, padding=same_pad),
            nn.BatchNorm1d(out_ch),
            nn.GELU(),
            nn.Dropout(p),
            nn.Conv1d(out_ch, out_ch, kernel_size=k, padding=same_pad),
            nn.BatchNorm1d(out_ch),
            nn.GELU(),
        ]
        self.net = nn.Sequential(*layers)

        if in_ch == out_ch:
            self.skip = nn.Identity()
        else:
            # Project the input to out_ch channels so the sum is well-defined.
            self.skip = nn.Conv1d(in_ch, out_ch, kernel_size=1)

    def forward(self, x):
        main = self.net(x)
        shortcut = self.skip(x)
        return main + shortcut

# --------------------------------------------------
# Main 1D CNN model for time-series classification
# --------------------------------------------------
class StormCNN(nn.Module):
    """1D CNN classifier: stem conv, three ConvBlocks, global pooling, MLP head.

    Input is channels-first, ``(batch, in_ch, time)``; output is one row of
    ``n_classes`` logits per sample. ``base`` sets the channel width (the
    last two blocks use ``2 * base``) and ``dropout`` is used both inside
    the blocks and in the head.
    """

    def __init__(self, in_ch, n_classes, base=64, dropout=0.25):
        super().__init__()
        wide = base * 2

        self.stem = nn.Sequential(
            nn.Conv1d(in_ch, base, kernel_size=3, padding=1),
            nn.BatchNorm1d(base),
            nn.GELU(),
        )

        self.b1 = ConvBlock(base, base, k=5, p=dropout)
        self.b2 = ConvBlock(base, wide, k=5, p=dropout)
        self.b3 = ConvBlock(wide, wide, k=3, p=dropout)

        # Averaging over time yields a fixed-size vector per sample.
        self.pool = nn.AdaptiveAvgPool1d(1)

        # Classifier head: flatten -> hidden layer -> logits.
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(wide, base),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(base, n_classes),
        )

    def forward(self, x):
        for stage in (self.stem, self.b1, self.b2, self.b3, self.pool):
            x = stage(x)
        return self.head(x)

# Build model
in_ch = len(COLS)
model = StormCNN(in_ch=in_ch, n_classes=n_classes, base=64, dropout=DROPOUT)
model = model.to(device)

# Class-weighted cross-entropy so that rarer classes contribute more.
criterion = nn.CrossEntropyLoss(weight=class_weight)

optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)

# Halve the learning rate after 3 epochs without a better validation score
# (mode="max" because the monitored macro-F1 should increase).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode="max",
    factor=0.5,
    patience=3,
)

# --------------------------------------------------
# Prediction helper (no gradients)
# --------------------------------------------------
@torch.no_grad()
def _predict(model, loader):
    """Run ``model`` over ``loader`` and return ``(y_true, y_pred)`` arrays.

    The model is switched to eval mode and left in it. Inputs are moved to
    the module-level ``device``; the predicted class is the argmax over
    dimension 1 of the model output.
    """
    model.eval()
    true_chunks = []
    pred_chunks = []
    for features, labels in loader:
        features = features.to(device, non_blocking=True)
        predicted = model(features).argmax(dim=1)
        true_chunks.append(labels.numpy())
        pred_chunks.append(predicted.cpu().numpy())
    return np.concatenate(true_chunks), np.concatenate(pred_chunks)

# Quick eval: macro-F1 + predicted class distribution
@torch.no_grad()
def eval_f1_and_dist(model, loader):
    """Return ``(macro_f1, pred_dist)`` for ``model`` on ``loader``.

    ``pred_dist`` maps each predicted class to the number of samples that
    were assigned to it.
    """
    y_true, y_pred = _predict(model, loader)
    labels_seen, counts = np.unique(y_pred, return_counts=True)
    pred_dist = {int(label): int(count) for label, count in zip(labels_seen, counts)}
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return macro_f1, pred_dist

# --------------------------------------------------
# One epoch of training
# --------------------------------------------------
def train_one_epoch(model, loader):
    """Train ``model`` for one pass over ``loader``; return the average loss.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``. The
    returned value is the sum of batch losses, each scaled by its batch
    size, divided by ``len(loader.dataset)``.
    """
    model.train()
    running_loss = 0.0
    for inputs, targets in loader:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()

        # Cap the global gradient norm at 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += batch_loss.item() * inputs.size(0)
    return running_loss / len(loader.dataset)

# --------------------------------------------------
# Training loop + early stopping on validation macro-F1
# --------------------------------------------------
best_f1 = -1.0
best_state = None
bad = 0  # consecutive epochs without a meaningful improvement

for ep in range(1, EPOCHS + 1):
    loss = train_one_epoch(model, train_loader)
    val_f1, pred_dist = eval_f1_and_dist(model, val_loader)
    scheduler.step(val_f1)

    lr_now = optimizer.param_groups[0]["lr"]
    print(f"Epoch {ep:02d} | loss={loss:.4f} | val_f1_macro={val_f1:.4f} | lr={lr_now:.2e} | pred_dist={pred_dist}")

    # An epoch counts as an improvement only if it beats the best score by
    # more than 1e-4.
    improved = val_f1 > best_f1 + 1e-4
    if improved:
        best_f1, bad = val_f1, 0
        # Snapshot the weights on the CPU so later updates cannot alter them.
        best_state = {
            key: tensor.detach().cpu().clone()
            for key, tensor in model.state_dict().items()
        }
        continue

    bad += 1
    if bad >= PATIENCE:
        print("Early stopping.")
        break

# Restore the best checkpoint, if any epoch produced one.
if best_state is not None:
    model.load_state_dict(best_state)

# --------------------------------------------------
# Final reports (train/val/test)
# --------------------------------------------------
def report_split(name, loader):
    """Print macro-F1, the per-class report and the confusion matrix.

    Predictions come from the module-level ``model`` evaluated on ``loader``;
    ``name`` is printed as the heading of the report.
    """
    y_true, y_pred = _predict(model, loader)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    per_class = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    print(f"\n{name}")
    print("F1-macro:", macro_f1)
    print(per_class)
    print("Confusion matrix:")
    print(matrix)

# Report every split with the restored best weights.
for split_name, split_loader in (
    ("TRAIN", train_loader),
    ("VAL", val_loader),
    ("TEST", test_loader),
):
    report_split(split_name, split_loader)
