In [1]:
# Cell 0 — Fix environment (pin compatible versions) and restart the kernel
# Run this ONCE. It will restart the Python kernel so the pins take effect.

!pip -q install --upgrade --force-reinstall \
  "numpy==1.26.4" "scipy==1.10.1" \
  "protobuf==4.25.3" "mediapipe==0.10.14" "decord==0.6.0" \
  timm opencv-python-headless scikit-learn pandas joblib

import IPython
IPython.Application.instance().kernel.do_shutdown(restart=True)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
dask-cudf-cu12 25.2.2 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
cudf-cu12 25.2.2 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
datasets 4.4.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.3 which is incompatible.
mne 1.10.2 requires scipy>=1.11, but you have scipy 1.10.1 which is incompatible.
ydata-profiling 4.17.0 requires matplotlib<=3.10,>=3.5, but you have matplotlib 3.10.7 which is incompatible.
s3fs 2025.3.0 requires fsspec==2025.3.0.*, but you have fsspec 2025.10.0 which is incompatible.
a2a-sdk 0.3.10 requires protobuf>

{'status': 'ok', 'restart': True}

In [1]:
# Cell 1 — Imports and quiet setup (run this AFTER the kernel restarts)

import os, warnings
import numpy as np
import pandas as pd
from pathlib import Path
from joblib import Parallel, delayed
from decord import VideoReader, cpu

# Silence native logs
os.environ["TF_CPP_MIN_LOG_LEVEL"]  = "3"
os.environ["GLOG_minloglevel"]      = "3"
os.environ["GLOG_logtostderr"]      = "1"
os.environ["MEDIAPIPE_DISABLE_GPU"] = "1"
warnings.filterwarnings("ignore")

# Use stable MP solutions path (avoids tasks/audio import issues)
from mediapipe.python.solutions import holistic as mp_holistic
from mediapipe.python.solutions.pose import PoseLandmark as PL

from absl import logging as absl_logging
absl_logging.set_verbosity(absl_logging.FATAL)

# SciPy (now compatible with pinned NumPy)
from scipy.interpolate import interp1d
from scipy.ndimage import uniform_filter1d

# For silencing native library stderr/stdout in MP calls
from contextlib import contextmanager
@contextmanager
def suppress_output_fd():
    devnull = open(os.devnull, 'w')
    old_out, old_err = os.dup(1), os.dup(2)
    os.dup2(devnull.fileno(), 1)
    os.dup2(devnull.fileno(), 2)
    try:
        yield
    finally:
        os.dup2(old_out, 1); os.dup2(old_err, 2)
        os.close(old_out); os.close(old_err); devnull.close()

E0000 00:00:1762766444.846782     188 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762766444.904698     188 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Cell 2 — Config and manifests
MANIFEST_FULL = "/kaggle/input/manifest/manifest_isl_adjectives.csv"        # train from here
MANIFEST_EVAL = "/kaggle/input/manifest/manifest_isl_adjectives_eval.csv"   # val/test from here

OUT_DIR       = Path("/kaggle/working/isl_keypoints")

# Preprocessing (Step 8)
INCLUDE_FACE = False
FPS_TARGET   = 25
T_TARGET     = 60
SMOOTH_WIN   = 5
ADD_VELOCITY = True
N_JOBS       = 2

# Rendering/Training
SIZE         = 224
POSE_STRIDE  = 2
BATCH        = 4

assert Path(MANIFEST_FULL).exists(), "Missing manifest_isl_adjectives.csv"
assert Path(MANIFEST_EVAL).exists(), "Missing manifest_isl_adjectives_eval.csv"

In [3]:
# Cell 3 — Load manifests (train from FULL; val/test from EVAL)
df_full = pd.read_csv(MANIFEST_FULL)
df_eval = pd.read_csv(MANIFEST_EVAL)

df_train = df_full[df_full.split == "train"][['path','label','split']].reset_index(drop=True)
df_val   = df_eval[df_eval.split == "val"][['path','label','split']].reset_index(drop=True)
df_test  = df_eval[df_eval.split == "test"][['path','label','split']].reset_index(drop=True)

assert set(df_train.path).isdisjoint(set(df_val.path))
assert set(df_train.path).isdisjoint(set(df_test.path))
assert set(df_val.path).isdisjoint(set(df_test.path))

df_all = pd.concat([df_train, df_val, df_test], ignore_index=True).drop_duplicates('path')
print("Counts | train:", len(df_train), "val:", len(df_val), "test:", len(df_test))

Counts | train: 765 val: 318 test: 309


In [4]:
# Cell 4 — Step 8: preprocess videos -> landmarks -> normalized sequences -> NPZ
def sample_frames(path, fps_target=25):
    vr = VideoReader(path, ctx=cpu(0))
    n = len(vr)
    fps_src = float(vr.get_avg_fps()) if hasattr(vr, "get_avg_fps") else fps_target
    step = max(1, int(round(fps_src / fps_target)))
    idxs = np.arange(0, n, step, dtype=int)
    if len(idxs) == 0:
        idxs = np.array([0], dtype=int)
    return vr.get_batch(idxs).asnumpy()  # (T,H,W,C) RGB

L_SH, R_SH = int(PL.LEFT_SHOULDER), int(PL.RIGHT_SHOULDER)

def lm_to_np(lms, count):
    if lms is None:
        return np.full((count, 3), np.nan, dtype=np.float32)
    return np.array([[lm.x, lm.y, lm.z] for lm in lms.landmark], dtype=np.float32)

def extract_landmarks(frames, include_face=False):
    T = frames.shape[0]
    pose_list, lh_list, rh_list, face_list = [], [], [], []
    with suppress_output_fd():
        with mp_holistic.Holistic(static_image_mode=False, model_complexity=1,
                                  smooth_landmarks=True, enable_segmentation=False) as holo:
            for t in range(T):
                res   = holo.process(frames[t])
                pose  = lm_to_np(res.pose_landmarks, 33)
                lhand = lm_to_np(res.left_hand_landmarks, 21)
                rhand = lm_to_np(res.right_hand_landmarks, 21)
                face  = lm_to_np(res.face_landmarks, 468) if include_face else np.zeros((0,3), np.float32)
                pose_list.append(pose); lh_list.append(lhand); rh_list.append(rhand); face_list.append(face)
    pose = np.stack(pose_list, axis=0); lh = np.stack(lh_list, axis=0); rh = np.stack(rh_list, axis=0); face = np.stack(face_list, axis=0)
    pose[..., :2] = np.clip(pose[..., :2], 0.0, 1.0)
    lh[...,   :2] = np.clip(lh[...,   :2], 0.0, 1.0)
    rh[...,   :2] = np.clip(rh[...,   :2], 0.0, 1.0)
    if include_face:
        face[..., :2] = np.clip(face[..., :2], 0.0, 1.0)
    return pose, lh, rh, face

def normalize_geometry(pose, lh, rh, face):
    lsh = pose[:, L_SH, :2]; rsh = pose[:, R_SH, :2]
    center = (lsh + rsh) / 2.0
    scale  = np.linalg.norm(lsh - rsh, axis=1, keepdims=True)
    scale[scale < 1e-6] = 1.0
    def center_scale(arr):
        if arr.shape[1] == 0: return arr
        return np.concatenate([(arr[..., :2] - center[:,None,:]) / scale[:,None,:], arr[..., 2:3]], axis=-1)
    pose_g = center_scale(pose)
    lwrist = lh[:, 0:1, :2]; rwrist = rh[:, 0:1, :2]
    lh_rel = np.concatenate([(lh[..., :2] - lwrist) / scale[:,None,:], lh[..., 2:3]], axis=-1)
    rh_rel = np.concatenate([(rh[..., :2] - rwrist) / scale[:,None,:], rh[..., 2:3]], axis=-1)
    face_g = center_scale(face) if face.shape[1] > 0 else face
    return pose_g, lh_rel, rh_rel, face_g

def interpolate_nans(seq):
    T,J,C = seq.shape
    flat = seq.reshape(T, -1)
    flat_i = pd.DataFrame(flat).interpolate(method="linear", limit_direction="both", axis=0).values
    return flat_i.reshape(T, J, C)

def smooth(seq, win=5):
    if win <= 1: return seq
    return uniform_filter1d(seq, size=win, axis=0, mode="nearest")

def resample_to_T(seq, T_out):
    T_in = seq.shape[0]
    if T_in == T_out: return seq
    x_in  = np.arange(T_in)
    x_out = np.linspace(0, T_in - 1, T_out)
    f = interp1d(x_in, seq, kind="linear", axis=0, fill_value="extrapolate", assume_sorted=True)
    return f(x_out)

def process_video(path):
    frames = sample_frames(path, fps_target=FPS_TARGET)
    pose, lh, rh, face = extract_landmarks(frames, INCLUDE_FACE)
    pose, lh, rh, face = normalize_geometry(pose, lh, rh, face)
    seq = np.concatenate([pose, lh, rh, face], axis=1)
    seq = interpolate_nans(seq)
    seq = smooth(seq, win=SMOOTH_WIN)
    seq = resample_to_T(seq, T_out=T_TARGET)
    X = seq.reshape(T_TARGET, -1)
    if ADD_VELOCITY:
        V = np.diff(seq, axis=0, prepend=seq[0:1]).reshape(T_TARGET, -1)
        X = np.concatenate([X, V], axis=1)
    return X.astype(np.float32)

OUT_DIR.mkdir(parents=True, exist_ok=True)

def save_npz(row):
    X = process_video(row['path'])
    out_path = OUT_DIR / row['split'] / row['label'] / (Path(row['path']).stem + ".npz")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    np.savez_compressed(out_path, X=X, label=row['label'], path=row['path'])
    return str(out_path)

records = df_all[['path','label','split']].to_dict('records')
out_paths = Parallel(n_jobs=N_JOBS, prefer="threads")(delayed(save_npz)(r) for r in records)

pd.DataFrame({"npz_path": out_paths, "label": df_all['label'], "split": df_all['split']}).to_csv(OUT_DIR / "index_all.csv", index=False)
df_train.assign(npz_path=[OUT_DIR / "train" / l / (Path(p).stem + ".npz") for p,l in zip(df_train['path'], df_train['label'])]).to_csv(OUT_DIR / "index_train.csv", index=False)
df_val.assign(  npz_path=[OUT_DIR / "val"   / l / (Path(p).stem + ".npz") for p,l in zip(df_val['path'],   df_val['label'])]).to_csv(OUT_DIR / "index_val.csv",   index=False)
df_test.assign( npz_path=[OUT_DIR / "test"  / l / (Path(p).stem + ".npz") for p,l in zip(df_test['path'],  df_test['label'])]).to_csv(OUT_DIR / "index_test.csv",  index=False)

print("Step 8 done. NPZ written to:", OUT_DIR)

Step 8 done. NPZ written to: /kaggle/working/isl_keypoints


In [5]:
# Cell 5 — Offline augmentation (TRAIN only) -> index_train_plus_aug.csv
ROOT = OUT_DIR
TRAIN_INDEX = ROOT / "index_train.csv"
AUG_COPIES  = 1

def resample_to_T_local(seq, T_out):
    T_in = seq.shape[0]
    if T_in == T_out: return seq
    x_in  = np.arange(T_in)
    x_out = np.linspace(0, T_in - 1, T_out)
    f = interp1d(x_in, seq, kind="linear", axis=0, fill_value="extrapolate", assume_sorted=True)
    return f(x_out)

def split_coords_vel(X):
    T, F = X.shape
    if F % 6 == 0:
        had_vel = True; Fc = F // 2
    else:
        had_vel = False; Fc = F
    J = Fc // 3
    coords = X[:, :Fc].reshape(T, J, 3)
    return coords, had_vel, J

def reassemble(coords, had_vel):
    T, J, _ = coords.shape
    Xc = coords.reshape(T, -1)
    if had_vel:
        V = np.diff(coords, axis=0, prepend=coords[0:1]).reshape(T, -1)
        return np.concatenate([Xc, V], axis=1).astype(np.float32)
    return Xc.astype(np.float32)

def augment_once(X):
    coords, had_vel, J = split_coords_vel(X)
    if np.random.rand() < 0.5:
        deg = np.random.uniform(-8, 8); s = np.random.uniform(0.95, 1.05)
        th = np.deg2rad(deg)
        R = np.array([[np.cos(th), -np.sin(th)], [np.sin(th), np.cos(th)]], dtype=np.float32)
        T = coords.shape[0]
        xy = coords[...,:2].reshape(T*J, 2) @ R.T
        coords[...,:2] = (xy.reshape(T, J, 2) * s)
    if np.random.rand() < 0.2:
        k = max(1, int(round(J * 0.05)))
        idx = np.random.choice(J, size=k, replace=False)
        coords[:, idx, :] = 0.0
    if np.random.rand() < 0.3:
        T = coords.shape[0]
        w = max(1, int(round(T * np.random.uniform(0.05, 0.10))))
        st = np.random.randint(0, max(1, T - w + 1))
        coords[st:st+w, :, :] = 0.0
    if np.random.rand() < 0.7:
        coords += np.random.normal(0.0, 0.01, size=coords.shape).astype(np.float32)
    coords = resample_to_T_local(coords, T_TARGET)
    return reassemble(coords, had_vel)

train_df = pd.read_csv(TRAIN_INDEX)
base = train_df[['npz_path','label']].reset_index(drop=True)
aug_df = pd.DataFrame(np.repeat(base.values, AUG_COPIES, axis=0), columns=base.columns)

def aug_and_save(npz_path, label):
    data = np.load(npz_path)
    X = data['X'].astype(np.float32)
    X_aug = augment_once(X)
    src = Path(npz_path)
    out = src.with_name(src.stem + f"_aug{np.random.randint(1_000_000):06d}.npz")
    out.parent.mkdir(parents=True, exist_ok=True)
    np.savez_compressed(out, X=X_aug, label=label, path=str(src))
    return str(out)

aug_paths = Parallel(n_jobs=2, prefer="threads")(delayed(aug_and_save)(row['npz_path'], row['label']) for _, row in aug_df.iterrows())

train_plus = train_df.copy()
aug_index = pd.DataFrame({"path": train_df['path'].iloc[0:len(aug_paths)].values,
                          "label": aug_df['label'].values,
                          "split": "train",
                          "npz_path": aug_paths})
train_plus_aug = pd.concat([train_plus, aug_index], ignore_index=True)
train_plus_aug.to_csv(ROOT / "index_train_plus_aug.csv", index=False)

print("Augmentation done. Train index:", ROOT / "index_train_plus_aug.csv")

Augmentation done. Train index: /kaggle/working/isl_keypoints/index_train_plus_aug.csv


In [6]:
# Cell 6 — Two-stream dataset (landmarks + pose-video rendering)
import cv2
import torch
from torch.utils.data import Dataset, DataLoader

IM_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1,1,3)
IM_STD  = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1,1,3)

POSE_J, LH_J, RH_J = 33, 21, 21
LH_START, RH_START = POSE_J, POSE_J + LH_J
POSE_EDGES = [(11,13),(13,15),(12,14),(14,16),(11,12),(11,23),(12,24),(23,24)]
HAND_EDGES = [(0,1),(1,2),(2,3),(3,4),
              (0,5),(5,6),(6,7),(7,8),
              (0,9),(9,10),(10,11),(11,12),
              (0,13),(13,14),(14,15),(15,16),
              (0,17),(17,18),(18,19),(19,20)]

def split_coords_from_X(X):
    T, F = X.shape
    Fc = F // 2 if F % 6 == 0 else F
    J = Fc // 3
    coords = X[:, :Fc].reshape(T, J, 3)
    return coords, J

def normalize_image(img):
    arr = img.astype(np.float32) / 255.0
    return (arr - IM_MEAN) / IM_STD

def render_skeleton_seq(coords, size=224, stride=2, thickness=2):
    T, J, _ = coords.shape
    xy = coords[..., :2]
    mask = (np.abs(xy).sum(axis=-1) > 0)
    valid_xy = xy[mask]
    if valid_xy.size == 0:
        valid_xy = np.array([[0.0, 0.0]], dtype=np.float32)
    min_xy = valid_xy.min(axis=0); max_xy = valid_xy.max(axis=0)
    span = np.maximum(max_xy - min_xy, 1e-3); pad = 0.1 * span
    min_xy = min_xy - pad; span = (max_xy + pad) - min_xy
    scale = (size - 1) / np.max(span); offset = -min_xy

    idx = np.arange(0, T, stride, dtype=int)
    frames = np.zeros((len(idx), size, size, 3), dtype=np.uint8)
    xy_pix = ((xy + offset) * scale).astype(np.int32)

    t2 = 0
    for t in idx:
        canvas = frames[t2]
        for a, b in POSE_EDGES:
            if a < J and b < J and mask[t, a] and mask[t, b]:
                pa = tuple(xy_pix[t, a]); pb = tuple(xy_pix[t, b])
                cv2.line(canvas, pa, pb, (255,255,255), thickness, cv2.LINE_AA)
        for a, b in HAND_EDGES:
            ai, bi = LH_START + a, LH_START + b
            if ai < J and bi < J and mask[t, ai] and mask[t, bi]:
                pa = tuple(xy_pix[t, ai]); pb = tuple(xy_pix[t, bi])
                cv2.line(canvas, pa, pb, (255,255,255), thickness, cv2.LINE_AA)
        for a, b in HAND_EDGES:
            ai, bi = RH_START + a, RH_START + b
            if ai < J and bi < J and mask[t, ai] and mask[t, bi]:
                pa = tuple(xy_pix[t, ai]); pb = tuple(xy_pix[t, bi])
                cv2.line(canvas, pa, pb, (255,255,255), thickness, cv2.LINE_AA)
        frames[t2] = canvas; t2 += 1

    frames = np.stack([normalize_image(frames[k]) for k in range(frames.shape[0])], axis=0)
    frames = np.transpose(frames, (0,3,1,2))  # (T',3,H,W)
    return frames.astype(np.float32)

class TwoStreamDataset(Dataset):
    def __init__(self, df, label2id, size=224, pose_stride=2):
        self.df = df.reset_index(drop=True)
        self.label2id = label2id
        self.size = size
        self.pose_stride = pose_stride
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        row = self.df.iloc[i]
        data = np.load(row['npz_path'])
        X = data['X'].astype(np.float32)
        Xn = (X - X.mean(axis=0, keepdims=True)) / (X.std(axis=0, keepdims=True) + 1e-6)
        x_land = torch.from_numpy(Xn).transpose(0,1)   # (F,T)
        coords,_ = split_coords_from_X(X)
        frames = render_skeleton_seq(coords, size=self.size, stride=self.pose_stride)
        x_pose = torch.from_numpy(frames)              # (T',3,H,W)
        y = self.label2id[row['label']]
        return x_land, x_pose, y

df_train = pd.read_csv(OUT_DIR / "index_train_plus_aug.csv")
df_val   = pd.read_csv(OUT_DIR / "index_val.csv")
df_test  = pd.read_csv(OUT_DIR / "index_test.csv")
labels = sorted(df_train['label'].unique())
label2id = {l:i for i,l in enumerate(labels)}
df_val  = df_val[df_val['label'].isin(labels)].reset_index(drop=True)
df_test = df_test[df_test['label'].isin(labels)].reset_index(drop=True)

train_ds = TwoStreamDataset(df_train, label2id, size=SIZE, pose_stride=POSE_STRIDE)
val_ds   = TwoStreamDataset(df_val,   label2id, size=SIZE, pose_stride=POSE_STRIDE)
test_ds  = TwoStreamDataset(df_test,  label2id, size=SIZE, pose_stride=POSE_STRIDE)

train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True,  num_workers=2, pin_memory=True)
val_dl   = DataLoader(val_ds,   batch_size=BATCH, shuffle=False, num_workers=2, pin_memory=True)
test_dl  = DataLoader(test_ds,  batch_size=BATCH, shuffle=False, num_workers=2, pin_memory=True)

xb_land, xb_pose, yb = next(iter(train_dl))
F_in, T_in = xb_land.shape[1], xb_land.shape[2]
print("Dims | landmarks:", (F_in, T_in), "pose batch:", tuple(xb_pose.shape))

Dims | landmarks: (450, 60) pose batch: (4, 30, 3, 224, 224)


In [10]:
# Cell 7 — Two-stream model + fusion
import torch
import timm
import torch.nn as nn
import torch.nn.functional as F

class TemporalAttention(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.W = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, 1, bias=False)
    def forward(self, H):  # (T,B,D)
        M = torch.tanh(self.W(H))
        e = self.v(M).squeeze(-1)       # (T,B)
        a = torch.softmax(e, dim=0)     # (T,B)
        return (a.unsqueeze(-1) * H).sum(dim=0)  # (B,D)

class LandmarkStream(nn.Module):
    def __init__(self, f_in, n_classes, k=5, c1=256, c2=256, lstm_hidden=256, dropout=0.4):
        super().__init__()
        self.input_bn = nn.BatchNorm1d(f_in)
        self.conv1 = nn.Conv1d(f_in, c1, kernel_size=k, padding=k//2)
        self.bn1   = nn.BatchNorm1d(c1)
        self.drop1 = nn.Dropout(dropout)
        self.conv2 = nn.Conv1d(c1, c2, kernel_size=k, padding=k//2)
        self.bn2   = nn.BatchNorm1d(c2)
        self.drop2 = nn.Dropout(dropout)
        self.lstm  = nn.LSTM(input_size=c2, hidden_size=lstm_hidden, bidirectional=True, batch_first=False)
        self.attn  = TemporalAttention(2*lstm_hidden)
        self.fc1   = nn.Linear(2*lstm_hidden, 512)
        self.bn3   = nn.BatchNorm1d(512)
        self.drop3 = nn.Dropout(dropout)
        self.fc_out= nn.Linear(512, n_classes)
    def forward(self, x):    # (B,F,T)
        x = self.input_bn(x)
        x = self.drop1(F.relu(self.bn1(self.conv1(x))))
        x = self.drop2(F.relu(self.bn2(self.conv2(x))))
        H,_ = self.lstm(x.transpose(1,2).transpose(0,1))  # (T,B,2H)
        h = self.attn(H)                                  # (B,2H)
        h = self.bn3(self.fc1(h)); h = F.relu(h); h = self.drop3(h)
        return self.fc_out(h)

class PoseVideoStream(nn.Module):
    def __init__(self, n_classes, lstm_hidden=256, dropout=0.4):
        super().__init__()
        # MobileNetV2 backbone (ImageNet-pretrained)
        self.enc  = timm.create_model('mobilenetv2_100', pretrained=True, num_classes=0, global_pool='avg')
        feat_dim  = self.enc.num_features  # 1280
        self.lstm = nn.LSTM(input_size=feat_dim, hidden_size=lstm_hidden, bidirectional=True, batch_first=True)
        self.attn = TemporalAttention(2*lstm_hidden)
        self.fc1  = nn.Linear(2*lstm_hidden, 512)
        self.bn   = nn.BatchNorm1d(512)
        self.drop = nn.Dropout(dropout)
        self.fc_out = nn.Linear(512, n_classes)
    def forward(self, x):  # (B,T,3,H,W)
        B,T,C,H,W = x.shape
        feats = self.enc(x.reshape(B*T, C, H, W)).view(B, T, -1)  # (B,T,F)
        Hseq,_ = self.lstm(feats)                                 # (B,T,2H)
        Hseq = Hseq.transpose(0,1)                                # (T,B,2H)
        h = self.attn(Hseq)                                       # (B,2H)
        h = self.bn(self.fc1(h)); h = F.relu(h); h = self.drop(h)
        return self.fc_out(h)

class TwoStreamModel(nn.Module):
    def __init__(self, f_in, n_classes, dropout=0.4):
        super().__init__()
        self.land = LandmarkStream(f_in=f_in, n_classes=n_classes, dropout=dropout)
        self.pose = PoseVideoStream(n_classes=n_classes, dropout=dropout)
        self.fuse = nn.Linear(2*n_classes, n_classes)  # fuse on concatenated stream logits
    def forward(self, x_land, x_pose):
        lg1 = self.land(x_land)     # (B,nc)
        lg2 = self.pose(x_pose)     # (B,nc)
        logits = self.fuse(torch.cat([lg1, lg2], dim=-1))
        return logits, lg1, lg2

In [13]:
# Cell 8 — Training utilities (updated: train_epochs now accepts save_path)
import torch
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = TwoStreamModel(f_in=F_in, n_classes=len(labels), dropout=0.4).to(device)

def freeze_bn(m):
    for mod in m.modules():
        if isinstance(mod, (nn.BatchNorm1d, nn.BatchNorm2d)):
            mod.eval()
            for p in mod.parameters():
                p.requires_grad = False

def set_requires_grad(module, flag=True):
    for p in module.parameters():
        p.requires_grad = flag

def params_for_optim(m):
    return [p for p in m.parameters() if p.requires_grad]

y_train_ids = df_train['label'].map(label2id).values
class_w = compute_class_weight(class_weight='balanced', classes=np.arange(len(labels)), y=y_train_ids)
weights = torch.tensor(class_w, dtype=torch.float32, device=device)
criterion = nn.CrossEntropyLoss(weight=weights, label_smoothing=0.1)

def evaluate(dl):
    model.eval(); preds,targs,loss_sum,n = [],[],0.0,0
    for xl,xp,y in dl:
        xl=xl.to(device,non_blocking=True); xp=xp.to(device,non_blocking=True); y=torch.as_tensor(y,device=device)
        with torch.no_grad():
            logits,_,_ = model(xl,xp); loss = criterion(logits,y)
        loss_sum += loss.item()*xl.size(0); n += xl.size(0)
        preds.append(logits.argmax(1).cpu().numpy()); targs.append(y.cpu().numpy())
    preds=np.concatenate(preds) if preds else np.array([]); targs=np.concatenate(targs) if targs else np.array([])
    acc=accuracy_score(targs,preds) if len(targs) else 0.0
    f1 =f1_score(targs,preds,average='macro',zero_division=0) if len(targs) else 0.0
    return loss_sum/max(1,n), acc, f1

def train_epochs(dl, val_dl, epochs, lr, wd=5e-4, clip=1.0, cosine=True, desc="", save_path="/kaggle/working/best_two_stream.pth"):
    optimizer = torch.optim.AdamW(params_for_optim(model), lr=lr, weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) if cosine else None
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    best_acc = -1.0
    for ep in range(1, epochs+1):
        model.train(); total=0; loss_sum=0.0; preds=[]; targs=[]
        for xl,xp,y in dl:
            xl=xl.to(device,non_blocking=True); xp=xp.to(device,non_blocking=True); y=torch.as_tensor(y,device=device)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                logits,_,_ = model(xl,xp); loss = criterion(logits,y)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
            scaler.scale(loss).backward(); scaler.step(optimizer); scaler.update()
            loss_sum += loss.item()*xl.size(0); total += xl.size(0)
            preds.append(logits.detach().argmax(1).cpu().numpy()); targs.append(y.cpu().numpy())
        if scheduler is not None: scheduler.step()
        va_loss, va_acc, va_f1 = evaluate(val_dl)
        if va_acc > best_acc:
            best_acc = va_acc
            torch.save({'model': model.state_dict(), 'labels': labels}, save_path)
        tr_preds=np.concatenate(preds); tr_targs=np.concatenate(targs)
        tr_acc=accuracy_score(tr_targs,tr_preds); tr_f1=f1_score(tr_targs,tr_preds,average='macro',zero_division=0)
        tr_loss=loss_sum/max(1,total)
        print(f"{desc} Epoch {ep:02d} | train loss {tr_loss:.4f} acc {tr_acc:.3f} f1 {tr_f1:.3f} | val acc {va_acc:.3f} f1 {va_f1:.3f}")
    return best_acc

In [14]:
# Cell 9 — Three training phases (BN frozen by default). Save best per phase and keep overall best.
import torch.nn as nn
import shutil

torch.backends.cudnn.benchmark = True
freeze_bn(model)

PH1_PATH = "/kaggle/working/best_two_stream_ph1.pth"
PH2_PATH = "/kaggle/working/best_two_stream_ph2.pth"
PH3_PATH = "/kaggle/working/best_two_stream_ph3.pth"
BEST_OVERALL_PATH = "/kaggle/working/best_two_stream_overall.pth"

# Phase 1 — warm-up (land: conv2+head; pose: head; fusion)
set_requires_grad(model, False)
set_requires_grad(model.land.conv2,  True)
set_requires_grad(model.land.fc1,    True)
set_requires_grad(model.land.fc_out, True)
set_requires_grad(model.pose.fc1,    True)
set_requires_grad(model.pose.fc_out, True)
for p in model.fuse.parameters(): p.requires_grad = True

print("Phase 1: warm-up (land: conv2+head; pose: head), LR=1e-3")
best1 = train_epochs(train_dl, val_dl, epochs=12, lr=1e-3, wd=5e-4, cosine=False, desc="[PH1]", save_path=PH1_PATH)

# Phase 2 — joint (unfreeze selected parts; allow BN updates ONLY in pose encoder)
freeze_bn(model)            # keep all BN frozen first
set_requires_grad(model, False)

# Let BN in the pose encoder update running stats (weights of early blocks still frozen)
for m in model.pose.enc.modules():
    if isinstance(m, nn.BatchNorm2d):
        m.train()
        for p in m.parameters():
            p.requires_grad = True  # will be overridden by next mask for non-final blocks

# Landmark: unfreeze full CNN + LSTM + head
set_requires_grad(model.land.conv1,  True)
set_requires_grad(model.land.conv2,  True)
set_requires_grad(model.land.lstm,   True)
set_requires_grad(model.land.fc1,    True)
set_requires_grad(model.land.fc_out, True)

# Pose encoder: train only final blocks (features.14+), plus its LSTM + head
for name, p in model.pose.enc.named_parameters():
    p.requires_grad = any(name.startswith(f"features.{k}") for k in [14, 15, 16, 17])
set_requires_grad(model.pose.lstm,   True)
set_requires_grad(model.pose.fc1,    True)
set_requires_grad(model.pose.fc_out, True)

for p in model.fuse.parameters(): p.requires_grad = True

print("Phase 2: joint (land CNN+LSTM+head; pose last-stage+BN+LSTM+head), LR=3e-4, cosine")
best2 = train_epochs(train_dl, val_dl, epochs=30, lr=3e-4, wd=5e-4, cosine=True, desc="[PH2]", save_path=PH2_PATH)

# Phase 3 — fine-tune (freeze both encoders; train LSTMs + heads; BN frozen)
freeze_bn(model)
set_requires_grad(model, False)

set_requires_grad(model.land.lstm,   True)
set_requires_grad(model.land.fc1,    True)
set_requires_grad(model.land.fc_out, True)

set_requires_grad(model.pose.lstm,   True)
set_requires_grad(model.pose.fc1,    True)
set_requires_grad(model.pose.fc_out, True)

for p in model.fuse.parameters(): p.requires_grad = True

print("Phase 3: fine-tune (LSTMs+heads), LR=5e-5")
best3 = train_epochs(train_dl, val_dl, epochs=12, lr=5e-5, wd=5e-4, cosine=False, desc="[PH3]", save_path=PH3_PATH)

# Pick the best across phases and copy to a single final path
accs  = [best1, best2, best3]
paths = [PH1_PATH, PH2_PATH, PH3_PATH]
best_idx = int(np.argmax(accs))
shutil.copy2(paths[best_idx], BEST_OVERALL_PATH)

print("Best val accuracy across phases:", accs[best_idx])
print("Saved overall-best checkpoint to:", BEST_OVERALL_PATH)

Phase 1: warm-up (land: conv2+head; pose: head), LR=1e-3
[PH1] Epoch 01 | train loss 3.4999 acc 0.183 f1 0.167 | val acc 0.191 f1 0.139
[PH1] Epoch 02 | train loss 2.4196 acc 0.469 f1 0.455 | val acc 0.181 f1 0.143
[PH1] Epoch 03 | train loss 1.9980 acc 0.607 f1 0.600 | val acc 0.275 f1 0.227
[PH1] Epoch 04 | train loss 1.7942 acc 0.699 f1 0.698 | val acc 0.343 f1 0.292
[PH1] Epoch 05 | train loss 1.7211 acc 0.737 f1 0.733 | val acc 0.275 f1 0.244
[PH1] Epoch 06 | train loss 1.6230 acc 0.774 f1 0.772 | val acc 0.387 f1 0.338
[PH1] Epoch 07 | train loss 1.5378 acc 0.815 f1 0.812 | val acc 0.392 f1 0.336
[PH1] Epoch 08 | train loss 1.5608 acc 0.792 f1 0.789 | val acc 0.422 f1 0.374
[PH1] Epoch 09 | train loss 1.4746 acc 0.837 f1 0.837 | val acc 0.417 f1 0.373
[PH1] Epoch 10 | train loss 1.4193 acc 0.848 f1 0.847 | val acc 0.338 f1 0.315
[PH1] Epoch 11 | train loss 1.4297 acc 0.858 f1 0.860 | val acc 0.324 f1 0.291
[PH1] Epoch 12 | train loss 1.4442 acc 0.851 f1 0.850 | val acc 0.373 f1 0

In [15]:
# Cell 10 — Final test evaluation (best-by-val-acc)
ckpt = torch.load("/kaggle/working/best_two_stream.pth", map_location=device)
model.load_state_dict(ckpt['model'])

from sklearn.metrics import accuracy_score, f1_score
def evaluate_final(m, dl):
    m.eval(); preds,targs=[],[]
    for xl,xp,y in dl:
        xl=xl.to(device,non_blocking=True); xp=xp.to(device,non_blocking=True)
        with torch.no_grad():
            logits,_,_ = m(xl,xp)
        preds.append(logits.argmax(1).cpu().numpy()); targs.append(y)
    preds=np.concatenate(preds); targs=np.concatenate(targs)
    acc=accuracy_score(targs,preds); f1=f1_score(targs,preds,average='macro',zero_division=0)
    return acc,f1

test_acc, test_f1 = evaluate_final(model, test_dl)
print(f"Test | acc {test_acc:.3f} macro-F1 {test_f1:.3f}")

Test | acc 0.297 macro-F1 0.252
