# Model Training

## 0. Setup

In [1]:
# Imports

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torchvision.transforms import RandomAffine

from sklearn.preprocessing import StandardScaler
import random

In [2]:
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Using device:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA Available: False
Using CPU


In [3]:
BASE_DIR = "../input/asl-signs/"

# Read in train dataset
df = pd.read_csv(f"{BASE_DIR}/train.csv")

In [4]:
# Create a label-to-index mapping
SIGN_TO_INDEX = {sign: idx for idx, sign in enumerate(df['sign'].unique())}

# Frame Constants
LANDMARKS_PER_FRAME = 543 # Number of landmarks per frame from MediaPipe
MAX_LEN = 384 # Maximum number of frames used per video (for padding/cropping)
CROP_LEN = MAX_LEN # Same as `MAX_LEN` defined separately for flexibility e.g. use a shorter crop for training
NUM_CLASSES = 250 # Number of sign classes in the dataset
PAD = -100. # Fill value for padding frames to distinguish from real values

# Landmark Groups
NOSE=[1, 2, 98, 327]
LNOSE = [98]
RNOSE = [327]
LIP = [
    0, 13, 14, 17, 37, 39, 40, 61, 78, 80, 81, 82, 84, 87, 88, 91, 
    95, 146, 178, 181, 185, 191, 267, 269, 270, 291, 308, 310, 311, 
    312, 314, 317, 318, 321, 324, 375, 402, 405, 409, 415
] # Full mouth contour
LLIP = [37, 39, 40, 61, 78, 80, 81, 82, 84, 87, 88, 91, 95, 146, 178, 181, 185, 191] # Left part of the lips
RLIP = [267, 269, 270, 291, 308, 310, 311, 312, 314, 317, 318, 321, 324, 375, 402, 405, 409, 415] # Right part of the lips

POSE = [500, 501, 502, 503, 504, 505, 512, 513] # Pose landmarks
LPOSE = [501, 503, 505, 513]
RPOSE = [500, 502, 504, 512]

REYE = [7, 33, 133, 144, 145, 153, 154, 155, 157, 158, 159, 160, 161, 163, 173, 246] # Right eye
LEYE = [249, 263, 362, 373, 374, 380, 381, 382, 384, 385, 386, 387, 388, 390, 398, 466] # Left eye

LHAND = np.arange(468, 489).tolist() # Left hand
RHAND = np.arange(522, 543).tolist() # Right hand

# Final landmark selection: list of landmarks to be used as input to model
POINT_LANDMARKS = LIP + LHAND + RHAND + NOSE + REYE + LEYE #+POSE
NUM_NODES = len(POINT_LANDMARKS) # Number of chosen landmark points
CHANNELS = 6*NUM_NODES # 6 input channels because preprocessing creates x,dx,dx2,y,dy,dy2 per landmark point

## 2. Preprocessing
- Landmark Selection & Dimensionality Reduction:
    - Only a subset of informative landmark indices (`point_landmarks`) are used to reduce dimensionality.
    - We discard the z-axis (depth), retaining only x and y coordinates.
- Normalization:
    - Landmark coordinates are normalized by subtracting the mean position of the nose (landmark 17).
    - Normalization is done per sample to center the hand/body around the face.
    - The full sample is divided by the global standard deviation for scale invariance.
- Padding / Truncation:
    - Sequences are padded with the last frame (if too short) or truncated (if too long) to a fixed `max_len`.
    - Padding is only for keeping sequence size constant for the data loader. We generate a binary mask (1 for real values) to be passed to the model for it to ignore padded positions during training and inferrence.
- Motion Features:
    - We compute velocity (`dx`) and acceleration (`dx2`) features by differencing adjacent and second-adjacent frames.
    - Final input = concatenation of `[positions, velocity, acceleration]` (for both x and y) per frame.

In [5]:
class Preprocess:
    def __init__(self, point_landmarks: list[int], max_len: int):
        self.point_landmarks = point_landmarks # List of selected landmark indices
        self.max_len = max_len # Maximum sequence length for padding and truncation

    def __call__(self, x): 
        # x: (T, 543, 3), where T is number of timesteps (frames)
        T, P, C = x.shape # (timesteps, points, coordinates)

        # Only use specified landmarks
        x = x[:, self.point_landmarks, :2] # Discard z-coordinate (estimated depth)
        # x now contains 2D coordinates only: (T, P, 2)

        # Reference mean: landmark 17 (at the nose)
        # It is usually located close to center ([0.5,0.5])
        nose_reference = x[:, [17], :]

        # Compute the mean position of the nose across all frames
        nose_mean = torch.nanmean(nose_reference, dim=(0, 1), keepdim=True) # Mean across time and points
        nose_mean[torch.isnan(nose_mean)] = 0.5 # Set any NaN values to 0.5 
        x_no_nan = torch.nan_to_num(x, nan=0.0)
        std = torch.std(x_no_nan, dim=(0, 1), keepdim=True) # Standard deviation across time and points (we let NaN values be 0)
        # TODO: use torch.nanstd

        # Normalize inputs
        x = (x - nose_mean) / std

        # Pad or truncate sequence to fixed length
        original_seq_length = x.shape[0]
        print("d:", original_seq_length, self.max_len)
        if original_seq_length > self.max_len:
            # Truncate
            x = x[:self.max_len] 
        elif original_seq_length < self.max_len:
            # Pad with the last frame
            pad = x[-1:].repeat(self.max_len - x.shape[0], 1, 1)
            x = torch.cat([x, pad], dim=0)

        # Create mask: 1 for real data, 0 for padded values (to be ignored by model)
        mask = torch.ones(x.shape[0], dtype=torch.float32)
        if original_seq_length < self.max_len: # i.e. If we padded the sequence
            mask[original_seq_length:] = 0.0 # Set padded values (appended to original sequence) to 0

        # Compute motion features: dx (first differential), dx2 (second differential)
        # variable x contains 2D position data (both x and y coordinates)
        dx = torch.zeros_like(x) # velocity
        dx2 = torch.zeros_like(x) # acceleration

        # Calculate the first difference (lag-1)
        if x.shape[0] > 1:
            dx[:-1] = x[1:] - x[:-1] # dx[t] = x[t+1] - x[t]

        # Calculate the second difference (lag-2)
        if x.shape[0] > 2:
            dx2[:-2] = x[2:] - x[:-2] # dx2[t] = dx[t+1] - dx[t]
            
        # Reshape the tensor for concatenation: flatten each sequence into single vector
        # x = x.view(x.shape[0], -1) # Shape: (T,P*2), flatten the 2D points (x,y) into single vector
        # dx = dx.view(dx.shape[0], -1) # Shape: (T,P*2), flatten the velocity (dx)
        # dx2 = dx2.view(dx2.shape[0], -1) # Shape: (T, P*2), flatten the acceleration (dx2)

        # Concatenate x (position), dx (velocity), and dx2 (acceleration) along the feature axis
        x = torch.cat([x, dx, dx2], dim=-1) # Shape: (T, P*6), combining all features
        x[torch.isnan(x)] = 0.0 # Replace any NaN values with 0

        return x, mask

## 3. Augmentation
- Temporal Rescaling:
    - Each sequence is randomly stretched or compressed using linear interpolation (by a random scale ∈ [0.5, 1.5]).
- Random Temporal Masking:
    - With some probability, a subset of frames is replaced with NaNs to simulate missing data.
- Horizontal Flipping:
    - With a certain probability, the x-coordinates are flipped to simulate mirrored gestures.
- Affine Transformations:
    - Each frame is independently perturbed using random affine transforms (rotation, translation, scaling, shearing).
- Cutout (Frame Dropout):
    - A contiguous block of frames is set to zero to simulate occlusion or dropped frames.

In [6]:
class Augment:
    def __init__(
        self, 
        temporal_scale_range=(0.5,1.5), 
        masking_prob=0.1, 
        hflip_prob=0.5,
        cutout_frames=10, 
        affine_params=dict(degrees=10, translate=(0.1,0.1), scale=(0.9,1.1), shear=10)
    ):
        self.temporal_scale_range = temporal_scale_range
        self.masking_prob = masking_prob
        self.hflip_prob = hflip_prob
        self.cutout_frames = cutout_frames
        self.affine = RandomAffine(**affine_params)

    def __call__(self, x):
        T, P, C = x.shape # (timesteps, points, coordinates)

        # (1) Temporal Augmentations
        # Random Resample: Randomly stretch/compress the time sequence using linear interpolation
        scale = random.uniform(*self.temporal_scale_range)
        new_len = max(4, int(T * scale)) # Avoid overly small length < 4 
        x_tc = x.permute(1, 2, 0).reshape(1, P * C, T) # Reshape so that each (p,c) is its own channel: (1, P*C, T)
        x_tc = F.interpolate(x_tc, size=new_len, mode='linear', align_corners=True) # Interpolate along temporal dimension
        x = x_tc.reshape(1, P, C, new_len).squeeze(0).permute(2, 0, 1) # Reshape back to (T', P, C)

        # Random Masking: Dropout-like temporal occlusion
        if random.random() < self.masking_prob:
            mask_idx = torch.randperm(new_len)[: new_len // 10]
            x[mask_idx] = float('nan')

        # (2) Spatial Augmentations
        # Horizontal Flip (Mirroring)
        if random.random() < self.hflip_prob:
            x[..., 0] = 1.0 - x[..., 0] # flip x-coordinates

        # Affine Transform: Local geometric transformation
        for i in range(x.shape[0]):
            frame = x[i] # (P, C)
            x[i] = self.affine(frame.unsqueeze(0)).squeeze(0)

        # (3) Temporal Cutout (Zero out frames): Simulates lost/blank frames
        if new_len > self.cutout_frames:
            start = random.randint(0, new_len - self.cutout_frames)
            x[start:start + self.cutout_frames] = 0

        return x

In [7]:
class SignLanguageDataset(Dataset):
    def __init__(self, df=None, augment=None, preprocess=None):
        self.data = df or pd.read_csv(f"{BASE_DIR}/train.csv")
        self.augment = augment or Augment()
        self.preprocess = preprocess or Preprocess(
            point_landmarks=POINT_LANDMARKS, 
            max_len=MAX_LEN
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get the sample from the DataFrame
        sample = self.data.iloc[idx]

        # Load landmark data from the Parquet file specified in the 'path' column
        path = f"{BASE_DIR}/{sample['path']}"
        landmark_df = pd.read_parquet(path)

        # Extract coordinates (x, y, z) from the DataFrame
        coordinates = landmark_df[['x', 'y', 'z']].values.reshape(-1, LANDMARKS_PER_FRAME, 3) # (num_frames, num_landmarks, xyz)
        
        # Extract label (sign)
        sign = sample['sign']
        sign_index = SIGN_TO_INDEX[sign]

        # Convert to tensors
        inputs = torch.tensor(coordinates, dtype=torch.float32)
        target = torch.tensor(sign_index, dtype=torch.long)

        print("[DEBUG] Pre-augment input shape:", inputs.shape)

        # Augmentation
        if self.augment:
            inputs = self.augment(inputs)
            print("[DEBUG] Post-augment input shape:", inputs.shape)

        # Preprocessing
        if self.preprocess:
            inputs, mask = self.preprocess(inputs)
            print("[DEBUG] Post-processing input shape:", inputs.shape)

        return inputs, target, mask

In [8]:
# Example usage
dataset = SignLanguageDataset()

# Sample
idx = random.randint(0, len(dataset)-1)
sample = dataset[idx]
print("Input shape:", sample[0].shape) # (MAX_LEN, NUM_NODES, 6)
print("Label (sign index):", sample[1])
print("Mask:", sample[2])

[DEBUG] Pre-augment input shape: torch.Size([8, 543, 3])
[DEBUG] Post-augment input shape: torch.Size([9, 543, 3])
d: 9 384
[DEBUG] Post-processing input shape: torch.Size([384, 118, 6])
Input shape: torch.Size([384, 118, 6])
Label (sign index): tensor(104)
Mask: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 

In [9]:
dataset[0][2]

[DEBUG] Pre-augment input shape: torch.Size([23, 543, 3])
[DEBUG] Post-augment input shape: torch.Size([34, 543, 3])
d: 34 384
[DEBUG] Post-processing input shape: torch.Size([384, 118, 6])


tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 