# Exploratory Data Analysis

## 0. Dataset Description
*Taken from the dataset's [Kaggle page](https://www.kaggle.com/competitions/asl-signs/data).*

Deaf children are often born to hearing parents who do not know sign language. Your challenge in this competition is to help identify signs made in processed videos, which will support the development of mobile apps to help teach parents sign language so they can communicate with their Deaf children.

### 0.1. Files
`train_landmark_files/[participant_id]/[sequence_id].parquet` 

The landmark data. The landmarks were extracted from raw videos with the MediaPipe holistic model. Not all of the frames necessarily had visible hands or hands that could be detected by the model.

- frame - The frame number in the raw video.
- row_id - A unique identifier for the row.
- type - The type of landmark. One of ['face', 'left_hand', 'pose', 'right_hand'].
- landmark_index - The landmark index number. Details of the hand landmark locations can be found in the MediaPipe hand landmark documentation.
- [x/y/z] - The normalized spatial coordinates of the landmark. These are the only columns that will be provided to your submitted model for inference. The MediaPipe model is not fully trained to predict depth so you may wish to ignore the z values.

`train.csv`
- path - The path to the landmark file.
- participant_id - A unique identifier for the data contributor.
- sequence_id - A unique identifier for the landmark sequence.
- sign - The label for the landmark sequence.

## Preprocessing

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [None]:
def preprocess_landmarks(landmark_df, num_frames=48, feature_size=84):
    """
    Preprocess a landmark DataFrame into a fixed-length float32 array.

    Only the left and right hands are used: 21 points each, (x, y) only,
    so 84 features per frame (left hand first, then right hand).

    Args:
        landmark_df: DataFrame with at least the columns 'frame', 'type',
            'landmark_index', 'x' and 'y'.
        num_frames: Number of rows in the output. Shorter sequences are
            zero-padded at the end, longer ones are truncated.
        feature_size: Expected length of each frame vector. A warning is
            printed for every frame whose vector has a different length.

    Returns:
        np.ndarray of dtype float32 with `num_frames` rows. Missing
        landmarks and NaN coordinates are both encoded as 0. If the input
        contains no hand rows at all, an all-zero array of shape
        (num_frames, feature_size) is returned.
    """
    hand_types = ['left_hand', 'right_hand']
    coords_per_hand = 21 * 2  # 21 landmarks x (x, y)

    # Only keep hand landmarks
    hands_df = landmark_df[landmark_df['type'].isin(hand_types)]

    frames = []
    # groupby iterates the groups in sorted frame order
    for _, frame_data in hands_df.groupby('frame'):
        frame_vec = []

        for landmark_type in hand_types:
            type_data = frame_data[frame_data['type'] == landmark_type]
            type_data = type_data.sort_values('landmark_index')
            xy = type_data[['x', 'y']].to_numpy().flatten()

            # pad missing landmarks
            if xy.shape[0] < coords_per_hand:
                xy = np.pad(xy, (0, coords_per_hand - xy.shape[0]))

            frame_vec.append(xy)

        frame_vec = np.concatenate(frame_vec)

        if frame_vec.shape[0] != feature_size:
            print(f"Warning: Frame has incorrect size {frame_vec.shape[0]} instead of {feature_size}")

        frames.append(frame_vec)

    # np.stack raises on an empty list, so a sequence without any hand rows
    # is represented as pure padding instead of crashing the data loader.
    if not frames:
        return np.zeros((num_frames, feature_size), dtype=np.float32)

    frames = np.stack(frames).astype(np.float32)

    # NaN coordinates would propagate through the network and turn the loss
    # into NaN; treat them like missing landmarks (zeros).
    frames[np.isnan(frames)] = 0.0

    if frames.shape[0] < num_frames:
        pad_len = num_frames - frames.shape[0]
        frames = np.pad(frames, ((0, pad_len), (0, 0)), mode='constant')
    elif frames.shape[0] > num_frames:
        frames = frames[:num_frames]

    return frames


In [None]:
def visualize_frame(frame_vec, title="Hand Landmarks"):
    """
    Scatter-plot the hand landmarks stored in a single frame vector.

    The first 42 values are read as the left hand and the remaining values
    as the right hand; each half is reshaped to 21 (x, y) points. The y
    values are negated before plotting (presumably so the picture is not
    upside down when y grows downward -- confirm).

    NOTE(review): relies on a module-level `plt` (presumably
    matplotlib.pyplot) that is not imported in the visible cells -- confirm.

    Args:
        frame_vec: Flat array holding the left-hand values followed by the
            right-hand values.
        title: Title shown above the plot.
    """
    split = 42  # 21 landmarks * (x, y) for the left hand
    hands = [
        (frame_vec[:split].reshape(21, 2), 'blue', 'Left Hand'),
        (frame_vec[split:].reshape(21, 2), 'red', 'Right Hand'),
    ]

    plt.figure(figsize=(5, 5))
    for points, colour, name in hands:
        xs = points[:, 0]
        ys = -points[:, 1]
        plt.scatter(xs, ys, c=colour, label=name)
    plt.legend()
    plt.title(title)
    plt.show()

In [None]:
class ASLDataset(Dataset):
    """
    Dataset yielding (landmark tensor, label index) pairs, one per row of `df`.

    Each item is read from
    `<landmarks_path>/<participant_id>/<sequence_id>.parquet` and converted
    to a fixed-length array with `preprocess_landmarks`.

    Args:
        df: DataFrame with the columns 'participant_id', 'sequence_id'
            and 'sign'.
        landmarks_path: Root directory of the landmark parquet files.
        label_to_index: Mapping from sign label to integer class index.
        num_frames: Number of frames passed on to `preprocess_landmarks`.
    """

    def __init__(self, df, landmarks_path, label_to_index, num_frames=48):
        # Only configuration is stored here; files are read lazily per item.
        self.num_frames = num_frames
        self.label_to_index = label_to_index
        self.landmarks_path = landmarks_path
        self.df = df

    def __len__(self):
        """Return the number of rows in the metadata DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """
        Load and preprocess the sequence at positional index `idx`.

        Returns:
            A (features, label) tuple: a float32 tensor built from the
            preprocessed landmarks and a long tensor with the class index.

        Raises:
            FileNotFoundError: If the landmark file for the row is missing.
        """
        record = self.df.iloc[idx]
        participant_id = record['participant_id']
        sequence_id = record['sequence_id']
        sign = record['sign']

        file_path = os.path.join(
            self.landmarks_path,
            str(participant_id),
            f"{sequence_id}.parquet",
        )

        # Fail early with a message that names the missing file.
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Landmark file {file_path} not found.")

        landmark_df = pd.read_parquet(file_path)
        video_array = preprocess_landmarks(landmark_df, num_frames=self.num_frames)
        label = self.label_to_index[sign]

        features = torch.tensor(video_array, dtype=torch.float32)
        target = torch.tensor(label, dtype=torch.long)
        return features, target

## Model

In [None]:
class SignLanguageGRU(nn.Module):
    """
    Bidirectional GRU with attention pooling for sequence classification.

    Args:
        input_size (int): Number of input features per time step.
        hidden_size (int): Size of the GRU hidden state per direction.
        num_classes (int): Number of output classes.

    Architecture:
        - A bidirectional, batch-first GRU over the input sequence.
        - A linear layer that scores every time step; the scores are
          softmax-normalised over time and used to average the GRU outputs.
        - A two-layer classifier (Linear -> ReLU -> Dropout -> Linear) that
          maps the pooled context vector to class scores.
    """

    def __init__(self, input_size=84, hidden_size=256, num_classes=250):  # adjust num_classes
        super().__init__()
        # Forward and backward states are concatenated by the GRU.
        bi_hidden = hidden_size * 2

        self.gru = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.attn = nn.Linear(bi_hidden, 1)

        head_layers = [
            nn.Linear(bi_hidden, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class scores for a batch-first input `x`."""
        states, _ = self.gru(x)  # (batch, seq, 2*hidden)

        # One scalar score per time step, normalised across the sequence.
        scores = self.attn(states).squeeze(-1)  # (batch, seq)
        attn_weights = torch.softmax(scores, dim=1)

        # Attention-weighted sum over time.
        weighted = states * attn_weights.unsqueeze(-1)
        context = weighted.sum(dim=1)  # (batch, 2*hidden)

        return self.classifier(context)


## Training

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy and PyTorch so repeated runs are reproducible
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Map every distinct sign to an integer class index (in order of first
# appearance) and build the reverse lookup.
# NOTE(review): `df` and `landmarks_path` are not defined in the visible
# cells -- presumably the train.csv DataFrame and the landmark root
# directory; confirm.
unique_signs = df['sign'].unique()
label_to_index = {label: idx for idx, label in enumerate(unique_signs)}
index_to_label = dict(enumerate(unique_signs))

# Stratified 80/20 split so both parts contain every sign
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['sign'],
    random_state=SEED,
)

# Datasets share the same landmark directory and label mapping
train_dataset = ASLDataset(train_df, landmarks_path, label_to_index)
val_dataset = ASLDataset(val_df, landmarks_path, label_to_index)

# Settings common to both loaders; only shuffling differs
loader_kwargs = dict(batch_size=64, num_workers=4, pin_memory=True)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# No `model.to(device)` here: `model` does not exist yet at this point (it is
# created, and moved to `device`, in the training cell further down), so the
# call raised a NameError when the notebook was run top to bottom.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def train_model(model, loader, criterion, optimizer, device):
    """
    Run one training epoch over `loader`.

    Args:
        model: Module to train; switched to training mode.
        loader: Iterable of (inputs, labels) batches. It does not need to
            support `len()` or expose a `.dataset` attribute.
        criterion: Loss function called as `criterion(outputs, labels)`.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        (mean_loss, accuracy): the mean of the per-batch losses and the
        fraction of samples seen in this epoch that were classified
        correctly. Both are 0.0 when the loader yields no batches.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    num_batches = 0
    num_samples = 0

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        loss.backward()
        optimizer.step()

        # Count what was actually processed instead of trusting
        # len(loader) / len(loader.dataset): those are wrong with
        # drop_last or a sampler and missing for length-less loaders.
        num_batches += 1
        num_samples += labels.size(0)

    if num_batches == 0:
        return 0.0, 0.0

    return total_loss / num_batches, correct / max(num_samples, 1)

def evaluate_model(model, loader, device):
    """
    Evaluate `model` on every batch of `loader` without tracking gradients.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        loader: Iterable of (inputs, labels) batches. It does not need to
            support `len()` or expose a `.dataset` attribute.
        device: Device the batches are moved to.

    Returns:
        (accuracy, predictions, actuals): the fraction of evaluated samples
        that were classified correctly (0.0 when nothing was evaluated),
        and two flat lists holding the predicted and true class index of
        every sample, in loader order.
    """
    model.eval()
    correct = 0
    num_samples = 0
    predictions, actuals = [], []

    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            # Count the samples actually evaluated; len(loader.dataset) is
            # wrong with drop_last or a sampler and may not exist at all.
            num_samples += labels.size(0)
            predictions.extend(preds.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    accuracy = correct / num_samples if num_samples else 0.0
    return accuracy, predictions, actuals

In [None]:
# Build the model with one output per known sign and move it to the device
model = SignLanguageGRU(input_size=84, hidden_size=256, num_classes=len(label_to_index)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train for a fixed number of epochs, validating after each one
for epoch in range(10):
    train_loss, train_acc = train_model(model, train_loader, criterion, optimizer, device)
    val_acc, preds, actuals = evaluate_model(model, val_loader, device)
    print(f"[Epoch {epoch+1}] Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")


# Show up to five validation predictions from the last epoch. Slicing (rather
# than indexing 0..4) avoids an IndexError on very small validation sets.
idx_to_label = {v: k for k, v in label_to_index.items()}
for pred, actual in zip(preds[:5], actuals[:5]):
    print(f"Predicted: {idx_to_label[pred]}, Actual: {idx_to_label[actual]}")