## Training via PyTorch

In [None]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Report the library versions this notebook is running against.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version()}")

In [None]:
# List every CUDA device visible to PyTorch, or say that none is available.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    print("Number of GPUs:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

### ----- 1. Load Data -----

In [None]:
# Root folder of the OpenPose keypoint dumps; each split keeps its per-clip
# JSON folders under "<split>/openpose_output/json".
data_dir = "/home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/keypoints"
train_json_dir, val_json_dir, test_json_dir = (
    os.path.join(data_dir, f"{split}/openpose_output/json")
    for split in ("train", "val", "test")
)

# Folder with the tab-separated label files, one per split.
csv_path = "/home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/english_translation"
train_labels_csv, val_labels_csv, test_labels_csv = (
    os.path.join(csv_path, f"how2sign_realigned_{split}.csv")
    for split in ("train", "val", "test")
)

In [None]:
# ----- Training Labels -----
# The label file is tab-separated.
train_label_df = pd.read_csv(train_labels_csv, sep="\t")

# Every distinct SENTENCE_ID, sorted so the index assignment below is deterministic.
unique_train_sentences = sorted(set(train_label_df["SENTENCE_ID"]))
# SENTENCE_ID -> contiguous class index starting at 0.
sentence_to_id_train = dict(zip(unique_train_sentences, range(len(unique_train_sentences))))

# SENTENCE_NAME -> raw SENTENCE_ID as it appears in the CSV (NOT the class
# index; that lookup goes through sentence_to_id_train). If a name occurs
# more than once, the last row wins.
label_mapping_train = dict(
    zip(train_label_df["SENTENCE_NAME"], train_label_df["SENTENCE_ID"])
)

In [None]:
# Translate every row's SENTENCE_ID into its 0-based class index.
y_labels = list(map(sentence_to_id_train.__getitem__, train_label_df["SENTENCE_ID"]))
print(f"New y_labels min: {min(y_labels)}, max: {max(y_labels)}")


In [None]:
# ----- Validation Labels -----
# The label file is tab-separated.
val_label_df = pd.read_csv(val_labels_csv, sep="\t")

# Every distinct SENTENCE_ID, sorted so the index assignment below is deterministic.
unique_val_sentences = sorted(set(val_label_df["SENTENCE_ID"]))
# SENTENCE_ID -> contiguous class index starting at 0.
# NOTE(review): this index space is built from the validation file alone, so
# index k here and index k in sentence_to_id_train generally denote different
# sentences, yet validation batches are scored by the model trained on the
# training indices -- confirm this is intended.
sentence_to_id_val = dict(zip(unique_val_sentences, range(len(unique_val_sentences))))

# SENTENCE_NAME -> raw SENTENCE_ID as it appears in the CSV (NOT the class
# index). If a name occurs more than once, the last row wins.
label_mapping_val = dict(
    zip(val_label_df["SENTENCE_NAME"], val_label_df["SENTENCE_ID"])
)

In [None]:
# ----- Test Labels -----
# The label file is tab-separated.
test_label_df = pd.read_csv(test_labels_csv, sep="\t")

# Every distinct SENTENCE_ID, sorted so the index assignment below is deterministic.
unique_test_sentences = sorted(set(test_label_df["SENTENCE_ID"]))
# SENTENCE_ID -> contiguous class index starting at 0.
# NOTE(review): built from the test file alone, so these indices are not
# aligned with sentence_to_id_train -- confirm before comparing predictions
# against them.
sentence_to_id_test = dict(zip(unique_test_sentences, range(len(unique_test_sentences))))

# SENTENCE_NAME -> raw SENTENCE_ID as it appears in the CSV (NOT the class
# index). If a name occurs more than once, the last row wins.
label_mapping_test = dict(
    zip(test_label_df["SENTENCE_NAME"], test_label_df["SENTENCE_ID"])
)

In [None]:
# Optional sanity check: show the first five (SENTENCE_NAME, SENTENCE_ID)
# pairs of each split.
print("Train mapping example:", [*label_mapping_train.items()][:5])
print("Val mapping example:", [*label_mapping_val.items()][:5])
print("Test mapping example:", [*label_mapping_test.items()][:5])

In [None]:
def load_keypoints(json_folder, max_frames=100, required_dim=411):
    """
    Load per-frame keypoints from a folder of JSON files into one fixed-size tensor.

    Files are visited in sorted filename order. Only files ending in ".json"
    are parsed, so stray files in the folder no longer abort loading. For each
    frame that lists at least one person, the first person's pose, face,
    left-hand and right-hand keypoint lists are concatenated (in that order)
    and zero-padded or truncated to ``required_dim`` values. Frames without a
    detected person are skipped, not replaced by zeros.

    Args:
        json_folder: Directory containing one JSON file per frame.
        max_frames: Number of rows in the result. Extra frames are dropped and
            missing frames are zero rows at the end.
        required_dim: Number of values kept per frame.

    Returns:
        torch.Tensor: float32 tensor of shape (max_frames, required_dim). It is
        all zeros when no frame with a person was found.
    """
    keypoint_keys = (
        "pose_keypoints_2d",
        "face_keypoints_2d",
        "hand_left_keypoints_2d",
        "hand_right_keypoints_2d",
    )
    frames = []

    for frame_file in sorted(os.listdir(json_folder)):
        # Anything past max_frames would be truncated anyway; stop reading early.
        if len(frames) >= max_frames:
            break
        if not frame_file.lower().endswith(".json"):
            continue

        frame_path = os.path.join(json_folder, frame_file)
        with open(frame_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        if "people" not in data or len(data["people"]) == 0:
            continue
        person = data["people"][0]  # First detected person

        full_keypoints = []
        for key in keypoint_keys:
            full_keypoints.extend(person.get(key) or [])

        # Truncate, then zero-pad, so every frame has exactly required_dim values.
        full_keypoints = full_keypoints[:required_dim]
        full_keypoints.extend([0.0] * (required_dim - len(full_keypoints)))
        frames.append(full_keypoints)

    padded_sequence = torch.zeros((max_frames, required_dim), dtype=torch.float32)
    if frames:
        padded_sequence[:len(frames)] = torch.tensor(frames, dtype=torch.float32)

    return padded_sequence

In [None]:
# Fixed sequence length (in frames) used for every sample.
MAX_FRAMES = 200


def pad_or_truncate(sequence, max_frames=MAX_FRAMES):
    """Return ``sequence`` with exactly ``max_frames`` rows.

    ``sequence`` must be a 2-D tensor (frames, features). Shorter inputs get
    zero rows appended, with the same dtype and device as the input. Inputs
    that are long enough are sliced to the first ``max_frames`` rows.
    """
    num_frames, num_features = sequence.shape
    missing = max_frames - num_frames
    if missing <= 0:
        return sequence[:max_frames]
    filler = sequence.new_zeros((missing, num_features))
    return torch.cat((sequence, filler), dim=0)

In [None]:
def process_data(json_dir, mapping, sentence_to_id, max_frames=MAX_FRAMES):
    """Build the input and label tensors for one dataset split.

    Every sub-folder of ``json_dir`` is treated as one sample. A folder is
    used only if its name is a key of ``mapping`` and the mapped sentence ID
    is a key of ``sentence_to_id``; otherwise it is skipped with a message.

    Args:
        json_dir: Directory holding one keypoint folder per sample.
        mapping: Folder name (SENTENCE_NAME) -> raw sentence ID.
        sentence_to_id: Raw sentence ID -> integer class index.
        max_frames: Number of frames kept per sample.

    Returns:
        tuple: ``(X_data, y_labels)``. ``X_data`` has shape
        (num_samples, max_frames, 411) and ``y_labels`` is a long tensor of
        shape (num_samples,). Both are empty when no folder qualified.
    """
    X_data, y_labels = [], []

    print(f"\nChecking JSON directory: {json_dir}")
    # Sorted so the sample order does not depend on the file system's listing order.
    json_folders = sorted(os.listdir(json_dir))
    print(f"Existing JSON folders: {json_folders[:5]}")

    for folder_name in json_folders:
        folder_path = os.path.join(json_dir, folder_name)

        if not os.path.isdir(folder_path):
            print(f"Skipping '{folder_name}' (not a directory)")
            continue

        # Check if folder name exists in mapping
        if folder_name not in mapping:
            print(f"Skipping: '{folder_name}' (not in mapping)")
            continue

        # Raw sentence ID as stored in the label file
        sentence_id = mapping[folder_name]

        # Check if sentence ID exists in sentence_to_id
        if sentence_id not in sentence_to_id:
            print(f"Skipping: Sentence ID '{sentence_id}' (not in sentence_to_id)")
            continue

        # Forward max_frames: load_keypoints defaults to 100 frames, which
        # would silently discard everything past frame 100 before the padding
        # step below could keep it.
        keypoints_sequence = load_keypoints(folder_path, max_frames)
        # Safety net so every sample has exactly max_frames rows.
        keypoints_sequence = pad_or_truncate(keypoints_sequence, max_frames)

        X_data.append(keypoints_sequence)
        y_labels.append(sentence_to_id[sentence_id])

    if not X_data:
        print(f"\n⚠️  No valid data found in {json_dir} ⚠️")

    X_data = torch.stack(X_data) if X_data else torch.empty(0, max_frames, 411)
    y_labels = torch.tensor(y_labels, dtype=torch.long) if y_labels else torch.empty(0, dtype=torch.long)

    return X_data, y_labels

In [None]:
# The keys of sentence_to_id_train and the values of label_mapping_train
# should both be raw sentence IDs; print a few of each to compare.
print("sentence_to_id example:", list(sentence_to_id_train)[:5])
print("label_mapping values:", [*label_mapping_train.values()][:5])

### ----- 2. Prepare Data for PyTorch -----

In [None]:
# Build the training tensors from the keypoint folders.
X_train, y_train = process_data(
    json_dir=train_json_dir,
    mapping=label_mapping_train,
    sentence_to_id=sentence_to_id_train,
)

In [None]:
# Both counts should match: one label per sequence.
print(f"X_train samples: {X_train.shape[0]}")
print(f"y_train samples: {len(y_train)}")

In [None]:
# Build the validation tensors from the keypoint folders.
X_val, y_val = process_data(
    json_dir=val_json_dir,
    mapping=label_mapping_val,
    sentence_to_id=sentence_to_id_val,
)

In [None]:
# Build the test tensors from the keypoint folders.
X_test, y_test = process_data(
    json_dir=test_json_dir,
    mapping=label_mapping_test,
    sentence_to_id=sentence_to_id_test,
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_fn(batch):
    """Collate (sequence, label) pairs into batch tensors on ``device``.

    Args:
        batch: Iterable of ``(sequence, label)`` pairs, where each sequence is
            a (frames, features) tensor or array-like.

    Returns:
        tuple: ``(X_batch, y_batch, lengths)``.
        ``X_batch`` is float32 with shape (batch, longest_sequence, features),
        zero-padded at the end. ``y_batch`` is a long tensor of labels.
        ``lengths`` is a long tensor holding each sequence's row count before
        padding.
    """
    sequences, labels = zip(*batch)
    # as_tensor converts without the extra copy (and the "copy construct"
    # UserWarning) that torch.tensor() produces when handed a tensor.
    sequences = [torch.as_tensor(seq, dtype=torch.float32, device=device) for seq in sequences]
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long, device=device)
    X_batch = pad_sequence(sequences, batch_first=True)
    y_batch = torch.tensor(labels, dtype=torch.long, device=device)

    return X_batch, y_batch, lengths

In [None]:
# TensorDataset yields the same (sequence, label) pairs as list(zip(X, y)),
# but indexes the tensors on demand instead of materializing one Python tuple
# per sample, and it raises if X and y disagree on the number of samples
# instead of silently truncating.
# Only the training loader is shuffled; validation and test keep dataset order.
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=128, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(TensorDataset(X_val, y_val), batch_size=128, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(TensorDataset(X_test, y_test), batch_size=128, shuffle=False, collate_fn=collate_fn)

### ----- 3. Define LSTM Model -----

In [None]:
# Define an LSTM-based model for sequence classification
class SignLanguageLSTM(nn.Module):
    """Bidirectional multi-layer LSTM followed by a linear classifier.

    Args:
        input_dim: Number of features per frame.
        hidden_dim: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of output classes (logits).
        dropout: Dropout applied between LSTM layers.
    """

    def __init__(self, input_dim=411, hidden_dim=1024, num_layers=2, output_dim=30814, dropout=0.2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        # Registered but not applied in forward(); kept so the state_dict keys
        # stay the same as before.
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths=None):
        """Return class logits of shape (batch, output_dim).

        Args:
            x: Float tensor of shape (batch, frames, input_dim).
            lengths: Accepted for interface compatibility; not used, because
                the sequences are not packed.
        """
        _, (h_n, _) = self.lstm(x)  # without packing
        # h_n is (num_layers * 2, batch, hidden_dim); the last two entries are
        # the top layer's final forward and final backward states. Taking
        # lstm_out[:, -1, :] instead would pair the forward summary with a
        # backward state that has seen only the last frame.
        sequence_summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(sequence_summary)



In [None]:
# Model parameters
input_dim = 411            # Number of keypoint values per frame
hidden_dim = 1024          # Number of hidden units per LSTM direction
num_layers = 2             # Number of LSTM layers
# One logit per class index. The labels are indices from sentence_to_id_train,
# so its size is the class count. label_mapping_train is keyed by
# SENTENCE_NAME and only coincides when names and IDs are one-to-one.
output_dim = len(sentence_to_id_train)
dropout = 0.2              # Dropout for regularization

# Optionally disable cuDNN for debugging purposes
# NOTE(review): this applies to the whole process and presumably slows down
# GPU training -- re-enable once debugging is finished.
torch.backends.cudnn.enabled = False

# Create model and move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: ", device)
# device = torch.device("cpu")
model = SignLanguageLSTM(input_dim, hidden_dim, num_layers, output_dim, dropout).to(device)


def init_weights(m):
    """Initialize one submodule; intended for use with ``model.apply``.

    Linear layers get Xavier-uniform weights and zero biases. LSTM layers get
    Kaiming-uniform weight matrices and zero biases. Other module types are
    left untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if "weight" in name:
                nn.init.kaiming_uniform_(param)
            elif "bias" in name:
                nn.init.zeros_(param)


model.apply(init_weights)  # Visits every submodule, including the model itself

# Define loss function (CrossEntropyLoss for classification)
criterion = nn.CrossEntropyLoss()

# Define optimizer (AdamW: Adam with decoupled weight decay)
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# Initialize LR scheduler: halve the learning rate every 10 scheduler steps
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

In [None]:
# Pull one batch from the training loader and place all three tensors on the
# active device, then show the label range as a quick sanity check.
X_batch, y_batch, lengths = (t.to(device) for t in next(iter(train_loader)))

print(f"y_batch min: {y_batch.min().item()} y_batch max: {y_batch.max().item()}")

### ----- 4. Training -----

In [None]:
import os
import cv2
import torch
import numpy as np
from collections import defaultdict

# Video dataset locations. The processed-output folder is created on the spot
# if it does not exist yet.
base_path = "/home/haggenmueller/asl_detection/machine_learning/datasets"
raw_videos_path = "/".join((base_path, "wlasl", "raw_videos"))
shortened_videos_path = "/".join((base_path, "own_dataset", "shortened_videos"))
augmented_videos_path = "/".join((base_path, "own_dataset", "videos_augmented"))
processed_folder = "/".join((base_path, "own_dataset", "videos_processed"))
os.makedirs(processed_folder, exist_ok=True)

# Per-label statistics filled in by process_videos():
#   label_counts      -> number of videos seen for the label
#   label_max_frames  -> largest frame count seen for the label
# max_frames tracks the largest frame count over all videos.
label_counts = defaultdict(int)
label_max_frames = defaultdict(int)
max_frames = 0

def process_videos(folder, is_augmented=False):
    """Scan ``folder`` for .mp4 files and update the global frame statistics.

    Each file name is split from the right on "_" (three splits for augmented
    videos, two otherwise). Names that yield fewer than three parts are
    ignored. The label is the second part of the split. For every accepted
    video, ``label_counts``, ``label_max_frames`` and the global
    ``max_frames`` are updated with the frame count reported by OpenCV.
    """
    global max_frames
    split_limit = 3 if is_augmented else 2

    for video_file in os.listdir(folder):
        if not video_file.endswith(".mp4"):
            continue

        name_parts = video_file.rsplit("_", split_limit)
        if len(name_parts) < 3:
            continue
        label = name_parts[1]

        # Ask OpenCV for the frame count, then release the handle right away.
        capture = cv2.VideoCapture(os.path.join(folder, video_file))
        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        capture.release()

        label_counts[label] += 1
        if frame_count > label_max_frames[label]:
            label_max_frames[label] = frame_count
        if frame_count > max_frames:
            max_frames = frame_count

# Gather statistics from all three video folders; only the augmented folder
# uses the longer file-name pattern.
process_videos(raw_videos_path, is_augmented=False)
process_videos(shortened_videos_path, is_augmented=False)
process_videos(augmented_videos_path, is_augmented=True)

print(f"📏 Maximum number of frames: {max_frames}")

# Function to extract frames as Torch tensors
def extract_frames(video_path, device="cuda"):
    """Decode a video into a float32 tensor of shape (frames, 3, 224, 224).

    Every frame is resized to 224x224 and moved channel-first onto ``device``.
    If the video cannot be opened or yields no frames, a warning is printed
    and a single all-zero frame of shape (1, 3, 224, 224) is returned instead.
    """
    capture = cv2.VideoCapture(video_path)

    if not capture.isOpened():
        print(f"⚠️ Warning: Could not open video: {video_path}")
        return torch.zeros((1, 3, 224, 224), dtype=torch.float32, device=device)

    decoded = []
    while capture.isOpened():
        ok, image = capture.read()
        if not ok:
            break
        resized = cv2.resize(image, (224, 224))
        # OpenCV delivers (height, width, channels); permute to channels-first.
        as_tensor = torch.tensor(resized, dtype=torch.float32, device=device)
        decoded.append(as_tensor.permute(2, 0, 1))

    capture.release()

    if decoded:
        return torch.stack(decoded, dim=0)

    print(f"⚠️ Warning: No frames extracted for {video_path}")
    return torch.zeros((1, 3, 224, 224), dtype=torch.float32, device=device)

# Function to pad frames
def pad_frames(frames, target_length, device="cuda"):
    """Return ``frames`` with exactly ``target_length`` entries along dim 0.

    Shorter inputs get zero frames appended; longer inputs are cut to the
    first ``target_length`` frames.

    Args:
        frames: Tensor whose first dimension indexes frames.
        target_length: Desired number of frames.
        device: Kept for backward compatibility and ignored. The padding is
            always created with the shape, dtype and device of ``frames``,
            because concatenation needs both tensors on the same device.

    Returns:
        torch.Tensor: Tensor of shape (target_length, *frames.shape[1:]).
    """
    num_frames = frames.shape[0]

    if num_frames >= target_length:
        return frames[:target_length]

    padding = frames.new_zeros((target_length - num_frames, *frames.shape[1:]))
    return torch.cat((frames, padding), dim=0)

# Training configuration
# Same device selection as the model-setup cell, kept as a torch.device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 30  # Adjust as needed
patience = 10   # Early stopping patience (epochs without improvement)
best_val_loss = float('inf')
trigger_times = 0

for epoch in range(num_epochs):
    # ----- Training pass -----
    model.train()
    total_loss = 0.0
    for X_batch, y_batch, lengths in train_loader:
        X_batch, y_batch, lengths = X_batch.to(device), y_batch.to(device), lengths.to(device)

        # Forward pass; lengths are handed to the model alongside the batch
        outputs = model(X_batch, lengths)

        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # ----- Validation pass -----
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        # Separate loop names so the dataset-level X_val / y_val tensors are
        # not overwritten by the last validation batch.
        for X_val_batch, y_val_batch, val_lengths in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            val_lengths = val_lengths.cpu()  # Move lengths to CPU

            outputs_val = model(X_val_batch, val_lengths)
            loss_val = criterion(outputs_val, y_val_batch)
            total_val_loss += loss_val.item()

    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # LR scheduler: StepLR advances once per epoch on a fixed schedule; it
    # does not look at the validation loss.
    scheduler.step()

    # Early stopping: stop after `patience` consecutive epochs without a new
    # best validation loss. The model keeps its latest weights, not the best.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping triggered.")
            break

print("🚀 Normalization and Training completed!")

### ----- 5. Save Model & Evaluate -----

In [None]:
# Persist only the weights (state_dict), not the whole pickled module.
torch.save(obj=model.state_dict(), f="sign_language_lstm_state.pth")

# Rebuild an identically configured model, load the saved weights onto the
# active device, and switch to inference mode.
model = SignLanguageLSTM(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    output_dim=output_dim,
    dropout=dropout,
).to(device)
model.load_state_dict(
    torch.load(f="sign_language_lstm_state.pth", map_location=device)
)
model.eval()

### ----- 6. Testing & Inference -----

In [None]:
def predict(model, sample_input):
    """Return the predicted class index for a single un-batched sample.

    Args:
        model: Module that maps a (1, ...) batch to (1, num_classes) scores.
        sample_input: One sample, as a tensor or anything ``torch.tensor``
            accepts. Non-tensor input is converted to float32.

    Returns:
        int: Index of the highest-scoring class.
    """
    model.eval()  # Set model to evaluation mode

    # Ensure input is a PyTorch tensor
    if not isinstance(sample_input, torch.Tensor):
        sample_input = torch.tensor(sample_input, dtype=torch.float32)

    # Run on whatever device the model lives on instead of a module-level
    # `device` variable, which may be missing or point somewhere else.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else sample_input.device

    # Add the batch dimension the model expects
    batch = sample_input.unsqueeze(0).to(target_device)

    with torch.no_grad():  # Disable gradient computation for inference
        output = model(batch)

    return torch.argmax(output, dim=1).item()

# Example usage with test data
# X_test is empty when no test folder matched the label mapping, so check
# before indexing.
sample_idx = 0  # or any valid index in the test set
if len(X_test) > sample_idx:
    sample_input = X_test[sample_idx]
    predicted_label = predict(model, sample_input)
    print(f"Predicted Label (Test Data): {predicted_label}")
else:
    print("No test samples available; skipping example prediction.")