## Training via PyTorch

In [None]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Print the PyTorch, CUDA and cuDNN versions as reported by torch.
for caption, version in (
    ("PyTorch version:", torch.__version__),
    ("CUDA version:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
):
    print(caption, version)

In [None]:
# List every GPU visible to torch, or say that CUDA cannot be used.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    gpu_count = torch.cuda.device_count()
    print("Number of GPUs:", gpu_count)
    for gpu_index in range(gpu_count):
        print("GPU {}: {}".format(gpu_index, torch.cuda.get_device_name(gpu_index)))

### ----- 1. Load Data -----

In [None]:
# Root folders of the keypoint JSON files and of the label CSV files.
data_dir = "/home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/keypoints"
csv_path = "/home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/english_translation"

# Per-split folders holding one sub-folder of JSON frames per sentence.
train_json_dir, val_json_dir, test_json_dir = (
    os.path.join(data_dir, relative)
    for relative in (
        "train/openpose_output/json",
        "val/openpose_output/json",
        "test/openpose_output/json",
    )
)

# Per-split label files.
train_labels_csv, val_labels_csv, test_labels_csv = (
    os.path.join(csv_path, file_name)
    for file_name in (
        "how2sign_realigned_train.csv",
        "how2sign_realigned_val.csv",
        "how2sign_realigned_test.csv",
    )
)

In [None]:
# ----- Training Labels -----
# The label file is tab-separated; only SENTENCE_ID and SENTENCE_NAME are used here.
train_label_df = pd.read_csv(train_labels_csv, delimiter="\t")

# Distinct SENTENCE_ID values in sorted order; the position in this list
# becomes the 0-based class index of that sentence.
unique_train_sentences = sorted(set(train_label_df["SENTENCE_ID"]))
sentence_to_id_train = dict(
    zip(unique_train_sentences, range(len(unique_train_sentences)))
)

# SENTENCE_NAME -> class index of the SENTENCE_ID found on the same row.
label_mapping_train = dict(
    zip(
        train_label_df["SENTENCE_NAME"],
        (sentence_to_id_train[sid] for sid in train_label_df["SENTENCE_ID"]),
    )
)

In [None]:
# ----- Validation Labels -----
# The label file is tab-separated; only SENTENCE_ID and SENTENCE_NAME are used here.
val_label_df = pd.read_csv(val_labels_csv, delimiter="\t")

# Distinct SENTENCE_ID values in sorted order; the position in this list
# becomes the 0-based class index of that sentence.
# NOTE(review): the indices are derived from the validation file alone, so
# index k here need not denote the same sentence as index k of a mapping
# built from another label file -- confirm this is intended.
unique_val_sentences = sorted(set(val_label_df["SENTENCE_ID"]))
sentence_to_id_val = dict(
    zip(unique_val_sentences, range(len(unique_val_sentences)))
)

# SENTENCE_NAME -> class index of the SENTENCE_ID found on the same row.
label_mapping_val = dict(
    zip(
        val_label_df["SENTENCE_NAME"],
        (sentence_to_id_val[sid] for sid in val_label_df["SENTENCE_ID"]),
    )
)

In [None]:
# ----- Test Labels -----
# The label file is tab-separated; only SENTENCE_ID and SENTENCE_NAME are used here.
test_label_df = pd.read_csv(test_labels_csv, delimiter="\t")

# Distinct SENTENCE_ID values in sorted order; the position in this list
# becomes the 0-based class index of that sentence.
# NOTE(review): the indices are derived from the test file alone, so index k
# here need not denote the same sentence as index k of a mapping built from
# another label file -- confirm this is intended.
unique_test_sentences = sorted(set(test_label_df["SENTENCE_ID"]))
sentence_to_id_test = dict(
    zip(unique_test_sentences, range(len(unique_test_sentences)))
)

# SENTENCE_NAME -> class index of the SENTENCE_ID found on the same row.
label_mapping_test = dict(
    zip(
        test_label_df["SENTENCE_NAME"],
        (sentence_to_id_test[sid] for sid in test_label_df["SENTENCE_ID"]),
    )
)

In [None]:
# Optional sanity check: show the first five (SENTENCE_NAME, class index)
# pairs of each split's mapping.
for caption, split_mapping in (
    ("Train mapping example:", label_mapping_train),
    ("Val mapping example:", label_mapping_val),
    ("Test mapping example:", label_mapping_test),
):
    print(caption, list(split_mapping.items())[:5])

In [None]:
def load_keypoints(json_folder, max_frames=100, required_dim=411):
    """
    Load per-frame keypoints from a folder of JSON files as a fixed-size tensor.

    Files are visited in sorted name order. For every frame that contains at
    least one entry under "people", the pose, face, left-hand and right-hand
    keypoint lists of the FIRST person are concatenated (in that order) and
    zero-padded or truncated to `required_dim` values. Frames without any
    person are skipped, so they do not occupy a row in the result.

    Args:
        json_folder: Directory containing one JSON file per frame. Entries
            that do not end in ".json" are ignored.
        max_frames: Number of rows of the returned tensor. Shorter sequences
            are zero-padded at the end, longer ones are cut off.
        required_dim: Number of values per frame. The default of 411
            presumably corresponds to (25 + 70 + 21 + 21) keypoints x 3
            values -- confirm against the keypoint extractor configuration.

    Returns:
        torch.Tensor: float32 tensor of shape (max_frames, required_dim);
        all zeros if no usable frame was found.
    """
    part_keys = (
        "pose_keypoints_2d",
        "face_keypoints_2d",
        "hand_left_keypoints_2d",
        "hand_right_keypoints_2d",
    )
    frame_rows = []

    for frame_file in sorted(os.listdir(json_folder)):
        # Later frames would be truncated anyway, so do not parse them.
        if len(frame_rows) >= max_frames:
            break
        # Stray files (notes, hidden files, ...) would make json.load fail.
        if not frame_file.endswith(".json"):
            continue

        frame_path = os.path.join(json_folder, frame_file)
        with open(frame_path, "r") as f:
            data = json.load(f)

        people = data.get("people") if isinstance(data, dict) else None
        if not people:
            continue
        person = people[0]  # First detected person

        # Concatenate all body parts; a missing part contributes nothing.
        full_keypoints = []
        for key in part_keys:
            full_keypoints.extend(person.get(key) or [])

        # Pad or truncate to required_dim
        if len(full_keypoints) < required_dim:
            full_keypoints.extend([0.0] * (required_dim - len(full_keypoints)))
        else:
            full_keypoints = full_keypoints[:required_dim]
        frame_rows.append(full_keypoints)

    # Start from zeros so that missing trailing frames are already padded.
    padded_sequence = torch.zeros((max_frames, required_dim), dtype=torch.float32)
    if frame_rows:
        seq_tensor = torch.tensor(frame_rows, dtype=torch.float32)
        padded_sequence[:seq_tensor.shape[0]] = seq_tensor

    return padded_sequence

In [None]:
# Fixed number of frames every sequence is brought to (set based on dataset analysis)
MAX_FRAMES = 200


def pad_or_truncate(sequence, max_frames=MAX_FRAMES):
    """Return `sequence` with exactly `max_frames` rows.

    `sequence` must be 2-D (frames, features). Longer inputs are cut after
    `max_frames` rows; shorter ones get zero rows appended, created with the
    same dtype and on the same device as the input.
    """
    num_frames, num_features = sequence.shape
    if num_frames >= max_frames:
        return sequence[:max_frames]
    filler = sequence.new_zeros((max_frames - num_frames, num_features))
    return torch.cat((sequence, filler), dim=0)

In [None]:
def process_data(json_dir, mapping, sentence_to_id, max_frames=MAX_FRAMES):
    """
    Build the feature and label tensors for one dataset split.

    Every sub-folder of `json_dir` whose name is a key of `mapping` yields one
    sample: its keypoint sequence (loaded with `max_frames` frames) and its
    class index. A feature tensor is only added together with its label, so
    both outputs always have the same number of entries.

    Args:
        json_dir: Directory with one sub-folder of frame files per sentence.
        mapping: Sentence folder name -> label. An integer label is taken as
            the final class index; any other label is treated as a raw
            sentence id and translated through `sentence_to_id`.
        sentence_to_id: Raw sentence id -> class index; only consulted for
            non-integer labels. Samples whose label cannot be translated are
            skipped.
        max_frames: Number of frames per returned sequence.

    Returns:
        (X_data, y_labels): stacked sequences of shape
        (num_samples, max_frames, feature_dim) and an int64 tensor of shape
        (num_samples,).

    Raises:
        RuntimeError: If no sample could be built from `json_dir`.
    """
    X_data, y_labels = [], []
    # Sorted so that the sample order does not depend on the file system.
    for sentence_name in sorted(os.listdir(json_dir)):
        if sentence_name not in mapping:
            continue
        sentence_folder = os.path.join(json_dir, sentence_name)
        if not os.path.isdir(sentence_folder):
            continue

        # Resolve the label BEFORE loading, so that a sample without a usable
        # label never leaves a feature tensor behind without a matching label.
        label = mapping[sentence_name]
        if not isinstance(label, (int, np.integer)):
            label = sentence_to_id.get(label)
            if label is None:
                continue

        # Pass max_frames through: the loader's own default is shorter and
        # would silently cut the sequence before it is padded here.
        keypoints_sequence = load_keypoints(sentence_folder, max_frames)
        X_data.append(pad_or_truncate(keypoints_sequence, max_frames))
        y_labels.append(int(label))

    if not X_data:
        # torch.stack cannot handle an empty list; fail with a clear message.
        raise RuntimeError(f"No data found in {json_dir}")

    X_data = torch.stack(X_data)
    y_labels = torch.tensor(y_labels, dtype=torch.long)
    return X_data, y_labels

### ----- 2. Prepare Data for PyTorch -----

In [None]:
# Build the training tensors (keypoint sequences and class indices)
X_train, y_train = process_data(
    json_dir=train_json_dir,
    mapping=label_mapping_train,
    sentence_to_id=sentence_to_id_train,
)

In [None]:
# Sanity check: number of feature sequences vs. number of labels.
for caption, count in (
    ("X_train samples:", X_train.shape[0]),
    ("y_train samples:", len(y_train)),
):
    print(caption, count)

In [None]:
# Build the validation tensors (keypoint sequences and class indices)
X_val, y_val = process_data(
    json_dir=val_json_dir,
    mapping=label_mapping_val,
    sentence_to_id=sentence_to_id_val,
)

In [None]:
# Build the test tensors (keypoint sequences and class indices)
X_test, y_test = process_data(
    json_dir=test_json_dir,
    mapping=label_mapping_test,
    sentence_to_id=sentence_to_id_test,
)

In [None]:
# Wrap each split in a TensorDataset and serve it in batches of 16;
# only the training split is shuffled.
train_loader = DataLoader(
    dataset=TensorDataset(X_train, y_train),
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=TensorDataset(X_val, y_val),
    batch_size=16,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=TensorDataset(X_test, y_test),
    batch_size=16,
    shuffle=False,
)

### ----- 3. Define LSTM Model -----

In [None]:
# LSTM-based model for sequence classification
class SignLanguageLSTM(nn.Module):
    """
    Stacked LSTM followed by a linear layer that scores every class.

    Args:
        input_dim: Number of features per frame.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of classes (size of the returned logit vector).
        dropout: Dropout between LSTM layers; forced to 0 for a single layer.
    """

    def __init__(self, input_dim=411, hidden_dim=256, num_layers=2, output_dim=30814, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Dropout in nn.LSTM only acts between stacked layers, so it is
        # switched off for a single-layer model (PyTorch warns otherwise).
        dropout = dropout if num_layers > 1 else 0

        # LSTM layer; inputs are expected as (batch, frames, features)
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths=None):
        """
        Compute class logits for a batch of sequences.

        Args:
            x: Tensor of shape (batch, frames, input_dim).
            lengths: Optional number of valid frames per sequence (list or
                1-D integer tensor on any device). When given, padded frames
                are ignored and the output at each sequence's last valid
                frame is used; otherwise the output at the final time step
                is used for every sequence.

        Returns:
            Tensor of shape (batch, output_dim) with unnormalised scores.
        """
        if lengths is None:
            lstm_out, _ = self.lstm(x)
            return self.fc(lstm_out[:, -1, :])

        # pack_padded_sequence requires the lengths as an int64 tensor on the
        # CPU, even when x itself lives on the GPU.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()

        # Pack the sequence to ignore padded frames
        packed_input = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        packed_output, _ = self.lstm(packed_input)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)

        # Pick the last valid output of every sequence in one indexing step.
        batch_index = torch.arange(lstm_out.size(0), device=lstm_out.device)
        last_index = (lengths - 1).to(lstm_out.device)
        last_outputs = lstm_out[batch_index, last_index]

        return self.fc(last_outputs)

In [None]:
# Model parameters
input_dim = 411            # Number of keypoint values per frame
hidden_dim = 256           # Number of hidden units in LSTM
num_layers = 2             # Number of LSTM layers
# Number of classes = number of distinct class indices. label_mapping_train
# has one entry per SENTENCE_NAME, so its size counts samples, not classes.
output_dim = len(sentence_to_id_train)
dropout = 0.3              # Dropout for regularization

# cuDNN is disabled for debugging purposes.
# NOTE(review): this stays in effect for the training below -- re-enable it
# once debugging is finished if it is not needed as a workaround.
torch.backends.cudnn.enabled = False

# Create model and move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: ", device)
model = SignLanguageLSTM(input_dim, hidden_dim, num_layers, output_dim, dropout).to(device)

# Define loss function (CrossEntropyLoss for classification)
criterion = nn.CrossEntropyLoss()

# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# LR scheduler: multiply the learning rate by 0.1 when the monitored value has
# not decreased for more than 5 epochs. The `verbose` argument is deprecated in
# recent PyTorch releases and is therefore not passed.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

In [None]:
# Smoke test before training: push one training batch through the model and
# show the label range and the shape of the model output.
X_batch, y_batch = (tensor.to(device) for tensor in next(iter(train_loader)))
print("y_batch min:", y_batch.min().item(), "y_batch max:", y_batch.max().item())
outputs = model(X_batch)
print("Output shape:", outputs.shape)

### ----- 4. Training -----

In [None]:
num_epochs = 200  # Adjust as needed
patience = 10   # Early stopping patience (epochs without improvement)
best_val_loss = float('inf')
best_state = None  # Copy of the weights that achieved best_val_loss
trigger_times = 0

for epoch in range(num_epochs):
    # ----- Training pass -----
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # ----- Validation pass -----
    # The loop variables get their own names so that the module-level
    # X_val / y_val tensors are not overwritten by the last batch.
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            outputs_val = model(X_val_batch)
            loss_val = criterion(outputs_val, y_val_batch)
            total_val_loss += loss_val.item()
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # LR scheduler: update based on the validation loss
    scheduler.step(avg_val_loss)

    # Early stopping: check whether the validation loss has improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger_times = 0
        # Keep a copy of the best weights; the live tensors keep changing.
        best_state = {
            key: value.detach().clone()
            for key, value in model.state_dict().items()
        }
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping triggered.")
            break

# Continue with the weights of the best epoch instead of those of the last
# (non-improving) epochs.
if best_state is not None:
    model.load_state_dict(best_state)

### ----- 5. Save Model & Evaluate -----

In [None]:
# Persist only the weights (state_dict), not the whole model object
checkpoint_file = "sign_language_lstm_state.pth"
torch.save(model.state_dict(), checkpoint_file)

# Rebuild a model with the same hyperparameters and load the saved weights
model = SignLanguageLSTM(input_dim, hidden_dim, num_layers, output_dim, dropout)
model = model.to(device)
saved_state = torch.load(checkpoint_file, map_location=device)
model.load_state_dict(saved_state)
model.eval()

### ----- 6. Testing & Inference -----

In [None]:
def predict(model, sample_input):
    """
    Return the predicted class index for a single, unbatched sample.

    Args:
        model: Module that maps a batch of inputs to per-class scores of
            shape (batch, num_classes).
        sample_input: One sample without batch dimension, as a tensor or as
            anything `torch.tensor` can convert (converted to float32).

    Returns:
        int: Index of the highest-scoring class.

    Side effects:
        Puts `model` into evaluation mode.
    """
    model.eval()  # Set model to evaluation mode

    # Ensure input is a PyTorch tensor
    if not isinstance(sample_input, torch.Tensor):
        sample_input = torch.tensor(sample_input, dtype=torch.float32)

    # Run on the device the model's weights live on instead of relying on a
    # module-level variable; a parameter-free model falls back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    batch = sample_input.unsqueeze(0).to(target_device)  # Add batch dimension

    with torch.no_grad():  # Disable gradient computation for inference
        output = model(batch)

    return torch.argmax(output, dim=1).item()

# Example: classify one sequence from the test set.
# X_test must contain at least one sample; sample_idx may be any valid index.
sample_idx = 0
sample_input = X_test[sample_idx]
predicted_label = predict(model=model, sample_input=sample_input)
print(f"Predicted Label (Test Data): {predicted_label}")