## Training via PyTorch

In [1]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Report the library versions this notebook was executed with
for version_label, version_value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA version:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
):
    print(version_label, version_value)

PyTorch version: 2.6.0+cu118
CUDA version: 11.8
cuDNN version: 90100


In [3]:
# List every CUDA device visible to this process, or say that there is none
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    gpu_count = torch.cuda.device_count()
    print("Number of GPUs:", gpu_count)
    for gpu_index in range(gpu_count):
        print("GPU {}: {}".format(gpu_index, torch.cuda.get_device_name(gpu_index)))

Number of GPUs: 2
GPU 0: NVIDIA RTX A6000
GPU 1: NVIDIA RTX A6000


### ----- 1. Load Data -----

In [4]:
# Root folder of the keypoint JSON output; each split has its own sub-folder
data_dir = "/home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/keypoints"
train_json_dir, val_json_dir, test_json_dir = (
    os.path.join(data_dir, split_subdir)
    for split_subdir in (
        "train/openpose_output/json",
        "val/openpose_output/json",
        "test/openpose_output/json",
    )
)

# Folder holding one label CSV per split
csv_path = "/home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/english_translation"
train_labels_csv, val_labels_csv, test_labels_csv = (
    os.path.join(csv_path, csv_name)
    for csv_name in (
        "how2sign_realigned_train.csv",
        "how2sign_realigned_val.csv",
        "how2sign_realigned_test.csv",
    )
)

In [5]:
# ----- Training Labels -----
train_label_df = pd.read_csv(train_labels_csv, sep="\t")

# Distinct SENTENCE_ID values in sorted order; the position in this list is the class index
unique_train_sentences = sorted(set(train_label_df["SENTENCE_ID"]))
sentence_to_id_train = dict(zip(unique_train_sentences, range(len(unique_train_sentences))))

# SENTENCE_NAME -> raw SENTENCE_ID string (not yet an index; resolve it through
# sentence_to_id_train). If a name occurs more than once, the last row wins.
label_mapping_train = dict(zip(train_label_df["SENTENCE_NAME"], train_label_df["SENTENCE_ID"]))

In [6]:
# ----- Validation Labels -----
val_label_df = pd.read_csv(val_labels_csv, sep="\t")

# Distinct SENTENCE_ID values in sorted order; the position in this list is the class index
# NOTE(review): this index space is built from the validation CSV only, so index k here
# does not refer to the same sentence as index k in sentence_to_id_train - confirm intended.
unique_val_sentences = sorted(set(val_label_df["SENTENCE_ID"]))
sentence_to_id_val = dict(zip(unique_val_sentences, range(len(unique_val_sentences))))

# SENTENCE_NAME -> raw SENTENCE_ID string (not yet an index; resolve it through
# sentence_to_id_val). If a name occurs more than once, the last row wins.
label_mapping_val = dict(zip(val_label_df["SENTENCE_NAME"], val_label_df["SENTENCE_ID"]))

In [7]:
# ----- Test Labels -----
test_label_df = pd.read_csv(test_labels_csv, sep="\t")

# Distinct SENTENCE_ID values in sorted order; the position in this list is the class index
# NOTE(review): this index space is built from the test CSV only, so index k here
# does not refer to the same sentence as index k in sentence_to_id_train - confirm intended.
unique_test_sentences = sorted(set(test_label_df["SENTENCE_ID"]))
sentence_to_id_test = dict(zip(unique_test_sentences, range(len(unique_test_sentences))))

# SENTENCE_NAME -> raw SENTENCE_ID string (not yet an index; resolve it through
# sentence_to_id_test). If a name occurs more than once, the last row wins.
label_mapping_test = dict(zip(test_label_df["SENTENCE_NAME"], test_label_df["SENTENCE_ID"]))

In [8]:
# Optional sanity check: show the first five (SENTENCE_NAME, SENTENCE_ID) pairs of each split
for preview_label, preview_mapping in (
    ("Train mapping example:", label_mapping_train),
    ("Val mapping example:", label_mapping_val),
    ("Test mapping example:", label_mapping_test),
):
    print(preview_label, list(preview_mapping.items())[:5])

Train mapping example: [('--7E2sU6zP4_10-5-rgb_front', '--7E2sU6zP4_10'), ('--7E2sU6zP4_11-5-rgb_front', '--7E2sU6zP4_11'), ('--7E2sU6zP4_12-5-rgb_front', '--7E2sU6zP4_12'), ('--7E2sU6zP4_13-5-rgb_front', '--7E2sU6zP4_13'), ('--7E2sU6zP4_5-5-rgb_front', '--7E2sU6zP4_5')]
Val mapping example: [('-d5dN54tH2E_0-1-rgb_front', '-d5dN54tH2E_0'), ('-d5dN54tH2E_1-1-rgb_front', '-d5dN54tH2E_1'), ('-d5dN54tH2E_10-1-rgb_front', '-d5dN54tH2E_10'), ('-d5dN54tH2E_11-1-rgb_front', '-d5dN54tH2E_11'), ('-d5dN54tH2E_12-1-rgb_front', '-d5dN54tH2E_12')]
Test mapping example: [('-fZc293MpJk_0-1-rgb_front', '-fZc293MpJk_0'), ('-fZc293MpJk_2-1-rgb_front', '-fZc293MpJk_2'), ('-fZc293MpJk_3-1-rgb_front', '-fZc293MpJk_3'), ('-fZc293MpJk_4-1-rgb_front', '-fZc293MpJk_4'), ('-fZc293MpJk_5-1-rgb_front', '-fZc293MpJk_5')]


In [9]:
def load_keypoints(json_folder, max_frames=100, required_dim=411):
    """
    Load per-frame keypoints from a folder of JSON files into a fixed-size tensor.

    Entries of ``json_folder`` are visited in sorted filename order and only names
    ending in ".json" are read. For every frame that lists at least one person, the
    first person's pose, face, left-hand and right-hand keypoint lists are
    concatenated and zero-padded or truncated to ``required_dim`` values. Frames
    without a detected person are skipped (they do not become zero rows). Reading
    stops as soon as ``max_frames`` usable frames have been collected.

    Args:
        json_folder (str): Directory containing one JSON file per video frame.
        max_frames (int): Number of rows in the returned tensor.
        required_dim (int): Number of values per frame in the returned tensor.

    Returns:
        torch.Tensor: float32 tensor of shape (max_frames, required_dim); rows past
        the last loaded frame are zero.
    """
    keypoint_fields = (
        "pose_keypoints_2d",
        "face_keypoints_2d",
        "hand_left_keypoints_2d",
        "hand_right_keypoints_2d",
    )

    # Pre-allocating the result makes the trailing rows the zero padding
    padded_sequence = torch.zeros((max_frames, required_dim), dtype=torch.float32)
    frames_loaded = 0

    for frame_file in sorted(os.listdir(json_folder)):
        # Anything beyond max_frames would be cut off anyway, so stop reading early
        if frames_loaded >= max_frames:
            break

        # Stray files (logs, checkpoints, sub-folders) must not reach json.load
        if not frame_file.endswith(".json"):
            continue

        frame_path = os.path.join(json_folder, frame_file)
        with open(frame_path, "r") as f:
            data = json.load(f)

        if "people" in data and len(data["people"]) > 0:
            person = data["people"][0]  # First detected person

            # Concatenate the body parts in a fixed order; a missing or null
            # entry contributes nothing
            full_keypoints = []
            for field in keypoint_fields:
                full_keypoints.extend(person.get(field) or [])

            # Pad or truncate to required_dim
            if len(full_keypoints) < required_dim:
                full_keypoints += [0.0] * (required_dim - len(full_keypoints))
            else:
                full_keypoints = full_keypoints[:required_dim]

            padded_sequence[frames_loaded] = torch.tensor(full_keypoints, dtype=torch.float32)
            frames_loaded += 1

    return padded_sequence

In [10]:
# Fixed sequence length (in frames) used for every sample; per the original note
# this value was chosen from an analysis of the dataset
MAX_FRAMES = 200

def pad_or_truncate(sequence, max_frames=MAX_FRAMES):
    """
    Force a 2-D (frames, features) tensor to exactly ``max_frames`` rows.

    Longer inputs keep their first ``max_frames`` rows; shorter inputs get zero
    rows appended that share the input's dtype and device.
    """
    frame_count, feature_count = sequence.shape
    if frame_count >= max_frames:
        return sequence[:max_frames]
    filler = sequence.new_zeros((max_frames - frame_count, feature_count))
    return torch.cat((sequence, filler), dim=0)

In [11]:
def process_data(json_dir, mapping, sentence_to_id, max_frames=MAX_FRAMES, verbose=True):
    """
    Build the input tensor and label tensor for one dataset split.

    Every sub-folder of ``json_dir`` whose name is a key of ``mapping`` and whose
    mapped sentence ID is a key of ``sentence_to_id`` becomes one sample: its
    keypoints are loaded with ``load_keypoints`` and its label is the index stored
    in ``sentence_to_id``. All other entries are skipped.

    Args:
        json_dir (str): Directory containing one sub-folder of frame JSONs per clip.
        mapping (dict): Folder name (SENTENCE_NAME) -> sentence ID.
        sentence_to_id (dict): Sentence ID -> integer class index.
        max_frames (int): Number of frames per sample in the returned tensor.
        verbose (bool): If True (default), print one line per processed or skipped
            entry. Set to False to keep large splits from flooding the output.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``X_data`` of shape
        (num_samples, max_frames, 411) and ``y_labels`` (int64) of shape
        (num_samples,). Both are empty when no folder qualifies.
    """
    X_data, y_labels = [], []

    print(f"\nChecking JSON directory: {json_dir}")
    json_folders = os.listdir(json_dir)
    print(f"Existing JSON folders: {json_folders[:5]}")

    for folder_name in json_folders:
        folder_path = os.path.join(json_dir, folder_name)

        if not os.path.isdir(folder_path):
            if verbose:
                print(f"Skipping '{folder_name}' (not a directory)")
            continue

        # Check if folder name exists in mapping
        if folder_name not in mapping:
            if verbose:
                print(f"Skipping: '{folder_name}' (not in mapping)")
            continue

        # Get the sentence ID directly (String)
        sentence_id = mapping[folder_name]

        # Check if sentence ID exists in sentence_to_id
        if sentence_id not in sentence_to_id:
            if verbose:
                print(f"Skipping: Sentence ID '{sentence_id}' (not in sentence_to_id)")
            continue

        if verbose:
            print(f"Processing: '{folder_name}' -> Sentence ID '{sentence_id}' -> Mapped ID {sentence_to_id[sentence_id]}")

        # Pass max_frames through: load_keypoints defaults to 100 frames, which
        # would silently discard everything after frame 100 before padding
        keypoints_sequence = load_keypoints(folder_path, max_frames=max_frames)
        # Guarantees the (max_frames, features) shape regardless of the loader
        keypoints_sequence = pad_or_truncate(keypoints_sequence, max_frames)

        X_data.append(keypoints_sequence)
        y_labels.append(sentence_to_id[sentence_id])

    if not X_data:
        print(f"\n⚠️  No valid data found in {json_dir} ⚠️")

    X_data = torch.stack(X_data) if X_data else torch.empty(0, max_frames, 411)
    y_labels = torch.tensor(y_labels, dtype=torch.long) if y_labels else torch.empty(0, dtype=torch.long)

    return X_data, y_labels

In [12]:
# The keys of sentence_to_id_train and the values of label_mapping_train must use
# the same ID format, otherwise process_data skips every folder
sentence_id_preview = list(sentence_to_id_train)[:5]
mapped_value_preview = list(label_mapping_train.values())[:5]
print("sentence_to_id example:", sentence_id_preview)
print("label_mapping values:", mapped_value_preview)

sentence_to_id example: ['--7E2sU6zP4_10', '--7E2sU6zP4_11', '--7E2sU6zP4_12', '--7E2sU6zP4_13', '--7E2sU6zP4_5']
label_mapping values: ['--7E2sU6zP4_10', '--7E2sU6zP4_11', '--7E2sU6zP4_12', '--7E2sU6zP4_13', '--7E2sU6zP4_5']


### ----- 2. Prepare Data for PyTorch -----

In [13]:
# Build the training split: X_train holds the keypoint sequences, y_train the class indices
X_train, y_train = process_data(
    json_dir=train_json_dir,
    mapping=label_mapping_train,
    sentence_to_id=sentence_to_id_train,
)


Checking JSON directory: /home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/keypoints/train/openpose_output/json
Existing JSON folders: ['02LnULLXds4_21-8-rgb_front', 'b1hxJIvcgls_1-8-rgb_front', '23LLbhn_Mjg_18-5-rgb_front', '1RKv7hJtf80_12-5-rgb_front', '-cv1gZaGtNk_20-5-rgb_front']
Processing: '02LnULLXds4_21-8-rgb_front' -> Sentence ID '02LnULLXds4_21' -> Mapped ID 2669
Processing: 'b1hxJIvcgls_1-8-rgb_front' -> Sentence ID 'b1hxJIvcgls_1' -> Mapped ID 26098
Processing: '23LLbhn_Mjg_18-5-rgb_front' -> Sentence ID '23LLbhn_Mjg_18' -> Mapped ID 15163
Processing: '1RKv7hJtf80_12-5-rgb_front' -> Sentence ID '1RKv7hJtf80_12' -> Mapped ID 9896
Processing: '-cv1gZaGtNk_20-5-rgb_front' -> Sentence ID '-cv1gZaGtNk_20' -> Mapped ID 1691
Processing: 'eBrlZcccILg_15-3-rgb_front' -> Sentence ID 'eBrlZcccILg_15' -> Mapped ID 29623
Processing: '1tryKCIRb40_2-8-rgb_front' -> Sentence ID '1tryKCIRb40_2' -> Mapped ID 14361
Processing: '1cSjZ5kNZw8_18-8-rgb_front' -> Sentence ID '1

In [14]:
# Inputs and labels must have the same number of samples
print("X_train samples:", X_train.size(0))
print("y_train samples:", y_train.shape[0])

X_train samples: 31047
y_train samples: 31047


In [15]:
# Build the validation split: X_val holds the keypoint sequences, y_val the class indices
X_val, y_val = process_data(
    json_dir=val_json_dir,
    mapping=label_mapping_val,
    sentence_to_id=sentence_to_id_val,
)


Checking JSON directory: /home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/keypoints/val/openpose_output/json
Existing JSON folders: ['-f1_kdl050s_19-1-rgb_front', 'a5yNwUSiYpA_8-5-rgb_front', '_5CV2fIG7qY_6-5-rgb_front', 'cw5evdziBB4_3-8-rgb_front', 'bIUmw2DVW7Q_17-3-rgb_front']
Processing: '-f1_kdl050s_19-1-rgb_front' -> Sentence ID '-f1_kdl050s_19' -> Mapped ID 27
Processing: 'a5yNwUSiYpA_8-5-rgb_front' -> Sentence ID 'a5yNwUSiYpA_8' -> Mapped ID 977
Processing: '_5CV2fIG7qY_6-5-rgb_front' -> Sentence ID '_5CV2fIG7qY_6' -> Mapped ID 917
Processing: 'cw5evdziBB4_3-8-rgb_front' -> Sentence ID 'cw5evdziBB4_3' -> Mapped ID 1264
Processing: 'bIUmw2DVW7Q_17-3-rgb_front' -> Sentence ID 'bIUmw2DVW7Q_17' -> Mapped ID 1076
Processing: 'DfnHNkTE7mE_14-5-rgb_front' -> Sentence ID 'DfnHNkTE7mE_14' -> Mapped ID 757
Processing: 'ETOZLBScxWY_6-3-rgb_front' -> Sentence ID 'ETOZLBScxWY_6' -> Mapped ID 818
Processing: 'ETOZLBScxWY_1-5-rgb_front' -> Sentence ID 'ETOZLBScxWY_1' -> Ma

In [16]:
# Build the test split: X_test holds the keypoint sequences, y_test the class indices
X_test, y_test = process_data(
    json_dir=test_json_dir,
    mapping=label_mapping_test,
    sentence_to_id=sentence_to_id_test,
)


Checking JSON directory: /home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/keypoints/test/openpose_output/json
Existing JSON folders: ['G1LiGqM3FhM_6-8-rgb_front', 'G2hnUeetWcc_18-5-rgb_front', 'G3di9jJTqDs_1-10-rgb_front', '_g0fpC8aiME_0-5-rgb_front', 'G1lNlhjWC1I_11-8-rgb_front']
Processing: 'G1LiGqM3FhM_6-8-rgb_front' -> Sentence ID 'G1LiGqM3FhM_6' -> Mapped ID 514
Processing: 'G2hnUeetWcc_18-5-rgb_front' -> Sentence ID 'G2hnUeetWcc_18' -> Mapped ID 682
Processing: 'G3di9jJTqDs_1-10-rgb_front' -> Sentence ID 'G3di9jJTqDs_1' -> Mapped ID 925
Processing: '_g0fpC8aiME_0-5-rgb_front' -> Sentence ID '_g0fpC8aiME_0' -> Mapped ID 1102
Processing: 'G1lNlhjWC1I_11-8-rgb_front' -> Sentence ID 'G1lNlhjWC1I_11' -> Mapped ID 541
Processing: 'G23G21G49dk_8-2-rgb_front' -> Sentence ID 'G23G21G49dk_8' -> Mapped ID 591
Processing: 'FzUdcaxw_vs_9-2-rgb_front' -> Sentence ID 'FzUdcaxw_vs_9' -> Mapped ID 257
Processing: 'g0yUlOaqL6k_4-3-rgb_front' -> Sentence ID 'g0yUlOaqL6k_4' -> M

In [17]:
# Wrap each split in a DataLoader that yields (inputs, labels) batches of 16.
# Only the training data is reshuffled every epoch; validation and test keep their order.
train_loader = DataLoader(
    dataset=TensorDataset(X_train, y_train),
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=TensorDataset(X_val, y_val),
    batch_size=16,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=TensorDataset(X_test, y_test),
    batch_size=16,
    shuffle=False,
)

### ----- 3. Define LSTM Model -----

In [18]:
# Define an LSTM-based model for sequence classification
class SignLanguageLSTM(nn.Module):
    """
    Stacked LSTM followed by a linear classifier over the final hidden state.

    Args:
        input_dim (int): Number of features per frame.
        hidden_dim (int): Size of the LSTM hidden state.
        num_layers (int): Number of stacked LSTM layers.
        output_dim (int): Number of output classes (logits per sample).
        dropout (float): Dropout between LSTM layers; forced to 0 for one layer.
    """

    def __init__(self, input_dim=411, hidden_dim=256, num_layers=2, output_dim=30814, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # nn.LSTM applies dropout only between stacked layers; with a single layer
        # a non-zero value has no effect and only triggers a warning
        dropout = dropout if num_layers > 1 else 0

        # LSTM layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths=None):
        """
        Compute class logits for a batch of sequences.

        Args:
            x (torch.Tensor): Batch-first input, (batch, frames, input_dim).
            lengths (list[int] | torch.Tensor | None): Number of valid frames per
                sample. When given, padded frames are excluded from the recurrence;
                when None, every frame of ``x`` (padding included) is processed.

        Returns:
            torch.Tensor: Logits of shape (batch, output_dim).
        """
        if lengths is not None:
            # pack_padded_sequence requires the lengths on the CPU, even when x is on the GPU
            cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            packed_input = nn.utils.rnn.pack_padded_sequence(
                x, cpu_lengths, batch_first=True, enforce_sorted=False
            )
            _, (h_n, _) = self.lstm(packed_input)
        else:
            _, (h_n, _) = self.lstm(x)

        # h_n[-1] is the top layer's hidden state at each sequence's last processed
        # step, i.e. the last valid output, returned in the original batch order
        last_outputs = h_n[-1]

        return self.fc(last_outputs)

In [19]:
# Model parameters
input_dim = 411            # Number of feature values per frame
hidden_dim = 256           # Number of hidden units in LSTM
num_layers = 2             # Number of LSTM layers
# Training labels are the indices stored in sentence_to_id_train, so the classifier
# needs exactly one output per entry of that mapping (not one per SENTENCE_NAME)
output_dim = len(sentence_to_id_train)
dropout = 0.3              # Dropout for regularization

# Disables cuDNN for the whole process (kept from debugging).
# NOTE(review): this presumably slows LSTM training noticeably on GPU - re-enable
# once the debugging reason no longer applies.
torch.backends.cudnn.enabled = False

# Create model and move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device: ", device)
model = SignLanguageLSTM(input_dim, hidden_dim, num_layers, output_dim, dropout).to(device)

# Define loss function (CrossEntropyLoss for classification)
criterion = nn.CrossEntropyLoss()

# Define optimizer (Adam works well for LSTMs)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# LR scheduler: multiply the learning rate by 0.1 after 5 epochs without improvement
# of the monitored value. The deprecated `verbose` argument is intentionally not
# passed; read the current rate from optimizer.param_groups[0]["lr"] instead.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

device:  cuda




In [20]:
# Smoke test before training: push one training batch through the model and
# compare the label range with the width of the output layer
X_batch, y_batch = (batch_part.to(device) for batch_part in next(iter(train_loader)))
print("y_batch min:", torch.min(y_batch).item(), "y_batch max:", torch.max(y_batch).item())
outputs = model(X_batch)
print("Output shape:", outputs.shape)

y_batch min: 109 y_batch max: 29688
Output shape: torch.Size([16, 31165])


### ----- 4. Training -----

In [None]:
num_epochs = 200  # Upper bound; early stopping may end training sooner
patience = 10   # Early stopping patience (epochs without validation improvement)
best_val_loss = float('inf')
best_state_dict = None  # Copy of the weights that produced best_val_loss
trigger_times = 0

for epoch in range(num_epochs):
    # ----- Training pass -----
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # ----- Validation pass -----
    # The loop variables deliberately differ from X_val / y_val so the full
    # validation tensors are not overwritten by a single batch
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            outputs_val = model(X_val_batch)
            loss_val = criterion(outputs_val, y_val_batch)
            total_val_loss += loss_val.item()
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # LR scheduler: update based on validation loss and report any change
    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(avg_val_loss)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after != lr_before:
        print(f"Learning rate changed: {lr_before:.2e} -> {lr_after:.2e}")

    # Early stopping: check whether the validation loss improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger_times = 0
        # state_dict() returns references to the live parameters, so clone them
        best_state_dict = {
            param_name: param_value.detach().clone()
            for param_name, param_value in model.state_dict().items()
        }
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping triggered.")
            break

# Continue with the best validation weights rather than the last epoch's
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)

### ----- 5. Save Model & Evaluate -----

In [None]:
# Persist only the parameters (state_dict), not the pickled model object
state_path = "sign_language_lstm_state.pth"
torch.save(model.state_dict(), state_path)

# Round trip: read the saved parameters back and load them into a fresh instance
restored_state = torch.load(state_path, map_location=device)
model = SignLanguageLSTM(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    output_dim=output_dim,
    dropout=dropout,
).to(device)
model.load_state_dict(restored_state)
model.eval()

### ----- 6. Testing & Inference -----

In [None]:
def predict(model, sample_input):
    """
    Return the predicted class index for a single, unbatched sample.

    Args:
        model (nn.Module): Classifier returning logits of shape (batch, classes).
            It is switched to evaluation mode and left in that mode.
        sample_input (torch.Tensor | array-like): One sample without a batch
            dimension; non-tensor input is converted to a float32 tensor.

    Returns:
        int: Index of the largest logit.
    """
    model.eval()  # Set model to evaluation mode

    # Ensure input is a PyTorch tensor
    if not isinstance(sample_input, torch.Tensor):
        sample_input = torch.tensor(sample_input, dtype=torch.float32)

    # Run on whatever device the model's parameters are on, instead of relying on
    # a notebook-global `device`; a parameter-free model keeps the input's device
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else sample_input.device

    sample_input = sample_input.unsqueeze(0).to(target_device)  # Add batch dimension

    with torch.no_grad():  # Disable gradient computation for inference
        output = model(sample_input)
        predicted_label = torch.argmax(output, dim=1).item()

    return predicted_label

# Example: classify one sample from the test split.
# X_test must contain at least one sample; index 0 picks the first one.
sample_idx = 0  # any valid index into X_test works
sample_input = X_test[sample_idx]
predicted_label = predict(model=model, sample_input=sample_input)
print(f"Predicted Label (Test Data): {predicted_label}")