## Training via PyTorch

In [2]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [3]:
# Select the compute device: CUDA when a GPU is visible, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


### ----- 1. Load Data -----

In [5]:
# Root of the OpenPose keypoint export for the How2Sign training split, with
# its "json" and "video" sub-folders.
data_dir = "/home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/keypoints/train/openpose_output"
json_dir, video_dir = (os.path.join(data_dir, subfolder) for subfolder in ("json", "video"))

# Folder and file name of the label CSV (realigned training split).
csv_path = "/home/haggenmueller/asl_detection/machine_learning/datasets/how2sign/english_translation"
labels_csv = os.path.join(csv_path, "how2sign_realigned_train.csv")

In [6]:
# Load the tab-separated label table that links each clip name to its sentence ID
label_df = pd.read_csv(labels_csv, delimiter="\t")

# Sort the distinct sentence IDs before enumerating them. Iterating a bare set
# yields an order that can change from one Python process to the next (string
# hashing is randomised), which would make the class indices irreproducible and
# incompatible with any model saved in an earlier session.
unique_sentences = sorted(set(label_df["SENTENCE_ID"]))
sentence_to_index = {sentence: idx for idx, sentence in enumerate(unique_sentences)}

# Map every clip name (SENTENCE_NAME) to the class index of its SENTENCE_ID
label_mapping = {
    name: sentence_to_index[sentence_id]
    for name, sentence_id in zip(label_df["SENTENCE_NAME"], label_df["SENTENCE_ID"])
}

In [5]:
# print(list(label_mapping.items())[0])

In [6]:
# for i, (key,value) in enumerate(label_mapping.items()):
#     print(f"Key: {key}, Value:{value}")
#     if(i == 5):
#         break

In [None]:
def load_keypoints(json_folder, max_frames=100, feature_dim=411):
    """
    Load per-frame keypoints from a folder of JSON files as one fixed-size tensor.

    Files are processed in lexicographic filename order. For every frame, the
    pose, face, left-hand and right-hand keypoint lists of the first detected
    person are concatenated into a single feature vector. Frames that contain
    no detected person are skipped and do not occupy a row in the result.

    Args:
        json_folder (str): Directory holding one ``.json`` file per frame.
            Entries without a ``.json`` extension are ignored.
        max_frames (int): Number of rows of the returned tensor. Longer
            sequences are truncated, shorter ones are zero-padded at the end.
        feature_dim (int): Length of each frame's feature vector. Shorter
            keypoint lists are zero-padded, longer ones are truncated.

    Returns:
        torch.Tensor: float32 tensor of shape (max_frames, feature_dim).
    """
    # Pre-allocated output; rows that never get written stay zero (= padding)
    padded_sequence = np.zeros((max_frames, feature_dim), dtype=np.float32)
    keypoint_fields = (
        "pose_keypoints_2d",
        "face_keypoints_2d",
        "hand_left_keypoints_2d",
        "hand_right_keypoints_2d",
    )
    row = 0

    for frame_file in sorted(os.listdir(json_folder)):
        if row >= max_frames:
            break  # Everything past max_frames would be truncated anyway
        if not frame_file.lower().endswith(".json"):
            continue  # A stray non-JSON entry must not abort the whole clip

        frame_path = os.path.join(json_folder, frame_file)
        with open(frame_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        people = data.get("people") if isinstance(data, dict) else None
        if not people:
            continue  # No person detected in this frame

        person = people[0]  # First detected person

        # Concatenate the four keypoint groups in a fixed order
        full_keypoints = []
        for field in keypoint_fields:
            full_keypoints.extend(person.get(field) or [])

        # Enforce the fixed feature size: the untouched tail stays zero when
        # keypoints are missing, and surplus values are dropped so that every
        # row has the same length.
        used = min(len(full_keypoints), feature_dim)
        padded_sequence[row, :used] = full_keypoints[:used]
        row += 1

    return torch.from_numpy(padded_sequence)  # Shape: (max_frames, feature_dim)


In [None]:
# Maximum number of frames per sequence (set based on dataset analysis)
MAX_FRAMES = 200


def pad_or_truncate(sequence, max_frames=MAX_FRAMES):
    """Return `sequence` with exactly `max_frames` rows.

    Args:
        sequence: 2-D array-like of shape (num_frames, num_features).
        max_frames (int): Required number of rows.

    Returns:
        np.ndarray: Array of shape (max_frames, num_features) with the same
        dtype as the input. Longer inputs are cut after `max_frames` rows,
        shorter inputs are extended with zero rows at the end.
    """
    sequence = np.asarray(sequence)
    num_frames, num_features = sequence.shape

    if num_frames >= max_frames:
        return sequence[:max_frames, :]  # Truncate sequence

    # Pad with zeros of the input dtype. np.zeros defaults to float64, which
    # would silently upcast a float32 sequence and double its memory footprint.
    pad = np.zeros((max_frames - num_frames, num_features), dtype=sequence.dtype)
    return np.vstack((sequence, pad))

# Collect one fixed-length keypoint sequence and one label per clip folder.
# Features and labels are appended in the same step, so X_data[i] always
# belongs to y_labels[i]: a clip is either included completely or skipped.
X_data, y_labels = [], []

for sentence_name in sorted(os.listdir(json_dir)):  # sorted: reproducible sample order
    sentence_folder = os.path.join(json_dir, sentence_name)

    if sentence_name not in label_mapping or not os.path.isdir(sentence_folder):
        continue

    # Ask for MAX_FRAMES explicitly. Relying on load_keypoints' own default
    # (100 frames) would cut every clip to that length first, and the padding
    # step below would then only append zero rows up to MAX_FRAMES.
    keypoints_sequence = load_keypoints(sentence_folder, max_frames=MAX_FRAMES).numpy()

    keypoints_sequence = pad_or_truncate(keypoints_sequence)  # Ensure fixed length
    X_data.append(np.asarray(keypoints_sequence, dtype=np.float32))
    y_labels.append(label_mapping[sentence_name])

# Re-index the labels that actually occur in the loaded data to 0..K-1.
# Building this mapping from every CSV entry instead would leave gaps whenever
# a clip listed in the CSV has no keypoint folder, and non-contiguous class
# indices can exceed the size of an output layer that is sized by class count.
unique_sentences = sorted(set(y_labels))
sentence_to_id = {sentence: idx for idx, sentence in enumerate(unique_sentences)}

# Convert to PyTorch tensors. The list names are rebound so the per-clip arrays
# can be garbage-collected once the stacked tensor exists.
if X_data:
    X_data = torch.from_numpy(np.stack(X_data))
else:
    X_data = torch.zeros((0, MAX_FRAMES, 411), dtype=torch.float32)
y_labels = torch.tensor([sentence_to_id[label] for label in y_labels], dtype=torch.long)  # Classification labels as IDs


In [9]:
# Summarise the assembled dataset: feature tensor shape, number of distinct
# class IDs, and the first label as an example.
print("Shape von X_data:", X_data.size())
print("Anzahl eindeutiger Labels:", len({*y_labels.numpy()}))
print("Beispiel eines Labels:", y_labels[0])

Shape von X_data: torch.Size([31047, 200, 411])
Anzahl eindeutiger Labels: 30814
Beispiel eines Labels: tensor(4665)


### ----- 2. Prepare Data for PyTorch -----

In [None]:
# Normalise y_labels to a LongTensor of class indices.
if isinstance(y_labels, torch.Tensor):
    # Already numeric. Use .to() instead of torch.tensor(tensor), which emits
    # a copy-construct UserWarning.
    y_labels = y_labels.to(dtype=torch.long)
else:
    label_values = list(y_labels)
    if label_values and isinstance(label_values[0], str):
        # String labels: assign indices in sorted order so the mapping is reproducible
        unique_labels = {label: idx for idx, label in enumerate(sorted(set(label_values)))}
        label_values = [unique_labels[label] for label in label_values]
    y_labels = torch.as_tensor(np.asarray(label_values), dtype=torch.long)

# Fail early with a readable message instead of TensorDataset's generic size assertion
if len(X_data) != len(y_labels):
    raise ValueError(
        f"X_data has {len(X_data)} samples but y_labels has {len(y_labels)} labels"
    )

# Create a PyTorch dataset from preprocessed tensors
dataset = TensorDataset(X_data, y_labels)

# Create DataLoader for batch processing
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)


### ----- 3. Define LSTM Model -----

In [None]:

# Define an LSTM-based model for sequence classification
class SignLanguageLSTM(nn.Module):
    """Stacked LSTM followed by a linear classification layer.

    Args:
        input_dim (int): Number of features per timestep.
        hidden_dim (int): Size of the LSTM hidden state.
        num_layers (int): Number of stacked LSTM layers.
        output_dim (int): Number of output classes (size of the logit vector).
        dropout (float): Dropout probability between LSTM layers. Forced to 0
            when `num_layers` is 1.
    """

    def __init__(self, input_dim=411, hidden_dim=256, num_layers=2, output_dim=30814, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # nn.LSTM applies dropout only between stacked layers; a non-zero
        # value with a single layer has no effect and triggers a warning.
        dropout = dropout if num_layers > 1 else 0

        # LSTM layer (expects input of shape (batch_size, seq_len, input_dim))
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths=None):
        """Compute class logits for a batch of sequences.

        Args:
            x (torch.Tensor): Input of shape (batch_size, seq_len, input_dim).
            lengths (optional): Number of valid (non-padded) timesteps of each
                sequence, as a 1-D tensor or sequence of ints. When given, the
                LSTM output at the last valid timestep of each sequence is
                classified, so trailing padding does not influence the result.
                When omitted, the output at the final timestep is used.

        Returns:
            torch.Tensor: Logits of shape (batch_size, output_dim).
        """
        # LSTM forward pass
        lstm_out, _ = self.lstm(x)  # Shape: (batch_size, seq_len, hidden_dim)

        if lengths is None:
            # Take the last timestep's output
            last_output = lstm_out[:, -1, :]  # Shape: (batch_size, hidden_dim)
        else:
            lengths = torch.as_tensor(lengths, dtype=torch.long, device=lstm_out.device)
            # Clamp so that a length of 0 or one beyond seq_len still indexes a valid step
            last_index = lengths.clamp(min=1, max=lstm_out.size(1)) - 1
            batch_index = torch.arange(lstm_out.size(0), device=lstm_out.device)
            last_output = lstm_out[batch_index, last_index]  # Shape: (batch_size, hidden_dim)

        # Fully connected layer
        return self.fc(last_output)  # Shape: (batch_size, output_dim)


In [None]:
# Model parameters
input_dim = 411  # Number of keypoint values per frame
hidden_dim = 256  # Number of hidden units in LSTM
num_layers = 2  # Number of LSTM layers
dropout = 0.3  # Dropout for regularization

# The output layer must cover every class index that occurs in y_labels.
# CrossEntropyLoss requires targets in [0, output_dim), so the size has to be
# "largest label + 1". Counting the distinct labels is only equivalent when the
# labels are exactly 0..K-1 and otherwise produces out-of-range targets.
if y_labels.numel() == 0:
    raise ValueError("y_labels is empty; cannot determine the number of classes")
output_dim = int(y_labels.max().item()) + 1

# Create model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SignLanguageLSTM(input_dim, hidden_dim, num_layers, output_dim, dropout).to(device)

# Loss function (CrossEntropy for classification)
criterion = nn.CrossEntropyLoss()

# Optimizer (Adam with a learning rate of 1e-3)
optimizer = optim.Adam(model.parameters(), lr=0.001)


### ----- 4. Training -----

In [None]:
# Training parameters
num_epochs = 10  # Adjust as needed
batch_size = 16  # Informational only: the effective value is the one passed to the DataLoader

torch.backends.cudnn.benchmark = True  # Optional: lets cuDNN auto-tune its kernels on GPU

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)

        # Flatten labels to shape (batch,). squeeze() is not used here because
        # it collapses a batch containing a single label to a 0-d tensor,
        # which no longer matches the (1, num_classes) model output.
        y_batch = y_batch.to(device).reshape(-1)

        # Debugging: report tensor shapes once instead of on every batch
        if epoch == 0 and num_batches == 0:
            print(f"X_batch shape: {X_batch.shape}, y_batch shape: {y_batch.shape}")

        # Forward pass
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backpropagation
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")


NameError: name 'X_batch' is not defined

### ----- 5. Save Model & Evaluate -----

In [None]:
# Save full model (pickles the whole nn.Module, including a reference to its class)
torch.save(model, "sign_language_lstm_full.pth")

# Load full model
# NOTE(review): a pickled full model can execute arbitrary code when loaded, so
# only load checkpoints from a trusted source. Recent PyTorch releases default
# to weights_only=True, which refuses to unpickle a complete module; the flag
# is therefore passed explicitly. Saving model.state_dict() instead would avoid
# both issues - consider switching if nothing depends on the full-model file.
model = torch.load("sign_language_lstm_full.pth", map_location=device, weights_only=False)
model.to(device)
model.eval()


### ----- 6. Testing & Inference -----

In [None]:
def predict(model, sample_input):
    """Return the predicted class index for a single, unbatched sample.

    Args:
        model (nn.Module): Model that maps a batch of inputs to logits of
            shape (batch_size, num_classes). It is switched to eval mode.
        sample_input: One sample without batch dimension, either a
            torch.Tensor or anything `torch.tensor` can convert (converted to
            float32 in that case).

    Returns:
        int: Index of the highest-scoring class.
    """
    model.eval()  # Set model to evaluation mode

    # Ensure input is a PyTorch tensor
    if not isinstance(sample_input, torch.Tensor):
        sample_input = torch.tensor(sample_input, dtype=torch.float32)

    # Run on the device the model's weights actually live on rather than on a
    # module-level variable that may be undefined or point somewhere else.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else sample_input.device

    batch = sample_input.unsqueeze(0).to(target_device)  # Add batch dimension

    with torch.no_grad():  # Disable gradient computation for inference
        output = model(batch)

    return torch.argmax(output, dim=1).item()

# Example usage (with a sample from dataset)
# Guard against an empty dataset explicitly: min(0, len(X_data) - 1) does not
# produce a valid index in that case, it evaluates to -1.
if len(X_data) > 0:
    sample_idx = 0
    sample_input = X_data[sample_idx]  # Pick one sample from dataset
    predicted_label = predict(model, sample_input)
    print(f"Predicted Label: {predicted_label}")
else:
    print("X_data is empty - no sample available for inference.")
