## Training via PyTorch

In [1]:
import os
import json
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

### ----- 1. Load Data -----

In [2]:
# Root folder of the OpenPose output for the training split.
data_dir = "/home/geiger/asl_detection/machine_learning/datasets/how2sign/keypoints/train/openpose_output"
# The "json" and "video" subfolders sit directly below that root.
json_dir, video_dir = (os.path.join(data_dir, subfolder) for subfolder in ("json", "video"))

# Folder and file holding the sentence annotations for the training split.
csv_path = "/home/geiger/asl_detection/machine_learning/datasets/how2sign/english_translation"
labels_csv = os.path.join(csv_path, "how2sign_realigned_train.csv")

In [3]:
# Read the tab-separated annotation file and build a lookup from
# sentence name to sentence text.
label_df = pd.read_csv(labels_csv, sep="\t")
name_column, sentence_column = label_df["SENTENCE_NAME"], label_df["SENTENCE"]
label_mapping = dict(zip(name_column, sentence_column))

In [None]:
# Peek at one (sentence name, sentence text) pair to sanity-check the mapping.
first_entry = list(label_mapping.items())[0]
print(first_entry)

In [None]:
# Show the first six entries of the mapping.
for position, (name, sentence) in enumerate(label_mapping.items()):
    if position > 5:
        break
    print(f"Key: {name}, Value:{sentence}")

In [6]:
def normalize_keypoints(keypoints):
    """Min-max scale ``keypoints`` along axis 0 into roughly the range [0, 1].

    A small epsilon is added to the value range so that a constant input
    (max == min) yields zeros instead of a division by zero.
    """
    lowest = np.min(keypoints, axis=0)
    value_range = np.max(keypoints, axis=0) - lowest + 1e-8  # epsilon guards against /0
    return (keypoints - lowest) / value_range

In [7]:
def load_keypoints(json_folder):
    """Load the per-frame OpenPose keypoints of one clip.

    Every file in ``json_folder`` is read in sorted filename order. For each
    frame the pose, face, left-hand and right-hand keypoints of the first
    detected person are concatenated into one flat vector and normalized
    with ``normalize_keypoints``. Frames without any detected person are
    represented by an all-zero vector of the same width.

    Args:
        json_folder: Directory containing one OpenPose JSON file per frame.

    Returns:
        ``np.ndarray`` with one row per frame.
    """
    # OpenPose stores (x, y, confidence) per keypoint: 25 body, 70 face and
    # 21 per hand, i.e. 75 + 210 + 63 + 63 = 411 values per frame. The
    # placeholder must use the same width as a detected frame, otherwise the
    # frames cannot be stacked into a rectangular array.
    frame_dim = 25 * 3 + 70 * 3 + 21 * 3 + 21 * 3

    keypoints_sequence = []
    for frame_file in sorted(os.listdir(json_folder)):
        frame_path = os.path.join(json_folder, frame_file)
        with open(frame_path, "r") as f:
            data = json.load(f)

        if data["people"]:  # at least one person was detected
            person = data["people"][0]
            pose = np.array(person.get("pose_keypoints_2d", []))
            face = np.array(person.get("face_keypoints_2d", []))
            hand_left = np.array(person.get("hand_left_keypoints_2d", []))
            hand_right = np.array(person.get("hand_right_keypoints_2d", []))
            keypoints = np.concatenate([pose, face, hand_left, hand_right])
        else:
            keypoints = np.zeros(frame_dim)

        keypoints_sequence.append(normalize_keypoints(keypoints))
    return np.array(keypoints_sequence)

In [8]:
# Walk every clip folder and keep the ones that have a sentence annotation.
X_data, y_labels = [], []
for sentence_name in os.listdir(json_dir):
    sentence_folder = os.path.join(json_dir, sentence_name)
    if not os.path.isdir(sentence_folder) or sentence_name not in label_mapping:
        continue  # stray file or clip without an annotation
    keypoints_sequence = load_keypoints(sentence_folder)
    X_data.append(keypoints_sequence)  # all frames of the clip
    y_labels.append(label_mapping[sentence_name])  # the sentence text is the label

In [None]:
# Quick sanity checks on what was loaded.
# print(X_data)
print(y_labels[:3])

distinct_labels = set(y_labels)
print("Anzahl eindeutiger Labels:", len(distinct_labels))
print("Beispiel eines Labels:", y_labels[0])
print("Shape eines Keypoints-Samples:", X_data[0].shape)

In [10]:
# Convert to NumPy arrays.
# The sequences have different lengths, so they are stored in a 1-D object
# array with one entry per clip. The array is pre-allocated and filled
# element by element: np.array(list_of_arrays, dtype=object) would instead
# build a multi-dimensional object array whenever all clips happen to share
# the same shape, and indexing it would no longer return a numeric array.
sequences = list(X_data)
X_data = np.empty(len(sequences), dtype=object)
for position, sequence in enumerate(sequences):
    X_data[position] = sequence
y_labels = np.array(y_labels)

### ----- 2. Prepare Data for PyTorch -----

In [11]:
# Map labels to IDs.
# Sorting the distinct labels makes the assignment deterministic. Iterating a
# plain set of strings can yield a different order in every interpreter
# session, so the class index stored with a saved model could not be mapped
# back to its sentence later.
unique_labels = {label: idx for idx, label in enumerate(sorted(set(y_labels)))}
y_labels = np.array([unique_labels[label] for label in y_labels])

In [12]:
# PyTorch Dataset
class SignLanguageDataset(Dataset):
    """Map-style dataset that pairs feature sequences with their labels."""

    def __init__(self, X, y):
        # X and y are indexable containers; presumably they have the same
        # length (only len(X) is consulted) - verify against the caller.
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)``; the features become a float32 tensor."""
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        label = self.y[idx]
        return features, label

In [13]:
# Create DataLoader
dataset = SignLanguageDataset(X_data, y_labels)

from torch.nn.utils.rnn import pad_sequence


def _pad_batch(batch):
    """Pad the sequences of one batch to a common length and stack the labels."""
    sequences = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]
    return (
        pad_sequence(sequences, batch_first=True),  # (batch, longest sequence, features)
        torch.tensor(targets, dtype=torch.long),
    )


# Batches of 16 shuffled samples, padded per batch by the collate function.
train_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=_pad_batch)

### ----- 3. Define LSTM Model -----

In [14]:
class MultiLayerLSTMModel(nn.Module):
    """Stacked LSTM followed by a linear classifier on the final time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Stacked LSTM layers; inputs are (batch, time, features).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer producing the class logits.
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths=None):
        """Compute class logits for a batch of sequences.

        Args:
            x: Tensor of shape (batch, time, input_size).
            lengths: Optional true (unpadded) length of every sequence in the
                batch. When given, the logits are computed from the hidden
                state at each sequence's last real time step, so zero-padding
                appended by the collate function does not influence the
                result. When omitted, the output at the last time step of
                ``x`` is used, padding included.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized logits.
        """
        if lengths is None:
            # nn.LSTM starts from zero hidden and cell states when none are
            # passed, so no explicit h0/c0 tensors are needed.
            out, _ = self.lstm(x)
            return self.fc(out[:, -1, :])

        # pack_padded_sequence expects the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            x,
            torch.as_tensor(lengths).cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (final_hidden, _) = self.lstm(packed)
        # final_hidden[-1] is the top layer's state at each sequence's last valid step.
        return self.fc(final_hidden[-1])

In [20]:
# Model hyperparameters. The feature width comes from the first sample and
# the class count from the label mapping.
hidden_size, num_layers = 512, 5  # the layer count was increased
input_size = X_data[0].shape[1]
num_classes = len(unique_labels)

In [None]:
# Echo the values the model will be built with.
hyperparameters = (input_size, hidden_size, num_layers, num_classes)
print(*hyperparameters)

In [None]:
# Use the GPU when one is available, then build the network and move it there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiLayerLSTMModel(input_size, hidden_size, num_layers, num_classes)
model.to(device)

In [23]:
# Loss function & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # the learning rate was raised
# Multiplies the learning rate by 0.1 once the monitored value (lower is
# better) has failed to improve for more than 5 scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.1,
    patience=5,
)

### ----- 4. Training -----

In [None]:
num_epochs = 200
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean loss over the batches of this epoch.
    epoch_loss = running_loss / len(train_loader)
    # ReduceLROnPlateau only lowers the learning rate when it is stepped with
    # the monitored metric; without this call the scheduler has no effect.
    scheduler.step(epoch_loss)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

### ----- 5. Save Model & Evaluate -----

In [None]:
# Persist only the learned parameters (the state dict), not the model object.
checkpoint_path = "lstm_sign_language.pth"
torch.save(model.state_dict(), checkpoint_path)
print("Model saved!")

In [40]:
# Evaluation: put the model in evaluation mode and reset both counters.
model.eval()
correct = total = 0

In [None]:
# Reset the counters here so that re-running this cell does not add to the
# counts left over from a previous run.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)  # index of the highest logit per sample
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# The loop iterates train_loader, so this is the accuracy on the training
# data, not on a held-out test set. Guard against an empty loader.
accuracy = 100 * correct / total if total else 0.0
print(f"Train Accuracy: {accuracy:.2f}%")

## Training via TensorFlow

In [None]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Dropout
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam

In [2]:
# Root folder of the OpenPose output for the training split.
data_dir = "/home/geiger/asl_detection/machine_learning/datasets/how2sign/keypoints/train/openpose_output"
# The "json" and "video" subfolders sit directly below that root.
json_dir, video_dir = (os.path.join(data_dir, subfolder) for subfolder in ("json", "video"))

# Folder and file holding the sentence annotations for the training split.
csv_path = "/home/geiger/asl_detection/machine_learning/datasets/how2sign/english_translation"
labels_csv = os.path.join(csv_path, "how2sign_realigned_train.csv")

In [3]:
# Root folder of the OpenPose output for the validation split.
data_dir_val = "/home/geiger/asl_detection/machine_learning/datasets/how2sign/keypoints/val/openpose_output"
# The "json" and "video" subfolders sit directly below that root.
json_dir_val, video_dir_val = (os.path.join(data_dir_val, subfolder) for subfolder in ("json", "video"))

# Folder and file holding the sentence annotations for the validation split.
csv_path_val = "/home/geiger/asl_detection/machine_learning/datasets/how2sign/english_translation"
labels_csv_val = os.path.join(csv_path_val, "how2sign_realigned_val.csv")

In [4]:
# Read the tab-separated annotation file and build a lookup from
# sentence name to sentence text.
label_df = pd.read_csv(labels_csv, sep="\t")
name_column, sentence_column = label_df["SENTENCE_NAME"], label_df["SENTENCE"]
label_mapping = dict(zip(name_column, sentence_column))

In [5]:
# Same lookup as for training, built from the validation annotation file.
label_df_val = pd.read_csv(labels_csv_val, sep="\t")
name_column_val, sentence_column_val = label_df_val["SENTENCE_NAME"], label_df_val["SENTENCE"]
label_mapping_val = dict(zip(name_column_val, sentence_column_val))

In [6]:
def normalize_keypoints(keypoints, target_length=100, keypoint_dim=1662):
    """Min-max scale a keypoint sequence per column and fix its length.

    Args:
        keypoints: Array-like sequence of frames; converted with ``np.array``.
        target_length: Number of rows the result is cropped or padded to.
        keypoint_dim: Column count of the all-zero result for empty input.

    Returns:
        The scaled sequence. Longer inputs keep their first ``target_length``
        rows, shorter inputs are extended by repeating their last row, and
        empty input yields zeros of shape ``(target_length, keypoint_dim)``.
    """
    frames = np.array(keypoints)

    # Nothing to scale: hand back an all-zero sequence of the requested size.
    if frames.size == 0:
        return np.zeros((target_length, keypoint_dim))

    # Column-wise minimum and value range over all rows.
    column_min = np.min(frames, axis=0, keepdims=True)
    value_range = np.max(frames, axis=0, keepdims=True) - column_min
    # A constant column has range 0; dividing by 1 instead leaves it at 0.
    value_range = np.where(value_range == 0, 1, value_range)

    scaled = (frames - column_min) / value_range
    frame_count = len(scaled)

    if frame_count > target_length:
        # Too long: keep only the leading rows.
        return scaled[:target_length]

    if frame_count < target_length:
        # Too short: repeat the last row until the target length is reached.
        filler = np.tile(scaled[-1], (target_length - frame_count, 1))
        return np.vstack((scaled, filler))

    return scaled

In [None]:
# Shape of every validation sequence.
print(list(map(np.shape, X_data_val)))


In [None]:
# Shape, dtype and Python type of every validation sequence.
for sequence in X_data_val:
    print(np.shape(sequence), sequence.dtype, type(sequence))


In [7]:
def load_keypoints(json_folder, target_length=100, keypoint_dim=1662):
    """Load one clip as a fixed-size, normalized keypoint sequence.

    Every file in ``json_folder`` is read in sorted filename order. For each
    frame the pose, face, left-hand and right-hand keypoints of the first
    detected person are concatenated into one flat vector. That vector is
    zero-padded (or cropped) to ``keypoint_dim`` values so that all frames,
    including the all-zero placeholder used when nobody was detected, have
    the same width and stack into a 2-D array.

    ``normalize_keypoints`` works on whole sequences (it scales per column
    and pads or crops the number of frames), so it is applied once to the
    complete clip rather than to each frame on its own.

    Args:
        json_folder: Directory containing one OpenPose JSON file per frame.
        target_length: Number of frames in the returned sequence.
        keypoint_dim: Number of values per frame in the returned sequence.

    Returns:
        ``np.ndarray`` of shape ``(target_length, keypoint_dim)``.
    """
    keypoint_fields = (
        "pose_keypoints_2d",
        "face_keypoints_2d",
        "hand_left_keypoints_2d",
        "hand_right_keypoints_2d",
    )

    frames = []
    for frame_file in sorted(os.listdir(json_folder)):
        frame_path = os.path.join(json_folder, frame_file)
        with open(frame_path, "r") as f:
            data = json.load(f)

        # Start from zeros; the frame stays all-zero when nobody was detected.
        frame = np.zeros(keypoint_dim)
        if data["people"]:
            person = data["people"][0]
            detected = np.concatenate(
                [np.asarray(person.get(field, []), dtype=float) for field in keypoint_fields]
            )[:keypoint_dim]
            frame[: detected.size] = detected
        frames.append(frame)

    # Scale, then pad or crop the clip to target_length frames in one pass.
    return normalize_keypoints(frames, target_length=target_length, keypoint_dim=keypoint_dim)

In [None]:
# Walk every training clip folder and keep the ones that have an annotation.
X_data, y_labels = [], []

for sentence_name in os.listdir(json_dir):
    sentence_folder = os.path.join(json_dir, sentence_name)

    if not os.path.isdir(sentence_folder) or sentence_name not in label_mapping:
        continue  # stray file or clip without an annotation

    keypoints_sequence = load_keypoints(sentence_folder)  # load the keypoints
    X_data.append(keypoints_sequence)
    y_labels.append(label_mapping[sentence_name])

In [8]:
# Walk every validation clip folder and keep the ones that have an annotation.
X_data_val, y_labels_val = [], []
for sentence_name_val in os.listdir(json_dir_val):
    sentence_folder_val = os.path.join(json_dir_val, sentence_name_val)

    if not os.path.isdir(sentence_folder_val) or sentence_name_val not in label_mapping_val:
        continue  # stray file or clip without an annotation

    keypoints_sequence_val = load_keypoints(sentence_folder_val)  # load the keypoints
    X_data_val.append(keypoints_sequence_val)
    y_labels_val.append(label_mapping_val[sentence_name_val])

In [57]:
# Sorted so that every run assigns the same ID to the same sentence.
unique_labels = sorted(set(y_labels))
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))

# Replace each sentence by its ID and stack everything into dense arrays.
y_labels = [label_to_id[sentence] for sentence in y_labels]
X_train = np.array(X_data, dtype=np.float32)
y_train = np.array(y_labels, dtype=np.int32)

In [None]:
# Encode the validation sentences with the *training* label IDs so that a
# class index refers to the same sentence in y_train and y_val. Building a
# separate vocabulary from the validation set would give unrelated IDs and
# make the validation metrics meaningless.
unique_labels_val = sorted(set(y_labels_val))
label_to_id_val = {
    label_val: label_to_id[label_val]
    for label_val in unique_labels_val
    if label_val in label_to_id
}

# Sentences that never occur in the training set have no class ID; their
# clips cannot be scored and are left out.
kept_val = [i for i, label_val in enumerate(y_labels_val) if label_val in label_to_id_val]
if len(kept_val) < len(y_labels_val):
    print(
        f"Dropped {len(y_labels_val) - len(kept_val)} of {len(y_labels_val)} "
        "validation clips whose sentence is not a training label."
    )

X_val = np.array([X_data_val[i] for i in kept_val], dtype=np.float32)
y_labels_val = [label_to_id_val[y_labels_val[i]] for i in kept_val]
y_val = np.array(y_labels_val, dtype=np.int32)

In [None]:
# Expected: X_train is (samples, 100, 1662).
for caption, array in (("Shape von X_train:", X_train), ("Shape von y_train:", y_train)):
    print(caption, array.shape)


In [None]:
# These are the validation arrays, so the captions name them as such.
print("Shape von X_val:", X_val.shape)
print("Shape von y_val:", y_val.shape)

In [None]:
# The targets are integer class IDs with one class per distinct sentence, so
# the network needs one output unit per class (softmax) and a sparse
# categorical loss. A single sigmoid unit with binary cross-entropy only
# fits 0/1 targets.
num_classes = len(label_to_id)

model = Sequential()

# Input shape (frames, features) is taken from the training data.
model.add(LSTM(128, input_shape=X_train.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))  # class probabilities

# 4. Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 5. Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# 6. Report that training has finished
print("Training abgeschlossen.")