In [None]:
import cv2
import torch
import numpy as np
import mediapipe as mp
import json
import os
from torch.nn.functional import softmax
import torch.nn as nn

# --- Locations of the trained checkpoint, its label map and the min/max statistics ---
MODEL_PATH = "/workspaces/asl_detection/machine_learning/models/lstm/best_lstm_model.pth"
LABELS_PATH = "/workspaces/asl_detection/machine_learning/models/lstm/label_to_index.json"
NORMALIZED_DIR = "/workspaces/asl_detection/machine_learning/datasets/own_dataset"

# --- Input geometry fed to the network ---
SEQUENCE_LENGTH = 102  # frames per inference window
# Values per frame: (33 pose + 21 left-hand + 21 right-hand landmarks) * 3 coordinates.
INPUT_SIZE = 225
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Label map: the JSON stores label -> index; predictions need index -> label ---
with open(LABELS_PATH, "r") as labels_file:
    label_to_index = json.load(labels_file)
index_to_label = {index: label for label, index in label_to_index.items()}
NUM_CLASSES = len(label_to_index)

# --- Min/max arrays saved at training time, used for min-max scaling of keypoints ---
keypoint_min, keypoint_max = (
    np.load(os.path.join(NORMALIZED_DIR, stats_file))
    for stats_file in ("keypoint_min.npy", "keypoint_max.npy")
)

# --- Define LSTM model ---
class LSTMModel(nn.Module):
    """Stacked LSTM classifier that scores a sequence from its final time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys; keep them as-is.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class logits for a batch-first input of shape (batch, time, features)."""
        per_step_output, _state = self.lstm(x)
        final_step = per_step_output[:, -1, :]  # only the last time step feeds the head
        return self.fc(self.dropout(final_step))
    


# Build the network and restore the trained weights.
# hidden_size / num_layers are fixed here; load_state_dict() will raise if they do not
# match the shapes stored in the checkpoint.
model = LSTMModel(input_size=INPUT_SIZE, hidden_size=256, num_layers=2, num_classes=NUM_CLASSES)
# NOTE(review): torch.load() unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
model.to(DEVICE)
model.eval()  # inference mode for layers such as dropout

# --- Initialize Mediapipe Holistic ---
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Camera index 0, and the rolling buffer of per-frame normalized keypoint vectors
# that the capture loop below fills and caps at SEQUENCE_LENGTH entries.
cap = cv2.VideoCapture(0)
sequence = []

def normalize_keypoints(keypoints, min_vals, max_vals):
    """Min-max scale ``keypoints`` with the given bounds and clamp the result to [0, 1].

    A small epsilon is added to the range so a zero-width range does not divide by zero.
    """
    value_range = max_vals - min_vals + 1e-8
    scaled = (keypoints - min_vals) / value_range
    return np.clip(scaled, 0, 1)

# Trailing moving-average filter over a sequence of keypoint frames
def smooth_keypoints(sequence, window_size=5):
    """Return a list in which frame ``i`` is the mean of the up-to-``window_size``
    frames ending at ``i`` (fewer at the start of the sequence).

    The input is not modified; the result has the same length as ``sequence``.
    """
    return [
        np.mean(sequence[max(0, end - window_size):end], axis=0)
        for end in range(1, len(sequence) + 1)
    ]

def extract_keypoints(results):
    """Flatten pose and hand landmarks of one Holistic result into a single vector.

    Layout: 33 pose points, then 21 left-hand points, then 21 right-hand points,
    each contributing (x, y, z). A missing landmark group is filled with zeros.
    """
    def flatten_or_zeros(landmark_group, point_count):
        if landmark_group:
            return np.array([[pt.x, pt.y, pt.z] for pt in landmark_group.landmark]).flatten()
        return np.zeros(point_count * 3)

    parts = [
        flatten_or_zeros(results.pose_landmarks, 33),
        flatten_or_zeros(results.left_hand_landmarks, 21),
        flatten_or_zeros(results.right_hand_landmarks, 21),
    ]
    return np.concatenate(parts)

# --- Live capture and inference loop ---
# `sequence` holds the RAW normalized frames (at most SEQUENCE_LENGTH of them).
# The moving-average filter is applied to a temporary copy right before inference:
# writing the filtered frames back into `sequence` would filter the same frames again
# on every iteration and progressively blur the older ones.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV delivers BGR; convert to RGB before handing the frame to MediaPipe.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(frame_rgb)

        mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

        keypoints = extract_keypoints(results)
        normalized_keypoints = normalize_keypoints(keypoints, keypoint_min, keypoint_max)
        print("Min normalized keypoints:", np.min(normalized_keypoints))
        print("Max normalized keypoints:", np.max(normalized_keypoints))

        # Rolling window: append the newest frame, drop the oldest once full.
        sequence.append(normalized_keypoints)
        if len(sequence) > SEQUENCE_LENGTH:
            sequence.pop(0)

        # Diagnostic only: run the current frame as a length-1 sequence, shape (1, 1, features).
        input_tensor = (
            torch.as_tensor(normalized_keypoints, dtype=torch.float32)
            .reshape(1, 1, -1)
            .to(DEVICE)
        )
        with torch.no_grad():
            output = model(input_tensor)
            probabilities = softmax(output, dim=-1)
            print("Model Logits:", output)
            print("Softmax Output:", probabilities)

            confidence, predicted_class = torch.max(probabilities, 1)
            print("Single Frame Prediction:", index_to_label[int(predicted_class.item())], confidence.item())

        if len(sequence) == SEQUENCE_LENGTH:
            # Smooth a copy of the window and stack it into one (time, features) array;
            # building a tensor from a list of ndarrays is slow.
            smoothed = np.stack(smooth_keypoints(sequence))
            input_tensor = torch.as_tensor(smoothed, dtype=torch.float32).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                output = model(input_tensor)

                print("Sequence mean value:", np.mean(smoothed))
                print("Sequence std dev:", np.std(smoothed))

                probabilities = softmax(output, dim=-1)
                confidence, predicted_class = torch.max(probabilities, 1)
                # Below 50% confidence the prediction is reported as "Uncertain".
                predicted_label = index_to_label[int(predicted_class.item())] if confidence.item() > 0.5 else "Uncertain"

                print(f"Predicted: {predicted_label} ({confidence.item()*100:.2f}%)")

            cv2.putText(frame, f"{predicted_label} ({confidence.item()*100:.2f}%)", (20, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow("ASL Detection", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):  # quit on 'q'
            break
finally:
    # Release the camera, the MediaPipe graph and the window even if the loop raised.
    cap.release()
    holistic.close()
    cv2.destroyAllWindows()

W0000 00:00:1741745165.005877  136099 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741745165.074455  136099 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741745165.077669  136101 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741745165.078576  136106 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741745165.081078  136099 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741745165.093532  136101 inference_feedback_manager.cc:114] Feedback manager 

Min normalized keypoints: 0.0
Max normalized keypoints: 1.0
Model Logits: tensor([[-8.5537e-02,  1.2092e-01, -1.0637e-01, -4.0797e-02, -1.4349e-01,
         -2.8739e-02,  1.4291e-01,  2.2160e-02, -1.1705e-01,  8.5542e-02,
          7.9627e-02, -1.0956e-01, -1.1852e-01, -3.1063e-02, -1.1965e-02,
          7.3286e-02, -1.9167e-02,  1.0128e-01, -1.6438e-01, -5.5640e-02,
          3.5885e-02,  1.4465e-01, -7.4999e-02,  1.2448e-01,  1.8814e-01,
          6.2579e-02, -2.1730e-02, -7.0508e-02, -5.1422e-05,  1.0967e-01,
         -1.1087e-01, -7.2515e-02, -1.4358e-02, -3.2234e-02,  7.1527e-02,
          4.2177e-02,  1.4014e-01,  1.1270e-01, -2.7474e-02, -1.1479e-01,
         -5.6727e-02, -9.2874e-02, -1.8755e-02,  3.7808e-03,  4.0475e-02,
          1.2624e-02,  1.2151e-01, -1.2537e-01, -8.7184e-02, -1.1253e-01,
          1.5561e-01,  1.7554e-01, -2.2083e-02, -3.4269e-02,  1.3205e-01,
         -3.3561e-02, -1.2810e-01, -3.5034e-02, -7.4127e-02,  4.7236e-03,
         -2.8737e-02,  4.1896e-02,  6.

In [1]:
# 📌 1️⃣ Importiere die benötigten Bibliotheken
import cv2
import numpy as np
import mediapipe as mp
import torch
import json
import os
from collections import Counter
import torch.nn as nn


# 2) Initialize MediaPipe (Holistic provides hand, face and body keypoints)
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

NORMALIZED_DIR = "/workspaces/asl_detection/machine_learning/datasets/own_dataset"

# Min/max arrays used for min-max scaling of the extracted keypoints.
keypoint_min = np.load(os.path.join(NORMALIZED_DIR, "keypoint_min.npy"))
keypoint_max = np.load(os.path.join(NORMALIZED_DIR, "keypoint_max.npy"))

# 3) Model configuration
# (If the checkpoint were TorchScript, torch.jit.load(MODEL_PATH) would be used instead.)
MODEL_PATH = "/workspaces/asl_detection/machine_learning/models/lstm/best_lstm_model.pth"
LABELS_PATH = "/workspaces/asl_detection/machine_learning/models/lstm/label_to_index.json"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Values per frame: (33 pose + 21 left-hand + 21 right-hand landmarks) * 3 coordinates.
INPUT_SIZE = 225

# Derive the class count from the label map instead of hard-coding it, so the output
# layer cannot silently drift from label_to_index.json when labels are added or removed.
with open(LABELS_PATH, "r", encoding="utf-8") as _labels_file:
    NUM_CLASSES = len(json.load(_labels_file))

# LSTM model
class LSTMModel(nn.Module):
    """LSTM stack followed by a linear head applied to the last time step's output.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they must stay unchanged.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map a batch-first input (batch, time, features) to logits (batch, num_classes)."""
        sequence_output, _hidden = self.lstm(x)
        last_step = sequence_output[:, -1, :]
        last_step = self.dropout(last_step)
        return self.fc(last_step)

# Define the model (must match the architecture used during training exactly!)
model = LSTMModel(input_size=INPUT_SIZE, hidden_size=256, num_layers=2, num_classes=NUM_CLASSES)

# Load the state_dict
# NOTE(review): torch.load() unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))

# Move the model to the selected device
model.to(DEVICE)

# Enable evaluation mode (important for dropout, batch norm, etc.)
model.eval()


# 4) Load the labels from the JSON file (convert "label: index" into "index: label")
with open("/workspaces/asl_detection/machine_learning/models/lstm/label_to_index.json", "r") as f:
    label_dict = json.load(f)
label_names = {int(v): k for k, v in label_dict.items()}  # inverted so the index is the key

# 5) Min-max normalization of keypoints
def normalize_keypoints(keypoints, min_vals, max_vals):
    """Min-max scale ``keypoints`` using the given bounds (the result is not clipped).

    A small epsilon is added to the range so a zero-width range does not divide by zero.
    """
    value_range = max_vals - min_vals + 1e-8
    shifted = keypoints - min_vals
    return shifted / value_range

# 6) Extract keypoints from a Holistic result and normalize them
def extract_keypoints(results):
    """Return the normalized, flattened pose + left-hand + right-hand landmark vector.

    Each landmark contributes (x, y, z); a missing landmark group is filled with zeros
    (33 pose points, 21 points per hand). The concatenated vector is min-max scaled
    with the module-level ``keypoint_min`` / ``keypoint_max``.
    """
    layout = (
        ("pose_landmarks", 33),
        ("left_hand_landmarks", 21),
        ("right_hand_landmarks", 21),
    )
    parts = []
    for attribute, point_count in layout:
        group = getattr(results, attribute)
        if group:
            parts.append(np.array([[pt.x, pt.y, pt.z] for pt in group.landmark]).flatten())
        else:
            parts.append(np.zeros(point_count * 3))

    flat_keypoints = np.concatenate(parts)
    return normalize_keypoints(flat_keypoints, keypoint_min, keypoint_max)

# 7) Rolling buffer used for majority voting
frame_buffer = []
buffer_size = 10  # number of most recent predictions that take part in the vote

def majority_voting(prediction):
    """Record ``prediction`` and return the most frequent label in the buffer.

    The buffer keeps at most ``buffer_size`` entries; the oldest one is dropped
    once that limit is exceeded.
    """
    frame_buffer.append(prediction)
    if len(frame_buffer) > buffer_size:
        del frame_buffer[0]

    ranked = Counter(frame_buffer).most_common(1)
    winner, _votes = ranked[0]
    return winner

# 8) Open the camera and run live prediction
cap = cv2.VideoCapture(0)  # 0 = default webcam
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # 9) Prepare the image for MediaPipe: BGR -> RGB, marked read-only while processing
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image.flags.writeable = False
            results = holistic.process(image)
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            # 10) Extract and normalize the keypoints, shaped (1, 1, features):
            #     a batch of one sequence containing a single time step.
            keypoints = extract_keypoints(results)
            keypoints = (
                torch.tensor(keypoints, dtype=torch.float32)
                .unsqueeze(0)
                .unsqueeze(0)
                .to(DEVICE)
            )

            # 11) Predict with PyTorch
            with torch.no_grad():
                prediction_probs = torch.nn.functional.softmax(model(keypoints), dim=1).cpu().numpy()[0]

            # Plain int so the vote buffer and the label lookup use ordinary Python keys.
            predicted_label = int(np.argmax(prediction_probs))
            stable_label = majority_voting(predicted_label)  # stabilized by majority voting
            # Report the probability of the label that is actually displayed (the voted one),
            # not of the current frame's argmax, so text and percentage always agree.
            confidence = prediction_probs[stable_label]

            # 12) Draw the MediaPipe keypoints
            mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
            mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
            mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

            # 13) Show label and confidence ("Unbekannt" is shown for an index missing from the label map)
            label_text = label_names.get(stable_label, "Unbekannt")
            text = f"Label: {label_text} ({confidence*100:.1f}%)"
            cv2.putText(image, text, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # 14) Show the live stream
            cv2.imshow("Live ASL Recognition", image)

            # 15) Quit with 'q'
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
finally:
    # 16) Release resources even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1741748853.860621  145759 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741748853.891422  145759 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741748853.894260  145759 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741748853.895476  145765 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741748853.896307  145766 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741748853.910303  145