In [13]:
import cv2
import mediapipe as mp
import torch
import numpy as np
import json

In [14]:
# Modelldatei und Label-Mapping laden
base_path = "/workspaces/asl_detection/machine_learning/models/lstm"
MODEL_PATH = f"{base_path}/lstm_model.pth"
LABELS_PATH = f"{base_path}/label_to_index.json"

# Mediapipe initialisieren
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [15]:
# LSTM-Modell definieren
class LSTMModel(torch.nn.Module):
    def __init__(self, input_size=225, hidden_size=256, num_layers=3, output_size=168):
        super(LSTMModel, self).__init__()
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out.mean(dim=1))  # Mittelwert über die Sequenz
        return out

# Modell laden
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel().to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

# Label-Mapping laden
with open(LABELS_PATH, "r") as f:
    label_to_index = json.load(f)
index_to_label = {v: k for k, v in label_to_index.items()}

# Label-Mapping laden
with open(LABELS_PATH, "r") as f:
    label_to_index = json.load(f)
index_to_label = {v: k for k, v in label_to_index.items()}

In [19]:
# Testmodus: Ein bestimmtes Wort prüfen
test_word = input("Welches Wort willst du testen? ").strip().lower()
if test_word not in label_to_index:
    print(f"❌ Fehler: '{test_word}' existiert nicht in den Labels!")
    exit()

test_label_index = label_to_index[test_word]

In [17]:
# Funktion zur Keypoint-Extraktion (exakt wie im Training!)
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33 * 3)
    left_hand = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    right_hand = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    
    return np.concatenate([pose, left_hand, right_hand])  # Exakt 225 Werte

In [20]:
# Webcam-Stream starten
cap = cv2.VideoCapture(0)
sequence = []
frame_count = 195  # Muss 195 sein, damit das Modell wie im Training arbeitet

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Bild konvertieren
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False
        results = holistic.process(image)
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        # Keypoints extrahieren
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)

        # Stelle sicher, dass immer nur 195 Frames genutzt werden
        if len(sequence) > frame_count:
            sequence.pop(0)

        # Sobald 195 Frames gesammelt sind, Vorhersage machen
        if len(sequence) == frame_count:
            input_tensor = torch.tensor([sequence], dtype=torch.float32).to(device)
            with torch.no_grad():
                output = model(input_tensor)
            pred_label = torch.argmax(torch.nn.functional.softmax(output, dim=1), dim=1).item()
            recognized_word = index_to_label.get(pred_label, "Unbekannt")

            # Ergebnis auf dem Bild anzeigen
            color = (0, 255, 0) if pred_label == test_label_index else (0, 0, 255)
            cv2.putText(image, recognized_word, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2, cv2.LINE_AA)
            print(f"🔍 Erwartet: {test_word} | Erkannt: {recognized_word}")

        # Mediapipe-Overlays zeichnen
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

        # Bild anzeigen
        cv2.imshow("ASL Testmodus", image)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

W0000 00:00:1741260733.376775   14165 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741260733.402821   14165 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741260733.404190   14167 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741260733.404631   14176 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741260733.405243   14168 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741260733.413243   14168 inference_feedback_manager.cc:114] Feedback manager 

🔍 Erwartet: cousin | Erkannt: government
🔍 Erwartet: cousin | Erkannt: government
🔍 Erwartet: cousin | Erkannt: government
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: speech
🔍 Erwartet: cousin | Erkannt: past
🔍 Erwartet: cousin | Erkannt: past
🔍 Erwartet: cousin | Erkannt: past
🔍 Erwartet: cousin | Erkannt: past
🔍 Erwartet: cousin | Erkannt: past