In [5]:
# --- 1. Install & Import Dependencies ---

import torch
import torch.nn as nn
import cv2
import numpy as np
import mediapipe as mp
import json

# --- 2. Load Label Dictionary ---
base_path = "/workspaces/asl_detection/machine_learning/models/lstm"
LABELS_PATH = f"{base_path}/label_to_index.json"

# JSON files are UTF-8 by specification; pin the encoding so labels decode the
# same way regardless of the platform's locale default.
with open(LABELS_PATH, "r", encoding="utf-8") as f:
    label_to_index = json.load(f)

# Reverse mapping, used to turn a predicted class index back into its label.
index_to_label = {index: label for label, index in label_to_index.items()}

# --- 3. Define LSTM Model ---
class LSTMModel(nn.Module):
    """Sequence classifier: stacked LSTM, mean pooling over time, linear head.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between stacked LSTM layers.
            Ignored (treated as 0) when ``num_layers`` is 1.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int = 3,
        dropout: float = 0.2,
    ) -> None:
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers; a non-zero
        # value with a single layer has no effect and raises a UserWarning.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        # Attribute names `lstm` and `fc` define the state_dict keys and must
        # stay stable so previously saved checkpoints keep loading.
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-class log-probabilities for a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)`` (batch-first).

        Returns:
            Tensor of shape ``(batch, output_dim)`` holding log-softmax scores.
        """
        lstm_out, _ = self.lstm(x)
        # Average the LSTM outputs over the time axis so every frame
        # contributes to the classification.
        pooled = lstm_out.mean(dim=1)
        logits = self.fc(pooled)
        return torch.log_softmax(logits, dim=1)

# --- 4. Load Model ---
MODEL_PATH = f"{base_path}/best_lstm_model.pth"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = 225  # Set to 225 features to match the trained model
hidden_dim = 256
output_dim = len(label_to_index)

model = LSTMModel(input_dim, hidden_dim, output_dim).to(device)
# The checkpoint is consumed purely as a state_dict, so restrict unpickling to
# tensors/primitive containers instead of executing arbitrary pickled objects.
state_dict = torch.load(MODEL_PATH, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # Inference mode: disables dropout inside the LSTM

# --- 5. Initialize MediaPipe ---
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic()

def extract_keypoints(results):
    """
    Flatten pose and hand landmarks into one 225-value feature vector.

    Layout of the returned 1-D array:
      * pose:       33 landmarks * (x, y, z) = 99 values
      * left hand:  21 landmarks * (x, y, z) = 63 values
      * right hand: 21 landmarks * (x, y, z) = 63 values

    A landmark group that is missing from ``results`` is filled with zeros,
    so the vector length stays the same.
    """
    def _flat_xyz(landmark_list, count):
        # Absent detection -> zero block of the expected size.
        if not landmark_list:
            return np.zeros((count, 3)).flatten()
        coords = [[point.x, point.y, point.z] for point in landmark_list.landmark]
        return np.array(coords).flatten()

    pose_vec = _flat_xyz(results.pose_landmarks, 33)
    left_vec = _flat_xyz(results.left_hand_landmarks, 21)
    right_vec = _flat_xyz(results.right_hand_landmarks, 21)

    # Order matters: pose first, then left hand, then right hand.
    return np.concatenate([pose_vec, left_vec, right_vec])

# --- 6. Real-Time Sign Language Detection ---
cap = cv2.VideoCapture(0)
sequence = []
frame_count = 30  # Frames per sequence
predicted_word = ""  # Latest prediction; kept so the overlay persists between windows
mp_drawing = mp.solutions.drawing_utils

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # MediaPipe expects RGB input, while OpenCV delivers BGR frames.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(rgb_frame)
        sequence.append(extract_keypoints(results))

        # Draw the MediaPipe Holistic keypoints directly on the BGR frame that
        # is displayed; no round-trip colour conversion is needed.
        image = frame
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

        if len(sequence) == frame_count:
            # Stack into a single ndarray first: building a tensor from a list
            # of ndarrays is slow and triggers a PyTorch warning.
            window = np.array(sequence)
            input_tensor = torch.tensor(window, dtype=torch.float32).unsqueeze(0).to(device)
            with torch.no_grad():
                prediction = model(input_tensor)
            predicted_index = torch.argmax(prediction, dim=1).item()
            predicted_word = index_to_label[predicted_index]
            sequence = []  # Start collecting the next non-overlapping window

        # Render the most recent prediction on every frame; drawing it only on
        # the frame that completes a window would show it for a single frame.
        if predicted_word:
            cv2.putText(image, predicted_word, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow('ASL Recognition', image)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera, the MediaPipe graph and the window, even when
    # the loop exits through an exception.
    cap.release()
    holistic.close()
    cv2.destroyAllWindows()

W0000 00:00:1741550940.214503   56545 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741550940.239491   56545 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741550940.241350   56545 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741550940.242327   56541 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741550940.242445   56550 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741550940.252122   56550 inference_feedback_manager.cc:114] Feedback manager 