In [None]:
import json
import os

import cv2
import mediapipe as mp
import numpy as np
import torch
import torch.nn as nn

# MediaPipe setup: module-level aliases for the two MediaPipe solutions used below.
# mp_holistic provides the Holistic landmark detector and the POSE/HAND connection sets;
# mp_drawing provides draw_landmarks for overlaying detected landmarks on a frame.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [3]:
def read_actions_from_json(json_path="/workspaces/asl_detection/machine_learning/datasets/asl_word_detection/actions_list.json"):
    """Load the action labels from a JSON file.

    The file is expected to contain a JSON object whose ``"actions"`` key
    holds a non-empty list of labels.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        A NumPy array built from the ``"actions"`` list, or ``None`` when the
        file is missing, is not valid JSON, or holds no usable ``"actions"``
        list. Failures are reported on stdout rather than raised, so callers
        only need to check for ``None``.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        actions = data.get("actions") if isinstance(data, dict) else None
        if isinstance(actions, list) and actions:
            print(f"Actions aus JSON geladen: {actions}")
            return np.array(actions)

        # Previously this case returned None without any message, which left
        # the caller reporting a missing file although the file was readable.
        print(f"Keine Actions in {json_path} gefunden")
    except FileNotFoundError:
        print(f"Keine actions_list.json gefunden unter {json_path}")
    except json.JSONDecodeError:
        print(f"Fehler beim Lesen der JSON-Datei {json_path}")
    except Exception as e:
        # Deliberate best-effort: any other failure is reported, not raised.
        print(f"Unerwarteter Fehler beim Lesen der JSON: {e}")
    return None

In [4]:
class SignLanguageModel(nn.Module):
    """Bidirectional LSTM classifier with attention pooling over the sequence axis.

    The LSTM is built with ``batch_first=True``, so inputs are laid out as
    ``(batch, sequence, input_size)``. The per-step LSTM outputs are reduced
    to a single vector by a softmax-weighted sum over the sequence axis and
    then mapped to ``num_classes`` logits by a small fully connected head.

    Args:
        num_classes: Number of output logits.
        input_size: Number of features per time step.
        hidden_size: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used inside the classification head.
    """

    def __init__(self, num_classes, input_size=126, hidden_size=64, num_layers=1, dropout=0.3):
        super().__init__()

        # Both directions are concatenated, doubling the feature width.
        encoded_width = hidden_size * 2

        # Recurrent encoder; the number of layers is configurable (e.g. from JSON).
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Attention scorer: one bounded score per time step.
        scorer = [nn.Linear(encoded_width, 1), nn.Tanh()]
        self.attention = nn.Sequential(*scorer)

        # Classification head (BatchNorm is part of the head).
        head = [
            nn.Linear(encoded_width, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes),
        ]
        self.fc = nn.Sequential(*head)

        # Replace the default initialisation of every sub-module.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one sub-module: Xavier for linear layers, orthogonal for LSTM weights, zero biases."""
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
            return

        if not isinstance(module, nn.LSTM):
            return

        for param_name, tensor in module.named_parameters():
            if "weight" in param_name:
                nn.init.orthogonal_(tensor)
            elif "bias" in param_name:
                nn.init.zeros_(tensor)

    def forward(self, x):
        """Return the class logits for a batch of sequences."""
        # Encode every time step.
        hidden_states, _ = self.lstm(x)

        # Score each step, then normalise the scores along the sequence axis.
        step_scores = self.attention(hidden_states)
        step_weights = step_scores.softmax(dim=1)

        # Attention-weighted sum collapses the sequence axis.
        pooled = (step_weights * hidden_states).sum(dim=1)

        return self.fc(pooled)

In [5]:
def list_available_cameras(max_index=10):
    """Probe camera indices and return those that deliver a frame.

    Args:
        max_index: Number of indices to probe, starting at 0. Defaults to 10,
            the range that was previously hard-coded.

    Returns:
        A list of the indices for which the capture opened and one frame
        could be read.
    """
    available_cameras = []
    for index in range(max_index):
        cap = cv2.VideoCapture(index)
        try:
            # An index only counts when the device opens AND yields a frame.
            if cap.isOpened():
                ret, _ = cap.read()
                if ret:
                    available_cameras.append(index)
        finally:
            # Always release the handle, including when read() raises.
            cap.release()
    return available_cameras

In [7]:
def _read_model_info(json_path):
    """Return ``(hidden_size, num_layers)`` from the ``model_info`` section of the actions JSON.

    Missing entries fall back to ``hidden_size=64`` and ``num_layers=2``.
    """
    with open(json_path, "r") as f:
        model_info = json.load(f).get("model_info") or {}
    return model_info.get("hidden_size", 64), model_info.get("num_layers", 2)


def _hand_keypoints(hand_landmarks):
    """Flatten one hand's landmarks to ``[x0, y0, z0, x1, ...]`` (63 values); all zeros when absent."""
    keypoints = np.zeros(21 * 3)
    if hand_landmarks:
        for i, lm in enumerate(hand_landmarks.landmark):
            keypoints[i * 3 : i * 3 + 3] = (lm.x, lm.y, lm.z)
    return keypoints


def _draw_top_predictions(image, top_actions):
    """Overlay the ranked ``(action, probability)`` pairs with a confidence bar on ``image``."""
    # Flag a suspiciously dominant top prediction (possible overfitting).
    overfitting_warning = top_actions[0][1] > 0.95 and (
        len(top_actions) < 2 or top_actions[0][1] - top_actions[1][1] > 0.8
    )

    # Heading
    cv2.putText(
        image,
        "TOP 5 PREDICTIONS:",
        (10, 25),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.8,
        (255, 255, 255),
        2,
        cv2.LINE_AA,
    )

    for i, (action, prob_value) in enumerate(top_actions):
        # Colour code by confidence (BGR): green > 0.3, yellow > 0.1, red otherwise.
        if prob_value > 0.3:
            color = (0, 255, 0)
        elif prob_value > 0.1:
            color = (0, 255, 255)
        else:
            color = (0, 0, 255)

        # Only the first entry can carry the warning marker.
        warning_text = " (Overfitting?)" if overfitting_warning and i == 0 else ""

        # Probability bar, at most 200 pixels wide.
        bar_width = int(prob_value * 200)
        cv2.rectangle(image, (140, 40 + i * 25), (140 + bar_width, 55 + i * 25), color, -1)

        # Rank and label
        cv2.putText(
            image,
            f"{i+1}. {action}:",
            (10, 50 + i * 25),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (255, 255, 255),
            2,
            cv2.LINE_AA,
        )

        # Confidence value
        cv2.putText(
            image,
            f"{prob_value:.2f}{warning_text}",
            (145 + bar_width, 50 + i * 25),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            (255, 255, 255),
            1,
            cv2.LINE_AA,
        )


def start_webcam_detection():
    """Run live sign recognition on the webcam feed until the user quits.

    Loads the action labels and the model hyper-parameters from the actions
    JSON, restores the trained ``SignLanguageModel`` weights, then classifies
    a sliding window of the last 102 frames of hand landmarks and overlays
    the five most probable actions on the video. Keys ``0``-``9`` switch the
    camera and ``q`` quits.

    Returns:
        None. Returns early, after printing a message, when the actions
        cannot be loaded or the initial camera cannot be opened.
    """
    print("\nStarting webcam detection...")

    # One path for both the labels and the model info, so they cannot drift apart.
    json_file = "/workspaces/asl_detection/machine_learning/datasets/asl_word_detection/actions_list.json"
    actions = read_actions_from_json(json_file)
    if actions is None:
        print("Keine actions_list.json gefunden. Bitte stellen Sie sicher, dass die Datei existiert.")
        return

    print(f"\nLoaded {len(actions)} actions: {actions}")

    hidden_size, num_layers = _read_model_info(json_file)
    print(f"Initializing model with: hidden_size={hidden_size}, num_layers={num_layers}")

    # Initialize model
    input_size = 126  # Only hands: (21 + 21) * 3 = 126
    model = SignLanguageModel(
        input_size=input_size,
        hidden_size=hidden_size,
        num_classes=len(actions),
        num_layers=num_layers,
    )

    # Load the trained weights.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_path = "/workspaces/asl_detection/machine_learning/models/asl_word_detection/best_model.pth"
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(model_path, map_location=device)
    state_dict = checkpoint["model_state_dict"] if "model_state_dict" in checkpoint else checkpoint

    # strict=False tolerates key mismatches; report them so a checkpoint that
    # does not fit the model is not mistaken for a successfully restored one.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            "Warning: checkpoint does not fully match the model "
            f"(missing keys: {list(load_result.missing_keys)}, "
            f"unexpected keys: {list(load_result.unexpected_keys)})"
        )

    # Run inference on the same device the checkpoint was mapped to.
    model.to(device)
    print(f"Modell erfolgreich geladen von {model_path}!")
    model.eval()

    sequence_length = 102  # Window length used during training
    top_k = 5  # Number of ranked predictions to display

    current_camera = 0
    cap = cv2.VideoCapture(current_camera)
    sequence = []

    print("\nControls:")
    print("0-9: Switch camera")
    print("q: Quit")

    try:
        if not cap.isOpened():
            print(f"Could not open camera {current_camera}")
            return

        with mp_holistic.Holistic(
            min_detection_confidence=0.5, min_tracking_confidence=0.5
        ) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    print("Fehler beim Lesen des Videoframes")
                    break

                # The frame is converted to RGB for processing and back to BGR for display.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False
                results = holistic.process(image)
                image.flags.writeable = True
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

                # Draw landmarks (the pose is shown but not used for prediction).
                for landmarks, connections in (
                    (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
                    (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
                    (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
                ):
                    if landmarks:
                        mp_drawing.draw_landmarks(image, landmarks, connections)

                # Only the hand keypoints feed the model: left hand first, then right.
                keypoints = np.concatenate(
                    [
                        _hand_keypoints(results.left_hand_landmarks),
                        _hand_keypoints(results.right_hand_landmarks),
                    ]
                )
                sequence.append(keypoints)
                sequence = sequence[-sequence_length:]

                # Predict only once a full window of frames is available.
                if len(sequence) == sequence_length:
                    with torch.no_grad():
                        sequence_tensor = torch.as_tensor(
                            np.array(sequence), dtype=torch.float32, device=device
                        ).unsqueeze(0)
                        outputs = model(sequence_tensor)
                        probs = torch.nn.functional.softmax(outputs, dim=1)

                    # Pair labels with probabilities and rank them, highest first.
                    action_probs = sorted(
                        zip(actions, probs[0].tolist()),
                        key=lambda pair: pair[1],
                        reverse=True,
                    )
                    _draw_top_predictions(image, action_probs[:top_k])

                # Show the current camera number on every frame, not only once
                # the prediction window is full.
                cv2.putText(
                    image,
                    f"Camera {current_camera}",
                    (10, image.shape[0] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.7,
                    (255, 255, 255),
                    2,
                    cv2.LINE_AA,
                )

                cv2.imshow("ASL Detection", image)

                # Process keyboard input
                key = cv2.waitKey(10) & 0xFF
                if key == ord("q"):
                    break
                # Number keys (0-9) select a camera index.
                elif ord("0") <= key <= ord("9"):
                    new_camera = key - ord("0")  # Convert ASCII to number
                    if new_camera != current_camera:
                        print(f"Switching to camera {new_camera}")
                        # Open the new camera before dropping the old one, so a
                        # bad index does not silently end the detection loop.
                        new_cap = cv2.VideoCapture(new_camera)
                        if new_cap.isOpened():
                            cap.release()
                            cap = new_cap
                            current_camera = new_camera
                            sequence = []  # Frames from the old camera are discarded
                        else:
                            new_cap.release()
                            print(
                                f"Camera {new_camera} is not available, "
                                f"staying on camera {current_camera}"
                            )
    finally:
        # Release the camera and close the window even if the loop raised.
        cap.release()
        cv2.destroyAllWindows()

In [None]:
# Entry point: start the live webcam detection when this code is run directly.
if __name__ == "__main__":
    start_webcam_detection() 