In [None]:
import cv2
import torch
import numpy as np
import mediapipe as mp
import json
import os
from torch.nn.functional import softmax
import torch.nn as nn

# --- Locations of the trained model, the label map and the keypoint data ---
MODEL_PATH = "/workspaces/asl_detection/machine_learning/models/lstm/best_lstm_model.pth"
LABELS_PATH = "/workspaces/asl_detection/machine_learning/models/lstm/label_to_index.json"
# NOTE(review): named TRAIN_* but the pattern points at "test_normalized_keypoints" - confirm.
TRAIN_KEYPOINTS_PATH = "/workspaces/asl_detection/machine_learning/datasets/own_dataset/test_normalized_keypoints/*.npy"
# NOTE(review): despite the *_DIR name this value is a glob pattern, not a directory.
NORMALIZED_DIR  = "/workspaces/asl_detection/machine_learning/datasets/own_dataset/normalized_keypoints/*.npy"

# --- Model input geometry and compute device ---
SEQUENCE_LENGTH = 102  # Frames per sequence fed to the LSTM
INPUT_SIZE = 225  # Features per frame: (33 pose + 21 + 21 hand landmarks) * 3 coordinates
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Load label mapping (label -> index) and build the inverse lookup ---
with open(LABELS_PATH, "r") as f:
    label_to_index = json.load(f)
index_to_label = dict((index, label) for label, index in label_to_index.items())
NUM_CLASSES = len(label_to_index)
# --- Define LSTM model ---
class LSTMModel(nn.Module):
    """Stacked LSTM classifier that scores a sequence from its last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys; keep them stable.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class logits for a batch-first input of shape (batch, time, features)."""
        per_step_output, _state = self.lstm(x)
        # Only the final time step is used as the summary of the whole sequence.
        final_step = per_step_output[:, -1, :]
        return self.fc(self.dropout(final_step))

# Initialize model. hidden_size / num_layers must match the saved checkpoint,
# otherwise load_state_dict below raises on the mismatching tensors.
model = LSTMModel(input_size=INPUT_SIZE, hidden_size=512, num_layers=3, num_classes=NUM_CLASSES)
device = DEVICE  # Lowercase alias kept for code that refers to `device`
model.to(device)

# Load the trained weights; without this step the network predicts with its
# random initial parameters.
checkpoint = torch.load(MODEL_PATH, map_location=device)
# NOTE(review): assumes the file holds a state_dict, optionally wrapped under
# "model_state_dict" or "state_dict" - confirm against the training script.
if isinstance(checkpoint, dict):
    for wrapper_key in ("model_state_dict", "state_dict"):
        if wrapper_key in checkpoint:
            checkpoint = checkpoint[wrapper_key]
            break
model.load_state_dict(checkpoint)

# Switch to inference mode so the dropout layer is disabled
# (torch.no_grad() only turns off gradient tracking, not dropout).
model.eval()

# --- Initialize Mediapipe Holistic ---
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils  # For drawing keypoints
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# --- Start camera stream ---
cap = cv2.VideoCapture(0)  # Device index 0

# Rolling window of the most recent frames (at most SEQUENCE_LENGTH) for the LSTM model
sequence = []
last_valid_keypoints = np.zeros(INPUT_SIZE)  # Default value for the first frame

# --- Normalize live keypoints ---
# Cache of (min_vals, max_vals) keyed by directory, so the statistics are read
# from disk once instead of on every camera frame.
_KEYPOINT_RANGE_CACHE = {}


def normalize_live_keypoints(live_keypoints, stats_dir=None):
    """Min-max scale live keypoints with the stored per-feature statistics.

    Args:
        live_keypoints: Array of keypoint features; it must broadcast against
            the arrays stored in ``keypoint_min.npy`` / ``keypoint_max.npy``.
        stats_dir: Directory that contains ``keypoint_min.npy`` and
            ``keypoint_max.npy``. When omitted, the directory part of
            ``NORMALIZED_DIR`` is used. A trailing glob component such as
            ``*.npy`` is ignored in either case.

    Returns:
        ``(live_keypoints - min) / (max - min + 1e-8)`` as a numpy array.

    Raises:
        FileNotFoundError: If either statistics file does not exist.
    """
    location = os.fspath(NORMALIZED_DIR if stats_dir is None else stats_dir)
    # NORMALIZED_DIR is configured as a glob pattern (".../*.npy"); joining a
    # file name onto the pattern can never resolve, so drop the wildcard part.
    if any(symbol in os.path.basename(location) for symbol in "*?["):
        location = os.path.dirname(location)

    if location not in _KEYPOINT_RANGE_CACHE:
        _KEYPOINT_RANGE_CACHE[location] = (
            np.load(os.path.join(location, "keypoint_min.npy")),
            np.load(os.path.join(location, "keypoint_max.npy")),
        )
    min_vals, max_vals = _KEYPOINT_RANGE_CACHE[location]

    # The small epsilon avoids division by zero for features whose min equals max.
    return (live_keypoints - min_vals) / (max_vals - min_vals + 1e-8)

# --- Extract keypoints ---
def extract_keypoints(frame, results):
    """Build one flat feature vector from the pose and hand landmarks.

    Args:
        frame: The processed image; not used by this function.
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` sequence whose items provide ``x``, ``y`` and ``z``.

    Returns:
        1-D numpy array laid out as pose (33 * 3), left hand (21 * 3) and
        right hand (21 * 3). A missing group is filled with zeros.
    """
    def _flatten(landmark_group, landmark_count):
        # Zero block of the expected width when nothing was detected.
        if not landmark_group:
            return np.zeros(landmark_count * 3)
        coordinates = [[point.x, point.y, point.z] for point in landmark_group.landmark]
        return np.array(coordinates).flatten()

    parts = (
        _flatten(results.pose_landmarks, 33),
        _flatten(results.left_hand_landmarks, 21),
        _flatten(results.right_hand_landmarks, 21),
    )
    return np.concatenate(parts)


# Inference must run with dropout disabled; torch.no_grad() alone does not do that.
model.eval()

# try/finally guarantees the camera and the preview window are released even
# when a frame fails to process or the kernel is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Mediapipe expects RGB input, OpenCV delivers BGR
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(frame_rgb)

        # Draw keypoints on frame
        mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

        keypoints = extract_keypoints(frame, results)
        print ("Live-Keypoints: ", keypoints[:10])

        keypoints = keypoints.reshape(-1, 3)  # Convert back to (N, 3) format

        # Convert X/Y from (0,1) to (-1,1) range
        keypoints[:, :2] = (keypoints[:, :2] - 0.5) * 2

        # Scale depth (Z-values) by the L2 norm of all Z-values in this frame
        keypoints[:, 2] = keypoints[:, 2] / (np.linalg.norm(keypoints[:, 2]) + 1e-8)

        # Flatten back to 1D
        keypoints = keypoints.flatten()

        # normalize_live_keypoints is defined with a single required argument
        normalized_keypoints = normalize_live_keypoints(keypoints)
        print ("Normalisierte Live-Keypoints: ", normalized_keypoints[:10])

        # Update the rolling window with the normalized keypoints
        sequence.append(normalized_keypoints)
        if len(sequence) > SEQUENCE_LENGTH:
            sequence.pop(0)

        # Make a prediction when enough frames are collected
        if len(sequence) == SEQUENCE_LENGTH:
            # Stack into one ndarray first; building a tensor from a list of
            # ndarrays is very slow. Result shape: (1, SEQUENCE_LENGTH, features).
            window = np.stack(sequence)[np.newaxis, ...]
            input_tensor = torch.as_tensor(window, dtype=torch.float32).to(DEVICE)
            with torch.no_grad():
                output = model(input_tensor)
                probabilities = softmax(output, dim=1)
                confidence, predicted_class = torch.max(probabilities, 1)
                predicted_label = index_to_label[int(predicted_class.item())]

            # Display the prediction on the frame
            text = f"{predicted_label} ({confidence.item()*100:.2f}%)"
            cv2.putText(frame, text, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show frame
        cv2.imshow("ASL Detection", frame)

        # Quit with 'q'
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

In [None]:
import random
import glob

# Preview one stored keypoint file, chosen at random from TRAIN_KEYPOINTS_PATH
train_files = glob.glob(TRAIN_KEYPOINTS_PATH)
if not train_files:
    print("⚠️ No training keypoints found!")
else:
    random_train_file = random.choice(train_files)
    train_sample = np.load(random_train_file)

    print(f"📂 Loaded training sample from {random_train_file}")
    # Only the first ten values are shown to keep the output short
    print("Training Keypoints (first 10 values):", train_sample.ravel()[:10])
