In [5]:
import cv2
import torch
import numpy as np
import mediapipe as mp
import json

# Paths
MODEL_PATH = "/workspaces/asl_detection/machine_learning/models/lstm/best_lstm_model.pth"
LABELS_PATH = "/workspaces/asl_detection/machine_learning/models/lstm/label_to_index.json"

# Load labels
with open(LABELS_PATH, "r") as f:
    label_to_index = json.load(f)
    index_to_label = {v: k for k, v in label_to_index.items()}

# Debugging: Check label mappings
print("Label-Index-Mapping:", label_to_index)
print("Index-Label-Mapping:", index_to_label)

# Model class
def load_model(model_path, input_size, hidden_size, num_layers, num_classes, device):
    class LSTMModel(torch.nn.Module):
        def __init__(self, input_size, hidden_size, num_layers, num_classes):
            super(LSTMModel, self).__init__()
            self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
            self.batch_norm = torch.nn.BatchNorm1d(hidden_size)
            self.dropout = torch.nn.Dropout(0.3)
            self.fc = torch.nn.Linear(hidden_size, num_classes)
        
        def forward(self, x):
            out, _ = self.lstm(x)
            out = self.batch_norm(out[:, -1, :])
            out = self.dropout(out)
            out = self.fc(out)
            return torch.nn.functional.softmax(out, dim=1)  # Softmax added
    
    model = LSTMModel(input_size, hidden_size, num_layers, num_classes)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    return model

# Model parameters
INPUT_SIZE = 300
SEQUENCE_LENGTH = 102
HIDDEN_SIZE = 512
NUM_LAYERS = 3
NUM_CLASSES = len(label_to_index)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load trained model
model = load_model(MODEL_PATH, INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, NUM_CLASSES, device)

# MediaPipe setup
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

def extract_keypoints(results):
    keypoints = []
    # Pose landmarks (33 points)
    if results.pose_landmarks:
        keypoints.extend([[p.x, p.y, p.z, p.visibility] for p in results.pose_landmarks.landmark])
    else:
        keypoints.extend([[0, 0, 0, 0]] * 33)
    # Hand landmarks (21 points each)
    for hand in [results.left_hand_landmarks, results.right_hand_landmarks]:
        if hand:
            keypoints.extend([[p.x, p.y, p.z, 1] for p in hand.landmark])
        else:
            keypoints.extend([[0, 0, 0, 1]] * 21)
    return np.array(keypoints).flatten()

# Camera setup
cap = cv2.VideoCapture(0)
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)
frames = []

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = holistic.process(image)
    
    # Draw keypoints
    mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    
    # Extract keypoints
    keypoints = extract_keypoints(results)
    frames.append(keypoints)
    
    if len(frames) > SEQUENCE_LENGTH:
        frames.pop(0)
    
    # Predict if enough frames collected
    if len(frames) == SEQUENCE_LENGTH:
        input_tensor = torch.tensor([frames], dtype=torch.float32).to(device)
        with torch.no_grad():
            output = model(input_tensor)
            probabilities = torch.nn.functional.softmax(output, dim=1)
            confidence, predicted = torch.max(probabilities, 1)
            label = index_to_label.get(predicted.item(), "Unknown")
        
        # Debugging: Check model output
        print(f"Raw output: {output}")
        print(f"Probabilities: {probabilities}")
        print(f"Predicted index: {predicted.item()}, Label: {label}")
        
        # Display label
        cv2.putText(frame, f"{label}: {confidence.item()*100:.2f}%", (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    
    cv2.imshow('ASL Recognition', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

Label-Index-Mapping: {'I': 0, 'about': 1, 'accident': 2, 'add': 3, 'africa': 4, 'after': 5, 'ago': 6, 'alone': 7, 'always': 8, 'animal': 9, 'any': 10, 'apple': 11, 'appointment': 12, 'argue': 13, 'ask': 14, 'australia': 15, 'baby': 16, 'bad': 17, 'balance': 18, 'banana': 19, 'bar': 20, 'barely': 21, 'basketball': 22, 'beard': 23, 'bed': 24, 'before': 25, 'between': 26, 'bird': 27, 'black': 28, 'bowling': 29, 'brother': 30, 'buy': 31, 'california': 32, 'call': 33, 'can': 34, 'candy': 35, 'careful': 36, 'carrot': 37, 'cat': 38, 'champion': 39, 'change': 40, 'chat': 41, 'cheat': 42, 'check': 43, 'city': 44, 'cold': 45, 'computer': 46, 'convince': 47, 'cool': 48, 'corn': 49, 'cousin': 50, 'cow': 51, 'cry': 52, 'dark': 53, 'daughter': 54, 'day': 55, 'deaf': 56, 'decide': 57, 'decorate': 58, 'delay': 59, 'delicious': 60, 'dive': 61, 'dog': 62, 'drink': 63, 'drop': 64, 'eat': 65, 'environment': 66, 'family': 67, 'far': 68, 'fast': 69, 'fat': 70, 'fault': 71, 'feel': 72, 'few': 73, 'finish': 7

W0000 00:00:1741642423.984730    4063 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741642424.009466    4063 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741642424.011402    4062 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741642424.012232    4066 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741642424.012245    4061 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1741642424.021089    4069 inference_feedback_manager.cc:114] Feedback manager 

Raw output: tensor([[0.0040, 0.0056, 0.0210, 0.0090, 0.0026, 0.0060, 0.0033, 0.0047, 0.0073,
         0.0029, 0.0052, 0.0044, 0.0030, 0.0049, 0.0050, 0.0039, 0.0068, 0.0068,
         0.0045, 0.0037, 0.0071, 0.0032, 0.0029, 0.0036, 0.0034, 0.0038, 0.0062,
         0.0034, 0.0017, 0.0051, 0.0030, 0.0024, 0.0042, 0.0036, 0.0037, 0.0024,
         0.0091, 0.0050, 0.0020, 0.0028, 0.0049, 0.0067, 0.0059, 0.0044, 0.0027,
         0.0022, 0.0027, 0.0116, 0.0036, 0.0087, 0.0025, 0.0035, 0.0029, 0.0038,
         0.0034, 0.0038, 0.0037, 0.0032, 0.0027, 0.0034, 0.0036, 0.0025, 0.0038,
         0.0034, 0.0065, 0.0024, 0.0058, 0.0021, 0.0047, 0.0022, 0.0018, 0.0028,
         0.0047, 0.0041, 0.0042, 0.0035, 0.0052, 0.0034, 0.0018, 0.0033, 0.0045,
         0.0049, 0.0123, 0.0034, 0.0034, 0.0027, 0.0088, 0.0034, 0.0025, 0.0032,
         0.0032, 0.0032, 0.0034, 0.0020, 0.0032, 0.0064, 0.0039, 0.0031, 0.0038,
         0.0024, 0.0024, 0.0063, 0.0023, 0.0073, 0.0027, 0.0034, 0.0021, 0.0047,
         0.0051,