In [17]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import json
import pyttsx3
from collections import deque
from statistics import mode

In [18]:
# Trained landmark classifier, restored from its HDF5 checkpoint.
model = tf.keras.models.load_model("models/best_asl_landmark_model.h5")

# label_map is stored as JSON; it is inverted below so that a predicted
# class index can be turned back into its label.
with open("dataset/static/cache/label_map.json", "r") as f:
    label_map = json.load(f)

# Lookup table keyed by label_map's values, yielding label_map's keys.
idx_to_label = dict(zip(label_map.values(), label_map.keys()))

In [31]:
# Text-to-speech engine used to read stable predictions aloud.
engine = pyttsx3.init()

# Speech rate and output volume (1.0 is the value used for full volume).
engine.setProperty("rate", 150)
engine.setProperty("volume", 1.0)

# Use the first voice reported by the engine.
voices = engine.getProperty("voices")
engine.setProperty("voice", voices[0].id)

# Smoothing state shared by get_stable_prediction / speak_prediction:
#   previous_prediction - last label handed to the speech engine
#   prediction_buffer   - sliding window of the 10 most recent predictions
previous_prediction = ""
prediction_buffer = deque(maxlen=10)

In [37]:
def get_stable_prediction(new_pred):
    """Push ``new_pred`` into the sliding window and return the smoothed label.

    The module-level ``prediction_buffer`` is updated in place. Until the
    buffer holds ``maxlen`` entries an empty string is returned; once it is
    full, the most common entry in the window is returned.
    """
    prediction_buffer.append(new_pred)

    # Not enough history yet: report "no stable prediction".
    if len(prediction_buffer) != prediction_buffer.maxlen:
        return ""

    return mode(prediction_buffer)

def speak_prediction(prediction):
    """Speak ``prediction`` aloud if it is non-empty and differs from the last one spoken.

    Updates the module-level ``previous_prediction`` so the same label is not
    repeated on consecutive calls.

    Args:
        prediction: Label to speak; an empty string means "nothing to say".
    """
    global previous_prediction

    # Skip empty predictions and repeats of what was just spoken.
    if prediction == "" or prediction == previous_prediction:
        return

    previous_prediction = prediction
    engine.say(prediction)
    # say() only queues the utterance; the queue must be processed for any
    # audio to be produced. This call blocks until the utterance finishes.
    engine.runAndWait()


In [23]:
# Shorthand handles for the MediaPipe drawing helpers and hands solution.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Hand tracker limited to a single hand, with 0.7 thresholds for both
# detection and tracking confidence.
hands = mp_hands.Hands(
    min_tracking_confidence=0.7,
    min_detection_confidence=0.7,
    max_num_hands=1,
)

In [None]:
# Real-time loop: grab webcam frames, extract hand landmarks, classify them,
# smooth the prediction, speak it, and overlay the result. Press "q" to quit.
cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and preview window are released even if a
# frame raises or the kernel is interrupted mid-loop; otherwise the device
# stays held until the kernel is restarted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the frame horizontally, then convert OpenCV's BGR image to
        # RGB before handing it to the hand tracker.
        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        prediction_text = "No Hand Detected"

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Flatten every landmark into x, y, z triples: one feature row.
                landmarks = [
                    coord
                    for lm in hand_landmarks.landmark
                    for coord in (lm.x, lm.y, lm.z)
                ]
                input_data = np.array(landmarks).reshape(1, -1)

                # verbose=0 suppresses the per-call progress bar, which would
                # otherwise be printed for every frame.
                prediction = model.predict(input_data, verbose=0)
                predicted_class = int(np.argmax(prediction))
                confidence = float(np.max(prediction))

                # Convert index to label, then smooth over recent frames.
                predicted_label = idx_to_label[predicted_class]
                stable_pred = get_stable_prediction(predicted_label)

                # get_stable_prediction returns "" until its buffer is full;
                # show a placeholder instead of a blank label with a score.
                # Note: confidence belongs to the current frame's prediction,
                # not to the smoothed label.
                if stable_pred:
                    prediction_text = f"{stable_pred} ({confidence:.2f})"
                else:
                    prediction_text = "Stabilizing..."
                speak_prediction(stable_pred)

        # Display prediction on the frame
        cv2.putText(frame, prediction_text, (10, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow("ASL Real-Time Detection", frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()