In [22]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import json
import pyttsx3
from collections import deque
from statistics import mode
import time

In [23]:
# Load the trained Keras classifier from disk.
model = tf.keras.models.load_model("models/best_asl_landmark_model.h5")

# Read the label map stored next to the dataset cache.
with open("dataset/static/cache/label_map.json", "r") as label_file:
    label_map = json.load(label_file)

# Invert the map (value -> key) so a predicted class index can be looked up
# by the capture loop to recover its label.
idx_to_label = dict((index, label) for label, index in label_map.items())

In [24]:
# Text-to-speech engine used to read the formed sentence aloud.
engine = pyttsx3.init()

# Speech settings, applied in the same order as before:
# rate = speed of speech, volume 1.0 = max volume.
for prop_name, prop_value in (("rate", 150), ("volume", 1.0)):
    engine.setProperty(prop_name, prop_value)

# Use the first voice the engine reports.
voices = engine.getProperty("voices")
engine.setProperty("voice", voices[0].id)

In [25]:
# --- Mutable state shared with the capture loop -----------------------------
prediction_buffer = deque(maxlen=10)  # sliding window of recent per-frame labels
previous_prediction = ""              # last stable label that was acted on
formed_word = ""                      # text assembled so far
debounce_delay = 1.5                  # seconds
last_trigger_time = 0                 # time of the last accepted gesture

In [26]:
def get_stable_prediction(new_pred):
    """Record ``new_pred`` and return the most common label in the window.

    The label is appended to the module-level ``prediction_buffer``. While the
    buffer holds fewer entries than its ``maxlen`` an empty string is returned;
    once it is full the result is ``statistics.mode`` of its contents.
    """
    prediction_buffer.append(new_pred)

    # Not enough samples yet to call the prediction stable.
    if len(prediction_buffer) != prediction_buffer.maxlen:
        return ""

    return mode(prediction_buffer)

def speak_text(text):
    """Speak ``text`` aloud through the module-level pyttsx3 ``engine``.

    ``None``, empty, or whitespace-only text is ignored.

    ``engine.say`` only queues the utterance; the queue is processed by
    ``engine.runAndWait``, which blocks until speech has finished. Without
    that call nothing is ever spoken.
    """
    if not text or not text.strip():
        return
    engine.say(text)
    engine.runAndWait()


In [27]:
# MediaPipe handles: landmark drawing utilities and the hands solution module.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Hand tracker limited to one hand; both confidence thresholds are 0.7.
hands = mp_hands.Hands(
    min_tracking_confidence=0.7,
    min_detection_confidence=0.7,
    max_num_hands=1,
)

In [29]:
# Real-time loop: grab webcam frames, classify the hand landmarks, and turn
# stable predictions into text / commands. Press "q" in the window to quit.
cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and the OpenCV window are released even if
# the loop raises or the kernel is interrupted; otherwise the device stays
# locked until the kernel is restarted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the frame, then convert BGR -> RGB before handing it to MediaPipe.
        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        prediction_text = "No Hand Detected"
        current_time = time.time()

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Flatten every landmark into [x0, y0, z0, x1, y1, z1, ...].
                landmarks = [
                    coord
                    for lm in hand_landmarks.landmark
                    for coord in (lm.x, lm.y, lm.z)
                ]

                input_data = np.array(landmarks).reshape(1, -1)
                # verbose=0 keeps Keras from printing a progress bar on every frame.
                prediction = model.predict(input_data, verbose=0)
                predicted_class = int(np.argmax(prediction))
                confidence = np.max(prediction)

                predicted_label = idx_to_label[predicted_class]
                stable_pred = get_stable_prediction(predicted_label)
                prediction_text = f"{stable_pred} ({confidence:.2f})"

                # Handle gesture-based commands with debounce.
                # An empty stable prediction (buffer not full yet) is never a
                # gesture and must not consume the debounce window.
                is_new_gesture = bool(stable_pred) and stable_pred != previous_prediction
                debounce_elapsed = current_time - last_trigger_time > debounce_delay

                if is_new_gesture and debounce_elapsed:
                    previous_prediction = stable_pred
                    last_trigger_time = current_time

                    if stable_pred == "SPACE":
                        formed_word += " "
                        print("Added SPACE")
                    elif stable_pred == "DELETE":
                        formed_word = formed_word[:-1]
                        print("Deleted last char")
                    elif stable_pred == "SPEAK":
                        print("Speaking:", formed_word)
                        speak_text(formed_word)
                        formed_word = ""
                    # `in` on a str is a substring test, so require exactly one
                    # character; otherwise "" or a run like "AB" would be
                    # treated as a letter.
                    elif len(stable_pred) == 1 and stable_pred in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
                        formed_word += stable_pred
                        print("Added:", stable_pred)

        # Show both current prediction and formed sentence
        cv2.putText(frame, f"Prediction: {prediction_text}", (10, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        cv2.putText(frame, f"Sentence: {formed_word}", (10, 80),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 0), 2)

        cv2.imshow("ASL Real-Time Detection", frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


Added: A
Added SPACE
Added: B
Deleted last char
