In [1]:
import cv2, threading, queue, time, json, re
import numpy as np
import torch
import sounddevice as sd
import whisper

from helper_functions import create_bold_stickman, load_gt_landmarks, sanitize_word

In [2]:
# Show which input/output device indices sounddevice uses by default.
default_in, default_out = sd.default.device
print("Default input device index :", default_in)
print("Default output device index:", default_out)

Default input device index : 1
Default output device index: 3


In [3]:
# List every audio device with its index, host-API index and channel counts
print("\nAll audio devices:")
for idx, dev in enumerate(sd.query_devices()):
    # each device entry is a mapping; its keys fill the named fields below
    row = "{:>2}: {name} — {hostapi} — max_in:{max_input_channels}  max_out:{max_output_channels}"
    print(row.format(idx, **dev))


All audio devices:
 0: Microsoft Sound Mapper - Input — 0 — max_in:2  max_out:0
 1: Microphone (HD Webcam eMeet C96 — 0 — max_in:2  max_out:0
 2: Microsoft Sound Mapper - Output — 0 — max_in:0  max_out:2
 3: Speakers (Realtek High Definiti — 0 — max_in:0  max_out:8
 4: ES-G27F2Q (NVIDIA High Definiti — 0 — max_in:0  max_out:2
 5: H24V13 (NVIDIA High Definition  — 0 — max_in:0  max_out:2
 6: Realtek Digital Output (Realtek — 0 — max_in:0  max_out:2
 7: Primary Sound Capture Driver — 1 — max_in:2  max_out:0
 8: Microphone (HD Webcam eMeet C960) — 1 — max_in:2  max_out:0
 9: Primary Sound Driver — 1 — max_in:0  max_out:2
10: Speakers (Realtek High Definition Audio) — 1 — max_in:0  max_out:8
11: ES-G27F2Q (NVIDIA High Definition Audio) — 1 — max_in:0  max_out:2
12: H24V13 (NVIDIA High Definition Audio) — 1 — max_in:0  max_out:2
13: Realtek Digital Output (Realtek High Definition Audio) — 1 — max_in:0  max_out:2
14: Speakers (Realtek High Definition Audio) — 2 — max_in:0  max_out:2
15: ES-

In [4]:
# Ground-truth landmark sequences; the webcam loop looks them up by word.
gt_land = load_gt_landmarks("landmark_data")

# The "no" entry is displayed for any word that has no landmarks of its own,
# so refuse to continue without it.
has_fallback = "no" in gt_land
if not has_fallback:
    raise RuntimeError("You must have a 'no.json' in landmark_data for missing-word fallback")

In [5]:
# —————————————————————————————————————————————
# Load the Whisper "tiny" model, on the GPU when CUDA is available
# —————————————————————————————————————————————
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# load_model places the weights on the requested device directly
model = whisper.load_model("tiny", device=device)

In [9]:


# —————————————————————————————————————————————
#  Audio constants & hand-off queues
# —————————————————————————————————————————————
SAMPLERATE = 16000                   # capture rate in samples per second
BLOCK_SIZE = SAMPLERATE              # frames per callback block (1 s)
OVERLAP    = int(0.2 * SAMPLERATE)   # samples kept between transcriptions (0.2 s)

# audio_q: raw blocks from the audio callback to the transcription worker
# word_q : recognised words from the worker to the webcam loop
audio_q, word_q = queue.Queue(), queue.Queue()

def audio_callback(indata, frames, time, status):
    """Stream callback: copy one block of raw int16 audio and enqueue it.

    Parameters
    ----------
    indata : buffer
        Raw sample buffer handed over by the input stream; interpreted as int16.
    frames : int
        Number of frames in ``indata``.
    time : object
        Timing information supplied by the stream (unused). The name shadows
        the ``time`` module inside this function only; it is kept because it
        is part of the callback signature.
    status : object
        Status flags for this block; printed when truthy.

    The block is put on ``audio_q`` as an int16 array of shape ``(frames, 1)``.
    """
    if status:
        print("Audio status:", status)

    # np.frombuffer only wraps the memory behind ``indata``. That buffer belongs
    # to the audio backend and is only valid while the callback is running, so
    # take an owned copy before handing the samples to another thread.
    arr = np.frombuffer(indata, dtype=np.int16).reshape(frames, 1).copy()
    audio_q.put(arr)

def audio_worker():
    """Background consumer: turn queued microphone blocks into recognised words.

    Blocks are read from ``audio_q`` until a ``None`` sentinel arrives. Each
    block is scaled from int16 to float32 and appended to a rolling buffer.
    Once at least ``SAMPLERATE`` samples are buffered, the first ``SAMPLERATE``
    samples are transcribed with the Whisper ``model`` and only the trailing
    ``OVERLAP`` samples of the buffer are kept for the next round. Every
    non-empty sanitized word first discards whatever is still waiting in
    ``word_q`` and is then enqueued, so the queue holds the newest word only.
    """

    def drain_word_queue():
        """Discard every word still waiting in ``word_q``."""
        try:
            while True:
                word_q.get_nowait()
        except queue.Empty:
            pass

    pending = np.empty((0,), dtype=np.float32)
    while True:
        block = audio_q.get()
        if block is None:
            # sentinel: stop the worker
            return

        # first channel, int16 → float32 in [-1, 1]
        samples = block[:, 0].astype(np.float32) / 32768.0
        pending = np.concatenate([pending, samples])
        if len(pending) < SAMPLERATE:
            continue

        # oldest SAMPLERATE samples go to Whisper; the trailing OVERLAP
        # samples of the buffer are carried over to the next round
        window, pending = pending[:SAMPLERATE], pending[-OVERLAP:]

        print(f"[DEBUG] Transcribing {window.shape[0]} samples…")
        transcript = model.transcribe(window, word_timestamps=False)["text"].strip()
        print(f"[DEBUG] Whisper returned: “{transcript}”")

        # skip tokens that sanitize to nothing; each remaining word replaces
        # anything still queued
        for word in filter(None, map(sanitize_word, transcript.split())):
            drain_word_queue()
            word_q.put(word)
            print(f"[DEBUG] enqueue word: {word}")

# Open the microphone stream (mono int16, delivered to audio_callback in
# BLOCK_SIZE-frame blocks) and prepare the transcription worker thread.
stream = sd.RawInputStream(
    callback=audio_callback,
    channels=1,
    dtype="int16",
    samplerate=SAMPLERATE,
    blocksize=BLOCK_SIZE,
)
t_audio = threading.Thread(target=audio_worker, daemon=True)

# capture first, then the consumer
stream.start()
t_audio.start()

# —————————————————————————————————————————————
# Webcam loop with overlay
# —————————————————————————————————————————————
# Shows the live webcam feed and, for the most recently recognised word,
# cycles through its landmark sequence as a stickman overlay (bottom centre)
# with the word as a caption. Press "q" in the window to quit.
STICKMAN_SIZE = 300  # side length in pixels of the square stickman overlay

current_word   = None
display_text   = ""
sequence       = []
seq_idx        = 0

cap = cv2.VideoCapture(0)
try:
    if not cap.isOpened():
        raise RuntimeError("Cannot open webcam")

    # Some drivers report 0 FPS; fall back to 30. Clamp the delay to >= 1 ms
    # because cv2.waitKey(0) would block until a key is pressed.
    FPS   = cap.get(cv2.CAP_PROP_FPS) or 30.0
    DELAY = max(1, int(1000 / FPS))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # pull fresh word if available
        try:
            w = word_q.get_nowait()
        except queue.Empty:
            w = None

        if w:
            current_word = w
            if w in gt_land:
                sequence = gt_land[w]
                display_text = w
            else:
                # unknown word: show the "no" sequence instead
                sequence = gt_land["no"]
                display_text = f"{w} not found"
            seq_idx = 0

        H, W = frame.shape[:2]

        # draw stickman if we have a sequence; skip the overlay on frames too
        # small to hold it (the slices below would not match the overlay shape)
        if sequence and H >= STICKMAN_SIZE and W >= STICKMAN_SIZE:
            # loop over the sequence for as long as the word stays current
            flm = sequence[seq_idx % len(sequence)]
            seq_idx += 1

            # assumes the helper returns a (height, width, 4) image whose 4th
            # channel is a 0-255 alpha mask — TODO confirm against helper_functions
            ov = create_bold_stickman(flm, width=STICKMAN_SIZE, height=STICKMAN_SIZE)

            x_off = (W - STICKMAN_SIZE) // 2
            y_off = H - STICKMAN_SIZE

            # alpha-blend the overlay into the frame region (roi is a view,
            # so assigning into it updates the frame in place)
            roi = frame[y_off:y_off + STICKMAN_SIZE, x_off:x_off + STICKMAN_SIZE]
            alpha = ov[:, :, 3:4] / 255.0
            roi[:] = (alpha * ov[:, :, :3] + (1 - alpha) * roi).astype(np.uint8)

            cv2.putText(
                frame, display_text,
                org=(x_off+5, y_off-10),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=1.0, color=(255,255,255),
                thickness=2, lineType=cv2.LINE_AA
            )

        cv2.imshow("Live ASL Translation", frame)
        if cv2.waitKey(DELAY) & 0xFF == ord("q"):
            break
finally:
    # —————————————————————————————————————————————
    # Cleanup (runs on normal exit, on errors and on kernel interrupt)
    # —————————————————————————————————————————————
    try:
        stream.stop()
        stream.close()   # release the audio device, not just pause it
    finally:
        audio_q.put(None)   # sentinel that ends audio_worker
        t_audio.join()
        cap.release()
        cv2.destroyAllWindows()

[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “Hello.”
[DEBUG] enqueue word: hello
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “work.”
[DEBUG] enqueue word: work
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 16000 samples…
[DEBUG] Whisper returned: “”
[DEBUG] Transcribing 160