In [5]:
import queue
import sounddevice as sd
import numpy as np
import json
import pyautogui
import time
from vosk import Model, KaldiRecognizer

# === Configuration ===
SAMPLE_RATE = 16000     # Sample rate given to both the input stream and the recognizer
BLOCK_SIZE = 8000       # Frames delivered to the audio callback per block
VOLUME_THRESHOLD = 300  # Block loudness above which SPACE is held (tune per microphone)
COOLDOWN = 0.05         # Minimum seconds between two presses of the same key

# === Model and Recognizer ===
model = Model("vosk-model-small-en-us-0.15")
recognizer = KaldiRecognizer(model, SAMPLE_RATE)
# Hands raw audio blocks from the audio callback to the recognition loop.
audio_queue = queue.Queue()

# === Keyword Groups ===
# Maps each key to press onto the spoken words that trigger it.
instant_groups = {
    "w": ["up", "go", "forward"],
    "a": ["left", "laft"],
    "s": ["down", "back"],
    "d": ["right", "write"],
    "p": ["pause", "stop"],
}

# Timestamp of the most recent press for every key in instant_groups.
last_trigger_time = dict.fromkeys(instant_groups, 0)
# True while the volume trigger is holding SPACE down.
is_holding_space = False

# === Audio Callback: Feeds both recognizer & volume detection ===
def audio_callback(indata, frames, time_info, status):
    """Handle one captured audio block.

    The raw block is queued for the recognition loop, and its loudness is
    used to hold or release the SPACE key.

    Args:
        indata: Buffer with the captured block, read here as int16 samples.
        frames: Number of frames in the block (unused).
        time_info: Timing information supplied by the stream (unused).
        status: Stream status flags; printed when truthy.
    """
    global is_holding_space

    if status:
        print(status)

    # Copy the block so the recognizer can consume it later.
    audio_queue.put(bytes(indata))

    # Loudness is the Euclidean norm of the samples in this block, so for a
    # given signal level it grows with the number of samples per block.
    samples = np.frombuffer(indata, dtype=np.int16)
    is_loud = np.linalg.norm(samples) > VOLUME_THRESHOLD

    # Only act on a state change: quiet -> loud presses, loud -> quiet releases.
    if is_loud and not is_holding_space:
        pyautogui.keyDown('space')
        is_holding_space = True
        print("⬇️ SPACE DOWN by volume trigger")
    elif not is_loud and is_holding_space:
        pyautogui.keyUp('space')
        is_holding_space = False
        print("⬆️ SPACE UP by volume drop")

# === Fuzzy Matching Helper Function ===
def fuzzy_match(text, keywords):
    """Return True if any keyword occurs in ``text`` as a whole word or phrase.

    Matching is done on whitespace-delimited word boundaries, so "up" matches
    "go up" but not "cup". Multi-word keywords match as a contiguous phrase.
    Runs of whitespace are collapsed before comparing, and blank keywords are
    ignored. Comparison is case-sensitive; callers are expected to pass text
    in the same case as the keywords.

    Args:
        text: Recognized utterance to search.
        keywords: Iterable of trigger words or phrases.

    Returns:
        bool: True when at least one keyword is present, otherwise False.
    """
    # Pad with spaces so a plain substring test only succeeds on word boundaries.
    normalized = f" {' '.join(text.split())} "
    return any(
        f" {' '.join(keyword.split())} " in normalized
        for keyword in keywords
        if keyword.strip()
    )

# === Main Recognition Loop ===
def recognizer_loop():
    """Consume queued audio forever and press keys for recognized keywords.

    Each audio block taken from ``audio_queue`` is fed to the recognizer.
    When the recognizer reports a finished utterance, its text is checked
    against every group in ``instant_groups``; a matching group's key is
    pressed unless that key was pressed less than ``COOLDOWN`` seconds ago.

    This function never returns; it ends only when an exception (for example
    KeyboardInterrupt) propagates out of it.
    """
    print("🎧 Listening (volume + keyword)...")
    while True:
        try:
            # Block until audio arrives instead of polling every millisecond.
            # The short timeout keeps the loop responsive to Ctrl+C.
            data = audio_queue.get(timeout=0.1)
        except queue.Empty:
            continue

        # AcceptWaveform is falsy until the recognizer has a final result.
        if not recognizer.AcceptWaveform(data):
            continue

        result = json.loads(recognizer.Result())
        text = result.get("text", "").lower()
        if not text:
            continue

        print(f"[FULL TEXT] {text}")  # Can be commented for debugging

        for key, keywords in instant_groups.items():
            # Monotonic clock: the cooldown is unaffected by wall-clock changes.
            now = time.monotonic()
            if fuzzy_match(text, keywords) and (now - last_trigger_time[key] > COOLDOWN):
                pyautogui.press(key)
                last_trigger_time[key] = now
                print(f"🔘 {text} -> press [{key}]")

# === Main Function ===
def main():
    """Open the microphone stream and run the recognition loop.

    The stream delivers mono int16 blocks of ``BLOCK_SIZE`` frames at
    ``SAMPLE_RATE`` to ``audio_callback``. When the loop ends for any reason
    (including KeyboardInterrupt), a SPACE key still held by the volume
    trigger is released so it does not stay stuck down. The exception that
    ended the loop is re-raised unchanged.
    """
    global is_holding_space

    try:
        with sd.RawInputStream(
            samplerate=SAMPLE_RATE,
            blocksize=BLOCK_SIZE,
            dtype='int16',
            channels=1,
            callback=audio_callback
        ):
            recognizer_loop()
    finally:
        # The stream is closed by now, so the callback can no longer change
        # the flag. Release the key if it was left pressed.
        if is_holding_space:
            pyautogui.keyUp('space')
            is_holding_space = False
            print("⬆️ SPACE UP on exit")

# Start the voice controller only when this module is the entry point,
# not when it is imported.
if __name__ == "__main__":
    main()

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from vosk-model-small-en-us-0.15/graph/HCLr.fst vosk-model-small-en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo vosk-model-small-en-us-0.15/graph/phones/word_boundary.int


🎧 Listening (volume + keyword)...
⬇️ SPACE DOWN by volume trigger
[FULL TEXT] ah ah
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
[FULL TEXT] ah
[FULL TEXT] junk
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger
⬆️ SPACE UP by volume drop
⬇️ SPACE DOWN by volume trigger

KeyboardInterrupt: 