
## 创建新的py环境
Create Environment
`conda create -n myvoiceenv python=3.10`

Activate Environment
`conda activate myvoiceenv`

Install Dependencies
`pip install vosk`

`pip install sounddevice`

`pip install pyautogui`

For the pyaudio library

`brew install portaudio`

`pip install pyaudio`

如果没有 brew，运行 ⬇️

`/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"`

---
模型下载 https://alphacephei.com/vosk/models

* 本项目使用的是 vosk-model-small-en-us-0.15，下载后放在ipynb文件同级目录下

In [2]:
# Print the audio devices reported by sounddevice. The number at the start of
# each row is the index to pass as `device=` when opening the input stream.
import sounddevice as sd
print(sd.query_devices())

  0 MacBook Air麦克风, Core Audio (1 in, 0 out)
  1 MacBook Air扬声器, Core Audio (0 in, 2 out)
  2 makou Microphone, Core Audio (1 in, 0 out)
> 3 🎸, Core Audio (1 in, 0 out)
< 4 🎸, Core Audio (0 in, 2 out)
  5 Microsoft Teams Audio, Core Audio (1 in, 1 out)


In [2]:
from vosk import Model, KaldiRecognizer
import sounddevice as sd
import queue
import json
import pyautogui
import threading
import time

# === Configuration ===
MODEL_PATH = "vosk-model-small-en-us-0.15"  # model directory handed to vosk.Model
SAMPLE_RATE = 16000                         # used for both the recognizer and the input stream
BLOCK_SIZE = 8000                           # blocksize of the audio input stream
COOLDOWN = 0.05                             # seconds between two presses of the same instant key

# === Keyword tables ===
# Instant keys: each entry maps a key name to the spoken words that tap it once.
# The extra words are alternative spellings the recognizer may produce.
instant_groups = dict(
    w=["up", "op"],
    a=["left", "暂停一下", "zantie", "zanting"],
    s=["down", "done", "don't", "don", "damn"],
    d=["right", "tingzhi", "停下"],
    p=["light", "like", "night"],
)

# Hold keys: the key stays down for as long as the partial transcript matches.
hold_groups = dict(
    space=["a", "ah", "eh", "hey"],
)

# === Runtime state ===
q_audio = queue.Queue()                               # raw audio blocks: callback -> recognizer thread
last_trigger_time = dict.fromkeys(instant_groups, 0)  # last press timestamp per instant key
hold_key_status = dict.fromkeys(hold_groups, False)   # True while a hold key is currently down

# === Audio input callback ===
def audio_callback(indata, frames, time_info, status):
    """Stream callback: copy the incoming audio block onto ``q_audio``.

    Args:
        indata: Buffer holding the captured audio block; it is copied to
            ``bytes`` before being queued.
        frames: Unused.
        time_info: Unused.
        status: Stream status flags; printed when truthy.
    """
    if status:
        print("⚠️", status)
    # Non-blocking enqueue so the callback never waits on the consumer.
    q_audio.put_nowait(bytes(indata))

# === Initialise the model ===
# Load the Vosk model from MODEL_PATH once, at module level, and build a single
# recognizer configured for SAMPLE_RATE audio. recognizer_loop() feeds audio
# into this shared `recognizer` instance.
model = Model(MODEL_PATH)
recognizer = KaldiRecognizer(model, SAMPLE_RATE)

# === Keyword matching helper ===
def fuzzy_match(text, keywords):
    """Return True if any keyword occurs in *text* as a whole word or phrase.

    Matching is case-insensitive and works on whitespace-delimited tokens, so
    the keyword ``"a"`` matches ``"a low"`` but not ``"atlanta"``, and ``"up"``
    does not match ``"cup"``. A multi-word keyword must appear as a contiguous
    run of words. Empty or whitespace-only keywords never match.

    Args:
        text: Recognized transcript; may be empty.
        keywords: Iterable of keyword strings to look for.

    Returns:
        bool: True when at least one keyword is present, otherwise False.
    """
    # Normalize to single spaces and pad both ends so that every word,
    # including the first and last one, is surrounded by spaces.
    haystack = f" {' '.join(text.lower().split())} "
    for keyword in keywords:
        needle = " ".join(keyword.lower().split())
        if needle and f" {needle} " in haystack:
            return True
    return False

# === Real-time speech processing thread ===
def _press_instant_keys(text):
    """Tap each instant key whose keywords match *text*, subject to COOLDOWN."""
    for key, keywords in instant_groups.items():
        # Timestamp is taken per key, right before that key's cooldown check.
        now = time.time()
        if fuzzy_match(text, keywords) and (now - last_trigger_time[key] > COOLDOWN):
            pyautogui.press(key)
            last_trigger_time[key] = now
            print(f"🔘 {text} -> press [{key}]")


def _update_hold_keys(partial_text):
    """Press or release each hold key so it tracks whether *partial_text* matches."""
    for key, keywords in hold_groups.items():
        should_hold = fuzzy_match(partial_text, keywords)
        if should_hold == hold_key_status[key]:
            continue  # already in the desired state
        if should_hold:
            pyautogui.keyDown(key)
            hold_key_status[key] = True
            print(f"⬇️ Hold [{key}]")
        else:
            pyautogui.keyUp(key)
            hold_key_status[key] = False
            print(f"⬆️ Release [{key}]")


def recognizer_loop():
    """Consume audio blocks from ``q_audio`` forever and turn speech into key events.

    Each block is fed to the shared ``recognizer``:

    * When ``AcceptWaveform`` reports a final result, the transcript is
      matched against ``instant_groups`` and matching keys are tapped.
    * Otherwise the partial transcript is matched against ``hold_groups``;
      a key is held down while its keywords match and released once they
      no longer do (an empty partial therefore releases every hold key).

    The function never returns; it is meant to run in a daemon thread.
    """
    print("🎧 Listening...")

    while True:
        # Block until the audio callback delivers data instead of polling
        # the queue in a sleep loop.
        data = q_audio.get()

        if recognizer.AcceptWaveform(data):
            text = json.loads(recognizer.Result()).get("text", "").lower()
            if text:
                # Debug output: full recognition result (can be commented out after debugging).
                print(f"[FULL TEXT] {text}")
                _press_instant_keys(text)
        else:
            partial_text = json.loads(recognizer.PartialResult()).get("partial", "").lower()
            if partial_text:
                # Debug output: partial recognition result (can be commented out after debugging).
                print(f"[PARTIAL] {partial_text}")
            # Evaluated even for an empty partial so that held keys get released.
            _update_hold_keys(partial_text)


# === Start the audio stream and the recognition thread ===
def main(device=0):
    """Open the microphone stream, start the recognizer thread and idle.

    Args:
        device: Input device passed to ``sd.RawInputStream``. Defaults to 0,
            which is only an example; use the index of your microphone as
            listed by ``sd.query_devices()``.

    The call blocks until it is interrupted (for example by
    ``KeyboardInterrupt``), and the interrupting exception propagates to the
    caller. On the way out, any key still held down via ``hold_groups`` is
    released so it does not stay pressed after the program stops.
    """
    stream = sd.RawInputStream(
        samplerate=SAMPLE_RATE,
        blocksize=BLOCK_SIZE,
        dtype='int16',
        channels=1,
        callback=audio_callback,
        device=device,
    )

    try:
        with stream:
            threading.Thread(target=recognizer_loop, daemon=True).start()
            while True:
                time.sleep(1)  # the main thread only needs to stay alive
    finally:
        # Best-effort cleanup: release every key the recognizer left held.
        for key, held in hold_key_status.items():
            if held:
                pyautogui.keyUp(key)
                hold_key_status[key] = False


if __name__ == "__main__":
    main()


LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from vosk-model-small-en-us-0.15/graph/HCLr.fst vosk-model-small-en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo vosk-model-small-en-us-0.15/graph/phones/word_boundary.int
Exception ignored in: <function KaldiRecognizer.__del__ at 0x1073c91b0>
Traceback (most recent call 

🎧 Listening...
[PARTIAL] the
[PARTIAL] the for evil what's it to trigger can comfortably on them vertical
[PARTIAL] the for evil what's it to trigger can comfortably on them vertical atlanta ga ga
[PARTIAL] the for evil what's it to trigger can comfortably on them vertical atlanta ga ga the double life at
[PARTIAL] the for evil what's it to trigger can comfortably on them vertical atlanta ga ga the double life at a low the nicer
[PARTIAL] the for evil what's it to trigger can comfortably on them vertical atlanta ga ga the double life at a low the nicer than do with
[PARTIAL] the for evil what's it to trigger can comfortably on them vertical atlanta ga ga the double life at a low the nicer than do with how
[PARTIAL] the for evil what's it to trigger can comfortably on them vertical atlanta ga ga the double life at a low the nicer than do with how odd it
[PARTIAL] the for evil what's it to trigger can comfortably on them vertical atlanta ga ga the double life at a low the nicer than do w

KeyboardInterrupt: 