# Summary

Testing out Vosk for speech recognition on a Raspberry Pi 4B. The notebook records a short audio clip from a microphone and transcribes it with the Japanese Vosk model (`vosk-model-ja-0.22`).

In [None]:
# Download and unpack the Japanese Vosk model into the current directory.
# -nc skips the download when the archive is already present; -n never overwrites
# already-extracted files (so a re-run cannot stall on unzip's overwrite prompt)
# and -q keeps the file listing out of the cell output.
# NOTE(review): the model is later loaded from an absolute path — confirm the
# archive is extracted (or moved) there.
! wget -nc https://alphacephei.com/vosk/models/vosk-model-ja-0.22.zip
! unzip -n -q vosk-model-ja-0.22.zip

In [16]:
import json
import os
import tempfile
import wave

import numpy as np
import sounddevice as sd
from vosk import Model, KaldiRecognizer

In [17]:
# List every audio device that can capture (at least one input channel).
print("Available input devices:")
devices = sd.query_devices()
# The loop variable is deliberately NOT called `device`: that name holds the
# microphone index chosen in the configuration cell, and a leaked loop variable
# would overwrite it with a dict if this cell were re-run afterwards.
for device_info in devices:
    if device_info["max_input_channels"] > 0:
        print(device_info)

Available input devices:
{'name': 'USB PnP Sound Device: Audio (hw:2,0)', 'index': 1, 'hostapi': 0, 'max_input_channels': 1, 'max_output_channels': 0, 'default_low_input_latency': 0.008684807256235827, 'default_low_output_latency': -1.0, 'default_high_input_latency': 0.034829931972789115, 'default_high_output_latency': -1.0, 'default_samplerate': 44100.0}
{'name': 'pulse', 'index': 6, 'hostapi': 0, 'max_input_channels': 32, 'max_output_channels': 32, 'default_low_input_latency': 0.008684807256235827, 'default_low_output_latency': 0.008684807256235827, 'default_high_input_latency': 0.034807256235827665, 'default_high_output_latency': 0.034807256235827665, 'default_samplerate': 44100.0}
{'name': 'default', 'index': 10, 'hostapi': 0, 'max_input_channels': 32, 'max_output_channels': 32, 'default_low_input_latency': 0.008684807256235827, 'default_low_output_latency': 0.008684807256235827, 'default_high_input_latency': 0.034807256235827665, 'default_high_output_latency': 0.034807256235827665

In [18]:
# Capture rate in Hz. 44100 is the default rate reported for the USB microphone
# in the device listing above; the same value is handed to KaldiRecognizer so the
# recognizer is told the real rate of the audio it receives.
sample_rate = 44100
frame_duration_ms = 30
# Number of samples in one frame of `frame_duration_ms` milliseconds (1323 at 44.1 kHz).
frame_size = sample_rate * frame_duration_ms // 1000
# sounddevice index of the USB microphone in the listing above.
# NOTE(review): device indices may differ between machines/sessions — confirm
# against the listing before recording.
device = 1

# Model directory. Set the VOSK_MODEL_DIR environment variable to point at a
# different location; the default is the original development path.
model = Model(
    os.environ.get(
        "VOSK_MODEL_DIR",
        "/home/hankehly/Projects/PALM-9000/models/vosk-model-ja-0.22",
    )
)

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 1 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 2 orphan components.
LOG (VoskAPI:Collapse():nnet-utils.cc:1488) Added 1 components, removed 2
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /home/hankehly/Projects/PALM-9000/models/vosk-model-ja-0.22/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from /home/hankehly/Projects/PALM-9000/models/vosk-model-ja-0.22/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:297) Loading words from /home/hankehly/Projects/PALM-9000/models/vosk-model-ja-0.22/gr

In [None]:
def record_audio(
    duration_sec: float = 4.0, sample_rate: int = 44100, device: int | None = None
) -> bytes:
    """
    Record audio and return raw bytes.
    """
    print(f"🎙️ Recording for {duration_sec} seconds...")

    audio = sd.rec(
        int(duration_sec * sample_rate),
        samplerate=sample_rate,
        channels=1,
        dtype="int16",
        device=device,
    )
    sd.wait()

    print("✅ Done recording.")
    return audio.tobytes()


def transcribe_audio_vosk(audio_bytes: bytes) -> str:
    """
    Transcribe audio bytes using Vosk.
    """
    recognizer = KaldiRecognizer(model, sample_rate)

    if recognizer.AcceptWaveform(audio_bytes):
        result = json.loads(recognizer.Result())
        print("Result:", result)
        text = result.get("text", "")
    else:
        result = json.loads(recognizer.FinalResult())
        print("FinalResult:", result)
        text = result.get("text", "")

    return text


import scipy.io.wavfile


# Set to True to also dump the captured clip to out.wav for listening/debugging.
SAVE_DEBUG_WAV = False

audio_bytes = record_audio(duration_sec=4.0, sample_rate=sample_rate, device=device)

if SAVE_DEBUG_WAV:
    # record_audio captures one channel of int16 samples, so the raw bytes can
    # be written unchanged as PCM frames.
    with wave.open("out.wav", "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # int16 -> 2 bytes per sample
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(audio_bytes)

transcription = transcribe_audio_vosk(audio_bytes=audio_bytes)

transcription

🎙️ Recording for 4.0 seconds...
✅ Done recording.
FinalResult: {'text': 'こんにちは 今日 は 日曜 日 です'}


'こんにちは 今日 は 日曜 日 です'