# Summary

This code demonstrates how to use the Picovoice Leopard speech-to-text engine to transcribe audio. It runs much faster than Whisper and is more accurate than Vosk.

In [None]:
import pvleopard

import numpy as np
import sounddevice as sd
from pydantic import Field
from pydantic import BaseModel, SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Configuration for the Leopard demo, loaded via pydantic-settings.

    The config below points at a local ``.env`` file and ignores entries
    that do not correspond to a declared field.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_nested_delimiter="__",
        extra="ignore",
    )

    # Picovoice access key; kept as SecretStr and unwrapped with
    # get_secret_value() only where the engine is created.
    picovoice_access_key: SecretStr
    # Path to the Leopard model file passed to pvleopard.create().
    pvleopard_model_path: str


# Built once at cell execution; neither field declares a default value.
settings = Settings()

In [None]:
# Build the Leopard engine from the configured model file and access key.
# The secret is unwrapped inline so it is never bound to a separate variable.
leopard = pvleopard.create(
    model_path=settings.pvleopard_model_path,
    access_key=settings.picovoice_access_key.get_secret_value(),
)

In [None]:
def record_audio(
    duration_sec: float = 4.0, sample_rate: int = 16000, device: int | None = None
) -> list[int]:
    """
    Record mono 16-bit audio and return it as a flat list of PCM samples.

    The call blocks until the recording has finished.

    Args:
        duration_sec: Length of the recording in seconds.
        sample_rate: Capture rate in Hz. NOTE(review): presumably this must
            match the rate the transcription engine expects - confirm
            against ``leopard.sample_rate``.
        device: Input device forwarded unchanged to ``sounddevice.rec``.

    Returns:
        The recorded samples as plain Python ints (one per frame), i.e. the
        single-channel ``int16`` buffer flattened into a list. This is a
        list, not a ``bytes`` object.
    """
    print(f"🎙️ Recording for {duration_sec} seconds...")

    # Number of frames to capture; int() truncates any fractional frame.
    frame_count = int(duration_sec * sample_rate)
    recording = sd.rec(
        frame_count,
        samplerate=sample_rate,
        channels=1,
        dtype="int16",
        device=device,
    )
    # sd.rec() is followed by sd.wait() so the buffer is complete before use.
    sd.wait()
    print("✅ Done recording.")

    # Collapse the (frames, 1) buffer to one dimension and convert to a list.
    return recording.flatten().tolist()


# Record one clip with the default settings and hand the samples to Leopard.
audio = record_audio()
transcript, words = leopard.process(audio)

# Print the full transcript first, then one line per recognised word.
print(transcript)

for word in words:
    details = (word.word, word.start_sec, word.end_sec, word.confidence)
    print('{word="%s" start_sec=%.2f end_sec=%.2f confidence=%.2f}' % details)

🎙️ Recording for 4.0 seconds...
✅ Done recording.
今日は火曜日ではありません
{word="今日" start_sec=0.80 end_sec=0.83 confidence=0.90}
{word="は" start_sec=1.02 end_sec=1.09 confidence=0.99}
{word="火曜" start_sec=1.18 end_sec=1.41 confidence=0.86}
{word="日" start_sec=1.50 end_sec=1.57 confidence=0.96}
{word="で" start_sec=1.63 end_sec=1.66 confidence=0.96}
{word="は" start_sec=1.76 end_sec=1.79 confidence=0.98}
{word="あり" start_sec=1.89 end_sec=1.98 confidence=0.98}
{word="ませ" start_sec=2.05 end_sec=2.18 confidence=0.99}
{word="ん" start_sec=2.24 end_sec=2.30 confidence=0.77}
