In [1]:
import sounddevice as sd
import numpy as np
import webrtcvad
import whisper
import collections
import struct

# === Audio capture settings ===
SAMPLE_RATE = 16000  # samples per second, used for both the input stream and the VAD
FRAME_DURATION_MS = 30  # length of one VAD frame, in milliseconds
# NOTE(review): webrtcvad is documented to accept only 10/20/30 ms frames --
# confirm against the library docs before changing FRAME_DURATION_MS.
FRAME_SIZE = SAMPLE_RATE * FRAME_DURATION_MS // 1000  # samples per frame
CHANNELS = 1
VAD_AGGRESSIVENESS = 2  # 0–3: higher = more aggressive
MAX_SILENCE_SECONDS = 1.0  # length of the pause after speech that ends a recording

# === Load Whisper ===
model = whisper.load_model("base")

# === Setup VAD ===
vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)



In [2]:
def float32_to_int16(audio):
    """Convert floating-point audio samples to 16-bit signed integers.

    Samples are limited to the range [-1.0, 1.0], scaled by 32767 and cast
    to ``np.int16`` (the cast truncates toward zero).

    Args:
        audio: Array-like of float samples, nominally within [-1.0, 1.0].

    Returns:
        ``np.ndarray`` of dtype ``np.int16`` with the same shape as ``audio``.
    """
    # Clip first: a sample even slightly above full scale would otherwise be
    # scaled past the int16 range and come out of the cast as garbage
    # instead of saturating.
    clipped = np.clip(audio, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16)

def is_speech(frame_bytes):
    """Ask the module-level VAD whether one audio frame contains speech.

    ``frame_bytes`` is forwarded unchanged to ``vad.is_speech`` together with
    ``SAMPLE_RATE``, and the detector's answer is returned as-is.
    """
    verdict = vad.is_speech(frame_bytes, SAMPLE_RATE)
    return verdict

# === Main function: record until silence ===
def record_until_silence(max_seconds=None):
    """Record microphone audio until a pause follows detected speech.

    Frames of ``FRAME_SIZE`` samples are read from a ``sounddevice`` input
    stream and classified with ``is_speech``. Frames classified as speech
    are kept; every other frame is discarded and only counted. Recording
    stops once at least one speech frame has been captured and it is
    followed by ``MAX_SILENCE_SECONDS`` worth of consecutive non-speech
    frames.

    Args:
        max_seconds: Optional upper bound on the total listening time, in
            seconds. ``None`` (the default) keeps listening until the
            speech-then-silence condition is met, however long that takes.

    Returns:
        A 1-D float32 ``np.ndarray`` holding the concatenated speech frames.
        It is empty only when ``max_seconds`` elapsed before any speech was
        detected.
    """
    print("Listening...")

    # Number of consecutive non-speech frames that ends the recording.
    silence_limit = int(MAX_SILENCE_SECONDS * 1000 / FRAME_DURATION_MS)
    # Total number of frames we are allowed to read (None = unlimited).
    frame_limit = None
    if max_seconds is not None:
        frame_limit = int(max_seconds * 1000 / FRAME_DURATION_MS)

    speech_frames = []
    silent_run = 0  # non-speech frames seen since the last speech frame
    frames_read = 0

    stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, dtype='float32', blocksize=FRAME_SIZE)

    with stream:
        while frame_limit is None or frames_read < frame_limit:
            # The second value returned by read() is the overflow flag; it is ignored here.
            audio_chunk, _ = stream.read(FRAME_SIZE)
            frames_read += 1
            audio_chunk = audio_chunk.flatten()

            # tobytes() yields the native-order int16 buffer directly, without
            # unpacking every sample into a Python argument for struct.pack.
            frame_bytes = float32_to_int16(audio_chunk).tobytes()

            if is_speech(frame_bytes):
                speech_frames.append(audio_chunk)
                silent_run = 0
                continue

            silent_run += 1
            # Silence only ends the recording after some speech was captured,
            # so leading silence never triggers the stop condition.
            if speech_frames and silent_run >= silence_limit:
                print("Silence detected, stopping...")
                break
        else:
            # Reached only when the frame limit ran out without a break.
            print("Time limit reached, stopping...")

    if not speech_frames:
        # np.concatenate rejects an empty list; return an empty recording instead.
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(speech_frames)

In [3]:
# Capture one utterance from the microphone (blocks until speech is followed by silence).
audio_data = record_until_silence()
# Pass the recorded samples to the Whisper model loaded in the setup cell.
result = model.transcribe(audio_data)
# The transcription is read from the "text" entry of the result.
print("You said:", result["text"])

Listening...
Silence detected, stopping...
You said:  This is a test, let me know when I stop.
