In [None]:
# %%capture
# %pip install --no-cache-dir torch==2.2.2 torchaudio==2.2.2 
# %pip install whisperx --no-cache-dir
# %pip install sounddevice --no-cache-dir #scipy wave #numpy==2.0

In [None]:
import os
import tempfile
import wave

import numpy as np
import sounddevice as sd
import torch
import whisperx

# Configuration
device = "cpu"  # macOS does not support CUDA
compute_type = "int8"  # quantised inference; "float32" is the other option tried here
model_size = "base"  # alternative: "large-v2"
sample_rate = 16000  # Whisper expects 16kHz audio
duration = 10  # Recording duration in seconds

# Load WhisperX model using the compute type configured above
model = whisperx.load_model(model_size, device, compute_type=compute_type)

# Reserve a path for the recording. Only the name is needed: the handle is
# closed straight away so no descriptor is leaked and the file can be reopened
# by path for writing. delete=False keeps the file on disk after closing.
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
    temp_wav_path = temp_wav.name

def record_audio(filename, duration, sample_rate):
    """Records audio from the microphone and saves it as a WAV file.

    Captures a single channel of 16-bit integer samples, waits for the
    capture to complete, and writes the samples to ``filename`` as a
    mono, 2-bytes-per-sample WAV file.

    Args:
        filename: Path of the WAV file to write (opened in ``"wb"`` mode).
        duration: Recording length in seconds; may be fractional.
        sample_rate: Sampling rate in Hz, used for both the capture and the
            WAV header.
    """
    print(f"Recording {duration} seconds of audio...")
    frame_count = int(sample_rate * duration)
    audio_data = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype=np.int16)
    sd.wait()  # Wait for recording to finish

    # The context manager closes the writer even if a header or frame write
    # raises, so a failed recording never leaves the file handle open.
    with wave.open(filename, "wb") as wavefile:
        wavefile.setnchannels(1)
        wavefile.setsampwidth(2)  # 2 bytes per sample == int16
        wavefile.setframerate(sample_rate)
        wavefile.writeframes(audio_data.tobytes())
    print("Recording saved.")


# Record audio
record_audio(temp_wav_path, duration, sample_rate)


# Transcribe using WhisperX
print("Transcribing audio...")
audio = whisperx.load_audio(temp_wav_path)
transcription = model.transcribe(audio)

# Word-level alignment uses a separate alignment model, selected by the
# language reported in the transcription result.
align_model, align_metadata = whisperx.load_align_model(
    language_code=transcription["language"], device=device
)
word_aligned = whisperx.align(
    transcription["segments"], align_model, align_metadata, audio, device
)

# Load diarization pipeline. The Hugging Face token is read from the
# HF_API_KEY environment variable so it is never stored in the notebook.
# NOTE(review): when the variable is unset, None is passed through; this
# presumably falls back to a locally cached Hugging Face login - confirm.
hf_token = os.environ.get("HF_API_KEY")
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
diarized_segments = diarize_model(audio)

# Attach speaker labels from the diarization to the aligned transcription
aligned_transcription = whisperx.assign_word_speakers(diarized_segments, word_aligned)

# Print diarized transcription
for segment in aligned_transcription["segments"]:
    speaker = segment.get("speaker", "Unknown")
    print(f"[{speaker}] {segment['text']}")

print("Transcription with diarization complete.")


  from .autonotebook import tqdm as notebook_tqdm
INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../usr/local/Caskroom/miniforge/base/envs/impacthack_whisper_env/lib/python3.10/site-packages/whisperx/assets/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.2. Bad things might happen unless you revert torch to 1.x.
Recording 10 seconds of audio...
Recording saved.
Transcribing audio...
Detected language: en (0.98) in first 30s of audio...
