In [4]:
import os
import time
import whisperx
import librosa
import jiwer
import sacrebleu

LIBRISPEECH_ROOT = "LibriSpeech/test-other"

model = whisperx.load_model("small", device="cpu", compute_type="float32")

def load_transcriptions(root):
    refs = {}
    for dirpath, _, filenames in os.walk(root):
        for fname in filenames:
            if fname.endswith(".trans.txt"):
                with open(os.path.join(dirpath, fname), "r", encoding="utf-8") as f:
                    for line in f:
                        utt_id, text = line.strip().split(" ", 1)
                        refs[utt_id] = text.lower()
    return refs

references = load_transcriptions(LIBRISPEECH_ROOT)

all_hypotheses = []
all_references = []

total_audio_time = 0.0
total_processing_time = 0.0

for dirpath, _, filenames in os.walk(LIBRISPEECH_ROOT):
    for fname in filenames:
        if fname.endswith(".flac"):
            utt_id = os.path.splitext(fname)[0]
            if utt_id not in references:
                continue

            audio_path = os.path.join(dirpath, fname)

            audio, sr = librosa.load(audio_path, sr=16000)
            duration = len(audio) / sr

            start = time.time()
            result = model.transcribe(audio_path)
            end = time.time()

            segments = result.get("segments", [])
            hypothesis = " ".join(seg["text"] for seg in segments).strip().lower()

            reference = references[utt_id]

            all_hypotheses.append(hypothesis)
            all_references.append(reference)

            total_audio_time += duration
            total_processing_time += (end - start)

wer = jiwer.wer(all_references, all_hypotheses)
cer = jiwer.cer(all_references, all_hypotheses)
chrf = sacrebleu.corpus_chrf(all_hypotheses, [all_references]).score
rtf = total_processing_time / total_audio_time

print("\n--- Evaluation Results (LibriSpeech test-other) ---")
print(f"WER  : {wer:.3f} ({wer*100:.1f}%)")
print(f"CER  : {cer:.3f} ({cer*100:.1f}%)")
print(f"chrF : {chrf:.2f}")
print(f"RTF  : {rtf:.3f}")


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\Lenovo X1 Carbon\AppData\Local\Programs\Python\Python311\Lib\site-packages\whisperx\assets\pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.


  torchaudio.list_audio_backends()


Detected language: en (0.98) in first 30s of audio...
Detected language: en (1.00) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (1.00) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (1.00) in first 30s of audio...
Detected language: en (1.00) in first 30s of audio...
Detected language: en (1.00) in first 30s of audio...
Detected language: en (0.98) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (1.00) in first 30s of audio...
Detected language: en (0.98) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (0.99) in first 30s of audio...
Detected language: en (0.99)