In [5]:
!pip install -q torchaudio datasets evaluate jiwer transformers speechbrain

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m864.1/864.1 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.6/118.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.1/739.1 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import torch
from datasets import load_dataset
from evaluate import load
import numpy as np

In [None]:
# Evaluation subset: the first 100 examples of the LibriSpeech "clean" test split.
librispeech = load_dataset(
    "openslr/librispeech_asr",
    "clean",
    split="test",
).select(range(100))


Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/338M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.39G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.0G [00:00<?, ?B/s]

In [None]:
# Whisper ASR pipeline; loading snippet adapted from the Hugging Face model page.
from transformers import pipeline
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# SpeechBrain encoder-decoder ASR; loading snippet adapted from the Hugging Face model page.
# SpeechBrain 1.0 moved the inference interfaces to `speechbrain.inference`;
# fall back to the legacy `speechbrain.pretrained` path on older releases.
try:
    from speechbrain.inference.ASR import EncoderDecoderASR
except ImportError:
    from speechbrain.pretrained import EncoderDecoderASR
transformer_asr = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-transformer-transformerlm-librispeech",
    savedir="pretrained_models"
)


def quantize_model(model, precision):
    """Return ``model`` converted to the requested numeric precision.

    Args:
        model: A ``torch.nn.Module``-like object; it must support ``.half()``
            and be accepted by ``torch.quantization.quantize_dynamic``.
        precision: One of ``'fp32'``, ``'fp16'`` or ``'int8'``.

    Returns:
        For ``'fp32'`` the very same ``model`` object, unchanged (it is
        assumed to already be in full precision). For ``'fp16'`` a
        half-precision copy. For ``'int8'`` the result of dynamic
        quantization of the ``torch.nn.Linear`` layers to ``torch.qint8``.
        The object passed in is never modified.

    Raises:
        ValueError: If ``precision`` is not one of the supported values.
    """
    import copy  # local import keeps this cell independent of the import cell

    if precision == 'fp32':
        return model
    elif precision == 'fp16':
        # nn.Module.half() casts parameters in place and returns self, so
        # calling it directly would also convert the caller's model and leak
        # into every later evaluation. Cast a copy instead.
        return copy.deepcopy(model).half()
    elif precision == 'int8':
        # quantize_dynamic is left at its default (non in-place) mode, so the
        # input model is not altered here either.
        return torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
    else:
        raise ValueError("Wrong precision")


def eval_model(model, precision, is_whisper=True):
    """Transcribe the ``librispeech`` subset at ``precision`` and score it.

    Args:
        model: When ``is_whisper`` is true, the transformers ASR pipeline
            (its ``.model`` attribute is the network that gets converted).
            Otherwise the SpeechBrain ASR object itself.
        precision: Precision name forwarded to ``quantize_model``.
        is_whisper: Selects the Whisper pipeline code path (default) or the
            SpeechBrain ``transcribe_file`` code path.

    Returns:
        dict with keys ``"WER"`` and ``"CER"``, each the metric value
        multiplied by 100.
    """
    wer = load("wer")
    cer = load("cer")

    base_model = model.model if is_whisper else model
    quant_model = quantize_model(base_model, precision)

    if is_whisper:
        # Make the pipeline run the converted network; the original is put
        # back in ``finally`` so the next precision starts from the same model.
        # NOTE(review): fp16 inference may also need the pipeline inputs cast
        # to half precision, depending on the transformers version — confirm.
        model.model = quant_model

    preds, refs = [], []
    try:
        for example in librispeech:
            if is_whisper:
                # The pipeline returns a dict; the transcript is under "text".
                output = model(example["audio"]["array"], chunk_length_s=30)
                text = output["text"]
            else:
                # Use the converted model, not the module-level original.
                # NOTE(review): assumes example["audio"]["path"] is a readable
                # local file — verify for this dataset.
                text = quant_model.transcribe_file(example["audio"]["path"])

            preds.append(text.lower())
            refs.append(example["text"].lower())
    finally:
        if is_whisper:
            model.model = base_model

    return {
        "WER": 100 * wer.compute(predictions=preds, references=refs),
        "CER": 100 * cer.compute(predictions=preds, references=refs)
    }

In [None]:
# Sweep every (system, precision) pair and collect the metrics keyed by "<system>_<precision>".
asr_systems = [("Whisper", whisper), ("Transformer", transformer_asr)]
precisions = ["fp32", "fp16", "int8"]

results = {}
for model_name, model in asr_systems:
    uses_whisper = model_name == "Whisper"
    for precision in precisions:
        print(f"Running {model_name} {precision}...")
        run_key = f"{model_name}_{precision}"
        results[run_key] = eval_model(model, precision, is_whisper=uses_whisper)

# Report both error rates for each run, in the order the runs were executed.
print("\nResults:")
for model_prec in results:
    metrics = results[model_prec]
    print(f"{model_prec}:")
    print(f"  WER: {metrics['WER']:.2f}%")
    print(f"  CER: {metrics['CER']:.2f}%")