### Critical Dependencies

In [None]:
# !pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --prefer-binary --pre pyarrow

In [None]:
# !add-apt-repository -y ppa:jonathonf/ffmpeg-4 && apt update && apt install -y ffmpeg
# !pip install --quiet datasets transformers evaluate huggingface_hub jiwer ipywidgets soundfile librosa

In [None]:
# from huggingface_hub import login
# login()

In [None]:
from datasets import load_dataset
# Load the test split of the "xs" configuration of GigaSpeech in streaming mode.
dataset = load_dataset("speechcolab/gigaspeech", "xs", split="test", streaming=True)
# Print the first example yielded by the stream to inspect its fields.
print(next(iter(dataset)))

In [None]:
import IPython.display as ipd

# Take one example from the stream and keep its audio entry for playback.
sample = next(iter(dataset))
audio = sample["audio"]

# Show the transcript, then build an audio player for the same example from
# its "array" and "sampling_rate" fields (autoplay is requested).
print(sample["text"])
ipd.Audio(data=audio["array"], autoplay=True, rate=audio["sampling_rate"])

In [None]:
from datasets import Audio
# Re-cast the "audio" column with a 16000 Hz sampling rate.
# NOTE(review): presumably this matches the rate the Whisper model expects — confirm.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

def get_text(sample):
    """Return the transcript stored in ``sample``.

    The candidate columns are probed in a fixed priority order ("text",
    "sentence", "normalized_text", "transcript") and the value of the first
    one present is returned.

    Raises:
        ValueError: if none of the candidate columns is present.
    """
    for column in ("text", "sentence", "normalized_text", "transcript"):
        if column in sample:
            return sample[column]
    raise ValueError(f"Sample: {sample.keys()} has no transcript.")

In [None]:
# # For CPU-only version
# !pip install torch torchvision

# # For GPU version (if you have a compatible NVIDIA GPU)
# !pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

In [None]:
import torch
from transformers import pipeline

# Build the speech-recognition pipeline around the English-only tiny Whisper
# checkpoint. Use the first CUDA GPU when one is available and otherwise fall
# back to the CPU (device=-1), so the cell also works on a CPU-only install
# instead of failing on the hard-coded GPU index.
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# whisper_asr.model.config.suppress_tokens.remove(6)
# whisper_asr.model.config.suppress_tokens.remove(12)

import evaluate
# Load the "wer" (word error rate) metric from the evaluate library.
wer_metric = evaluate.load("wer")

In [None]:
# Text normaliser shared by references and predictions. Prefer the tokenizer's
# public `normalize` method; fall back to the private `_normalize` alias only
# on older transformers releases that do not expose the public name.
whisper_norm = getattr(whisper_asr.tokenizer, "normalize", None)
if whisper_norm is None:
    whisper_norm = whisper_asr.tokenizer._normalize

def normalise(batch):
    """Attach a normalised transcript to ``batch`` under "norm_text".

    The raw transcript is looked up with ``get_text`` and passed through
    ``whisper_norm``. ``batch`` is modified in place and also returned.
    """
    raw_transcript = get_text(batch)
    batch["norm_text"] = whisper_norm(raw_transcript)
    return batch

# Apply normalise across the dataset so each example gains a "norm_text" field.
dataset = dataset.map(normalise)


In [None]:
def is_target_text_in_range(ref):
    """Tell whether the reference transcript ``ref`` should be kept.

    Returns False when ``ref``, ignoring surrounding whitespace, is empty or
    is the placeholder "ignore time segment in scoring"; True otherwise.
    """
    cleaned = ref.strip()
    if cleaned == "ignore time segment in scoring":
        return False
    return cleaned != ""
    
# Keep only the examples whose "norm_text" value passes is_target_text_in_range.
dataset = dataset.filter(is_target_text_in_range, input_columns=["norm_text"])

In [None]:
def data(dataset):
    """Yield one flat dict per example of ``dataset``.

    Each yielded dict holds every key of the example's "audio" mapping plus a
    "reference" key carrying the example's "norm_text" value.
    """
    for example in dataset:
        audio_fields = example["audio"]
        yield {**audio_fields, "reference": example["norm_text"]}

# Batch size handed to the ASR pipeline during inference.
BATCH_SIZE = 16
# Restrict the dataset to its first 128 examples.
dataset = dataset.take(128)

In [None]:
# Normalised model transcriptions and their matching reference transcripts.
predictions = []
references = []

# Run streamed inference: the pipeline consumes the data() generator with
# batch_size=BATCH_SIZE and yields one output per example.
for out in whisper_asr(data(dataset), batch_size=BATCH_SIZE):
    # Normalise the prediction with the same normaliser used for the references.
    predictions.append(whisper_norm(out["text"]))
    # NOTE(review): the "reference" field apparently comes back wrapped in a
    # sequence, hence the [0] — confirm against the pipeline's passthrough behaviour.
    references.append(out["reference"][0])

# Compute the word error rate and express it as a percentage with two decimals.
wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)

print("WER:", wer)