In [2]:
# %% [markdown]
# ## Dependencies
#
# Install the following packages:
# 
# ```bash
# pip install datasets transformers evaluate torchaudio
# ```
#
# This notebook uses:
# - datasets (for streaming datasets)
# - transformers (for the ASR pipeline)
# - evaluate (to compute WER)
# - torchaudio (for audio processing if needed)

# %%
import torch
from transformers import pipeline
import evaluate
import datasets

# %%
# Initialize the ASR pipeline.
asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    chunk_length_s=30  # adjust chunk length as needed
)

# Initialize the WER metric.
wer_metric = evaluate.load("wer")

# %%
# Define the datasets with configurations.
datasets_list = {
    "LibriSpeech Clean": {"dataset": "librispeech_asr", "config": "clean", "split": "test"},
    "LibriSpeech Other": {"dataset": "librispeech_asr", "config": "other", "split": "test"},
    "Common Voice": {"dataset": "mozilla-foundation/common_voice_11_0", "config": "en", "split": "test"},
    # Skipping datasets not available on the HF Hub:
    # "VoxPopuli": {"dataset": "voxpopuli", "config": "en", "split": "test"},
    # "TEDLIUM": {"dataset": "tedlium", "config": "release3", "split": "test"},
    # "GigaSpeech": {"dataset": "GigaSpeech", "split": "test"},
    # "SPGISpeech": {"dataset": "spgispeech", "split": "test"},
    # "Earnings-22": {"dataset": "Earnings-22", "split": "test"},
    "AMI": {"dataset": "ami", "config": "headset-single", "split": "test"}
}

results = {}

# %%
# Process each dataset in streaming mode (using 5 examples for quick testing).
for ds_name, ds_info in datasets_list.items():
    print(f"Processing {ds_name}...")
    try:
        ds = datasets.load_dataset(
            ds_info["dataset"],
            ds_info.get("config", None),
            split=ds_info["split"],
            streaming=True
        )
    except Exception as e:
        print(f"Could not load {ds_name}: {e}")
        continue

    ds_small = ds.take(5)
    
    predictions = []
    references = []
    
    for example in ds_small:
        try:
            audio = example["audio"]
            # Instead of unpacking sampling_rate, pass the full audio dict
            output = asr(audio)
            pred = output["text"]
            predictions.append(pred)
            references.append(example["text"])
        except Exception as inner_e:
            print(f"Error processing an example in {ds_name}: {inner_e}")
    
    if predictions and references:
        wer_score = wer_metric.compute(predictions=predictions, references=references)
        results[ds_name] = wer_score
        print(f"{ds_name} WER: {wer_score:.2%}")
    else:
        print(f"No valid examples processed for {ds_name}.")

# %%
# Print summary results
print("\nEvaluation Results:")
for ds_name, score in results.items():
    print(f"{ds_name}: {score:.2%}")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Processing LibriSpeech Clean...
LibriSpeech Clean WER: 2.55%
Processing LibriSpeech Other...
LibriSpeech Other WER: 3.39%
Processing Common Voice...


Reading metadata...: 16354it [00:02, 7191.01it/s]


Error processing an example in Common Voice: 'text'
Error processing an example in Common Voice: 'text'
Error processing an example in Common Voice: 'text'
Error processing an example in Common Voice: 'text'
Error processing an example in Common Voice: 'text'
No valid examples processed for Common Voice.
Processing AMI...


            This version of the AMI dataset is deprecated.
            You can download the latest one (based on the official Kaldi recipes) with
            >>> load_dataset("edinburghcstr/ami", "ihm")  # for the "independent headset microphone" part
            or
            >>> load_dataset("edinburghcstr/ami", "sdm")  # for the "single distant microphone" part
            
AMI corpus cannot be downloaded using multi-processing. Setting number of downloaded processes `num_proc` to 1. 


Could not load AMI: https://groups.inf.ed.ac.uk/ami/AMICorpusMirror//amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav

Evaluation Results:
LibriSpeech Clean: 2.55%
LibriSpeech Other: 3.39%
