In [3]:
# !pip install -r ../requirements.txt

In [11]:
# !pip install ipywidgets

In [4]:
# !nvidia-smi

In [1]:
import torch
from datasets import load_dataset
from transformers import pipeline
from jiwer import wer
from tqdm import tqdm

In [2]:
# Optional: to save or play audio
import soundfile as sf
from IPython.display import Audio, display

In [3]:
# --- CONFIG ---
device = "cuda" if torch.cuda.is_available() else "cpu"
lang_code = "fa_ir"
num_samples = 20  # use more for full evaluation
save_audio = False  # set to True to save files
play_audio = False  # set to True in notebooks to play files

models = [
    "openai/whisper-small",
    "m3hrdadfi/wav2vec2-large-xlsr-persian"
]

In [5]:
# --- Load Persian FLEURS dataset ---
print("Loading dataset...")
dataset = load_dataset("google/fleurs", lang_code, split=f"test[:{num_samples}]", trust_remote_code=True)

Loading dataset...


In [6]:
# --- Evaluation loop ---
for model_name in models:
    print(f"\n🔍 Evaluating model: {model_name}")
    asr = pipeline("automatic-speech-recognition", model=model_name, device=0 if device == "cuda" else -1)

    references, hypotheses = [], []

    for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
        audio_array = sample['audio']['array']
        sampling_rate = sample['audio']['sampling_rate']
        reference_text = sample['transcription'].strip().lower()

        try:
            # --- Run ASR ---
            result = asr(audio_array, chunk_length_s=30, return_timestamps=False)
            predicted_text = result["text"].strip().lower()

            # --- Collect results ---
            references.append(reference_text)
            hypotheses.append(predicted_text)

            # --- Optional: Save or play audio ---
            if save_audio:
                out_path = f"audio_{idx}.wav"
                sf.write(out_path, audio_array, sampling_rate)
            if play_audio:
                display(Audio(data=audio_array, rate=sampling_rate))

        except Exception as e:
            print(f"Error on sample {idx}: {e}")

    # --- Calculate and display WER ---
    error = wer(references, hypotheses)
    print(f"✅ WER for {model_name} on FLEURS (Persian): {error:.3f}")


🔍 Evaluating model: openai/whisper-small


Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 20/20 [00:36<00:00,  1.83s/it]


✅ WER for openai/whisper-small on FLEURS (Persian): 0.547

🔍 Evaluating model: m3hrdadfi/wav2vec2-large-xlsr-persian


Device set to use cuda:0
100%|██████████| 20/20 [00:00<00:00, 630.25it/s]

Error on sample 0: CTC can either predict character level timestamps, or word level timestamps. Set `return_timestamps='char'` or `return_timestamps='word'` as required.
Error on sample 1: CTC can either predict character level timestamps, or word level timestamps. Set `return_timestamps='char'` or `return_timestamps='word'` as required.
Error on sample 2: CTC can either predict character level timestamps, or word level timestamps. Set `return_timestamps='char'` or `return_timestamps='word'` as required.
Error on sample 3: CTC can either predict character level timestamps, or word level timestamps. Set `return_timestamps='char'` or `return_timestamps='word'` as required.
Error on sample 4: CTC can either predict character level timestamps, or word level timestamps. Set `return_timestamps='char'` or `return_timestamps='word'` as required.
Error on sample 5: CTC can either predict character level timestamps, or word level timestamps. Set `return_timestamps='char'` or `return_timestamps='


