In [4]:
import torch
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from datasets import load_dataset
import jiwer
from tqdm import tqdm
import pandas as pd
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Text normalization transform
wer_transform = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.RemovePunctuation(),
    jiwer.ReduceToListOfListOfWords()
])

def calculate_wer(references, predictions):
    try:
        return jiwer.wer(
            references,
            predictions,
            truth_transform=wer_transform,
            hypothesis_transform=wer_transform
        )
    except:
        return float('inf')

# Initialize model
device = "cpu"
torch_dtype = torch.float32
model_id = "openai/whisper-small"

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
)

DATASET_CONFIG = [
    {"name": "LibriSpeech Clean", "path": "librispeech_asr", "config": "clean", "split": "test", "text_key": "text"},
    {"name": "LibriSpeech Other", "path": "librispeech_asr", "config": "other", "split": "test", "text_key": "text"},
    {"name": "Common Voice", "path": "mozilla-foundation/common_voice_11_0", "config": "en", "split": "test", "text_key": "sentence"},
    {"name": "VoxPopuli", "path": "facebook/voxpopuli", "config": "en", "split": "test", "text_key": "raw_text"},
]

def evaluate_dataset(dataset_config, max_samples=5):
    try:
        dataset = load_dataset(
            dataset_config["path"],
            dataset_config.get("config"),
            split=dataset_config["split"],
            streaming=True
        ).take(max_samples)
    except Exception as e:
        print(f"Error loading {dataset_config['name']}: {str(e)}")
        return None

    predictions, references = [], []
    
    for sample in tqdm(dataset, desc=dataset_config["name"]):
        try:
            # Validate audio
            audio = sample["audio"]["array"]
            if audio.size == 0 or audio.ndim != 1:
                raise ValueError("Invalid audio format")
                
            # Get text with dataset-specific key
            text = sample.get(dataset_config["text_key"])
            if not text or not isinstance(text, str):
                raise ValueError(f"Missing text in {dataset_config['name']}")
            
            # Get prediction
            result = pipe(
                audio,
                chunk_length_s=30,
                stride_length_s=(4, 2),
                generate_kwargs={"language": "<|en|>", "task": "transcribe"},
            )
            prediction = result["text"].strip()
            
            references.append(text.strip())
            predictions.append(prediction)
                
        except Exception as e:
            print(f"{dataset_config['name']} sample error: {str(e)}")
            continue

    if len(references) == 0:
        return None
        
    return {
        "dataset": dataset_config["name"],
        "samples_tested": len(references),
        "WER": calculate_wer(references, predictions)
    }

# Run evaluation
results = []
for config in DATASET_CONFIG:
    result = evaluate_dataset(config, max_samples=5)
    if result:
        results.append(result)

# Display results
if results:
    df = pd.DataFrame(results)
    print("\nEvaluation Results:")
    print(df[["dataset", "samples_tested", "WER"]].round(3))
else:
    print("No results to display")

Device set to use cpu
LibriSpeech Clean: 0it [00:00, ?it/s]You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
LibriSpeech Clean: 5it [00:38,  7.79s/it]
LibriSpeech Other: 5it [00:30,  6.18s/it]
Reading metadata...: 16354it [00:02, 6384.25it/s]
Common Voice: 5it [00:17,  3.52s/it]
VoxPopuli: 5it [00:26,  5.25s/it]


Evaluation Results:
             dataset  samples_tested    WER
0  LibriSpeech Clean               5  0.013
1  LibriSpeech Other               5  0.085
2       Common Voice               5  1.114
3          VoxPopuli               5  0.080



