In [2]:
import torch
import evaluate
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor

def load_and_evaluate_dataset(dataset_name, model, processor, max_samples=3):
    """
    Load a dataset with streaming, preprocess it, and evaluate speech recognition performance
    
    Args:
        dataset_name (str): Name of the dataset
        model: Pretrained Whisper model
        processor: Whisper processor for preparing audio inputs
        max_samples (int): Maximum number of samples to evaluate
    
    Returns:
        dict: Evaluation metrics
    """
    # Dataset loading configurations with streaming
    dataset_configs = {
        'LibriSpeech Clean': {
            'path': 'librispeech_asr', 
            'name': 'clean', 
            'split': 'test'
        },
        'LibriSpeech Other': {
            'path': 'librispeech_asr', 
            'name': 'other', 
            'split': 'test'
        },
        'Common Voice': {
            'path': 'mozilla-foundation/common_voice_11_0', 
            'name': 'en', 
            'split': 'test'
        },
        'VoxPopuli': {
            'path': 'facebook/voxpopuli', 
            'name': 'en', 
            'split': 'test'
        },
        'TEDLIUM': {
            'path': 'ted_talks_iwslt', 
            'name': 'release2', 
            'split': 'test'
        },
        'GigaSpeech': {
            'path': 'speechcolab/gigaspeech', 
            'name': 'xs', 
            'split': 'test'
        },
        'SPGISpeech': {
            'path': 'speechcolab/spgispeech', 
            'split': 'test'
        },
        'Earnings-22': {
            'path': 'speechcolab/earnings22', 
            'split': 'test'
        },
        'AMI': {
            'path': 'edinburghnlp/ami', 
            'split': 'test'
        }
    }
    
    # Prepare device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    
    # Prepare metric
    wer_metric = evaluate.load("wer")
    
    # Collect predictions and references
    predictions = []
    references = []
    
    # Streaming dataset load
    try:
        # Get configuration for the dataset
        config = dataset_configs.get(dataset_name, {})
        
        # Load dataset with streaming
        dataset = load_dataset(
            config.get('path', dataset_name.lower().replace(' ', '_')), 
            name=config.get('name'),
            split=config.get('split', 'test'),
            streaming=True  # Key change: Enable streaming
        )
        
        # Limit samples using iterator
        for idx, item in enumerate(dataset):
            if idx >= max_samples:
                break
            
            try:
                # Prepare audio input
                audio = item['audio']
                ref_text = item['text']
                
                # Process audio
                input_features = processor(
                    audio['array'], 
                    sampling_rate=audio['sampling_rate'], 
                    return_tensors="pt"
                ).input_features.to(device)
                
                # Generate transcription
                with torch.no_grad():
                    predicted_ids = model.generate(input_features)
                    pred_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
                
                predictions.append(pred_text)
                references.append(ref_text)
                
            except Exception as e:
                print(f"Error processing sample in {dataset_name}: {e}")
        
    except Exception as e:
        print(f"Error loading {dataset_name}: {e}")
        return None
    
    # Calculate Word Error Rate
    if predictions and references:
        wer = wer_metric.compute(predictions=predictions, references=references)
        return {
            'dataset': dataset_name,
            'WER': wer * 100,  # Convert to percentage
            'total_samples': len(predictions)
        }
    
    return None

def evaluate_model_across_datasets(model_name="openai/whisper-small", samples_per_dataset=3):
    """
    Evaluate a Whisper speech recognition model across multiple datasets
    
    Args:
        model_name (str): Hugging Face Whisper model identifier
        samples_per_dataset (int): Number of samples to test per dataset
    
    Returns:
        list: Evaluation results for each dataset
    """
    # Load model and processor
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    processor = WhisperProcessor.from_pretrained(model_name)
    
    # Datasets to evaluate
    datasets = [
        'LibriSpeech Clean', 
        'LibriSpeech Other', 
        'Common Voice', 
        'VoxPopuli', 
        'TEDLIUM', 
        'GigaSpeech', 
        'SPGISpeech', 
        'Earnings-22', 
        'AMI'
    ]
    
    # Evaluate on each dataset
    results = []
    for dataset in datasets:
        print(f"Evaluating on {dataset}")
        result = load_and_evaluate_dataset(
            dataset_name=dataset, 
            model=model, 
            processor=processor, 
            max_samples=samples_per_dataset
        )
        if result:
            results.append(result)
            print(f"Result: {result}")
    
    return results

# Main execution
if __name__ == "__main__":
    # Evaluate the model
    evaluation_results = evaluate_model_across_datasets(samples_per_dataset=3)
    
    # Optional: Display results in a pandas DataFrame
    import pandas as pd
    results_df = pd.DataFrame(evaluation_results)
    print("\nFull Evaluation Results:")
    print(results_df)

Evaluating on LibriSpeech Clean


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Result: {'dataset': 'LibriSpeech Clean', 'WER': 100.0, 'total_samples': 3}
Evaluating on LibriSpeech Other
Result: {'dataset': 'LibriSpeech Other', 'WER': 96.7741935483871, 'total_samples': 3}
Evaluating on Common Voice


Reading metadata...: 16354it [00:02, 8172.37it/s] 


Error processing sample in Common Voice: 'text'
Error processing sample in Common Voice: 'text'
Error processing sample in Common Voice: 'text'
Evaluating on VoxPopuli
Error processing sample in VoxPopuli: 'text'
Error processing sample in VoxPopuli: 'text'
Error processing sample in VoxPopuli: 'text'
Evaluating on TEDLIUM


README.md:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

ted_talks_iwslt.py:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Error loading TEDLIUM: BuilderConfig 'release2' not found. Available: ['eu_ca_2014', 'eu_ca_2015', 'eu_ca_2016', 'nl_en_2014', 'nl_en_2015', 'nl_en_2016', 'nl_hi_2014', 'nl_hi_2015', 'nl_hi_2016', 'de_ja_2014', 'de_ja_2015', 'de_ja_2016', 'fr-ca_hi_2014', 'fr-ca_hi_2015', 'fr-ca_hi_2016']
Evaluating on GigaSpeech
Result: {'dataset': 'GigaSpeech', 'WER': 128.57142857142858, 'total_samples': 3}
Evaluating on SPGISpeech
Error loading SPGISpeech: Dataset 'speechcolab/spgispeech' doesn't exist on the Hub or cannot be accessed.
Evaluating on Earnings-22
Error loading Earnings-22: Dataset 'speechcolab/earnings22' doesn't exist on the Hub or cannot be accessed.
Evaluating on AMI
Error loading AMI: Dataset 'edinburghnlp/ami' doesn't exist on the Hub or cannot be accessed.

Full Evaluation Results:
             dataset         WER  total_samples
0  LibriSpeech Clean  100.000000              3
1  LibriSpeech Other   96.774194              3
2         GigaSpeech  128.571429              3
