## Common Voice benchmarking tool

In [None]:
#| default_exp commonvoice

In [None]:
#| hide
from nbdev.showdoc import *

ModuleNotFoundError: No module named 'nbdev'

In [None]:
#| export
import time

import pandas as pd
from datasets import load_dataset, Audio
from jiwer import wer, cer
from transformers import pipeline

from malayalam_asr_benchmarking.utils import whisper_norm, is_target_text_in_range, get_text, normalise, data, get_model_size, clear_gpu_memory

In [None]:
#| export
def load_common_voice_malayalam_dataset():
    """Load and prepare the Common Voice 11.0 "ml" test split for benchmarking.

    Steps, in order:
      1. Fetch the ``test`` split of ``mozilla-foundation/common_voice_11_0``
         for the ``ml`` configuration.
      2. Re-cast the ``audio`` column to a 16 kHz sampling rate.
      3. Apply ``normalise`` to every example.
      4. Keep only the rows whose ``norm_text`` value passes
         ``is_target_text_in_range`` (``norm_text`` is presumably added by
         ``normalise`` -- confirm in ``utils``).

    Returns:
        The filtered dataset produced by the final ``filter`` call.
    """
    raw_test_split = load_dataset(
        "mozilla-foundation/common_voice_11_0", "ml", split="test"
    )
    # Resample the audio column to 16 kHz.
    resampled = raw_test_split.cast_column("audio", Audio(sampling_rate=16000))
    with_norm_text = resampled.map(normalise)
    return with_norm_text.filter(
        is_target_text_in_range, input_columns=["norm_text"]
    )

In [None]:
#| export
def evaluate_whisper_model_common_voice(
    model_name: str,
    device=0,
    batch_size: int = 16,
    output_path: str = "test_file.parquet",
) -> None:
    """Benchmark an ASR checkpoint on the Common Voice Malayalam test split.

    Builds a ``transformers`` automatic-speech-recognition pipeline for
    ``model_name``, transcribes the dataset returned by
    ``load_common_voice_malayalam_dataset()``, prints the elapsed time,
    corpus-level WER/CER (as percentages rounded to 2 decimals) and the model
    size, and writes a per-utterance results table to ``output_path``.

    The saved table has the columns ``predictions``, ``ground_truth``,
    ``model_name``, ``wer``, ``cer`` (per-utterance scores), ``total_time`` and
    ``model_size`` (the last three are constant across rows).

    Args:
        model_name: Model identifier passed to ``pipeline(model=...)``.
        device: Device passed through to ``pipeline(device=...)``. Defaults to
            ``0``, the value that was previously hard-coded.
        batch_size: Batch size used when calling the pipeline. Defaults to 16.
        output_path: Destination of the parquet file. Defaults to
            ``"test_file.parquet"``, the previously hard-coded location; pass a
            model-specific path to avoid overwriting earlier runs.

    Returns:
        None. Results are printed and written to ``output_path``.

    Raises:
        Whatever the pipeline, ``jiwer`` or ``DataFrame.to_parquet`` raise; the
        GPU clean-up below still runs in that case.
    """
    whisper_asr = pipeline(
        "automatic-speech-recognition", model=model_name, device=device
    )
    try:
        dataset = load_common_voice_malayalam_dataset()

        predictions = []
        references = []

        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by wall-clock adjustments during a long run.
        start = time.perf_counter()
        for out in whisper_asr(data(dataset), batch_size=batch_size):
            predictions.append(whisper_norm(out["text"]))
            # Each pipeline output carries a "reference" sequence; its first
            # element is used as the ground truth for this utterance.
            references.append(out["reference"][0])
        total_time = time.perf_counter() - start
        print(f"Total time taken: {total_time}")

        df = pd.DataFrame({"predictions": predictions, "ground_truth": references})
        df["model_name"] = model_name
        # Per-utterance scores, computed straight from the parallel lists.
        df["wer"] = [wer(ref, hyp) for ref, hyp in zip(references, predictions)]
        df["cer"] = [cer(ref, hyp) for ref, hyp in zip(references, predictions)]
        df["total_time"] = total_time

        # Corpus-level scores over the whole split, reported as percentages.
        rwer = round(100 * wer(references, predictions), 2)
        print(f"The WER of model: {rwer}")

        rcer = round(100 * cer(references, predictions), 2)
        print(f"The CER of model: {rcer}")

        # Compute the size once and reuse it for both the log line and the table.
        model_size = get_model_size(whisper_asr.model)
        print(f"The model size is: {model_size}")
        df["model_size"] = model_size
        df.to_parquet(output_path)
    finally:
        # Always release GPU resources, even when evaluation fails part-way.
        # Drop the local pipeline reference first so the clean-up helper is not
        # asked to free memory that is still reachable from this frame
        # (assumes clear_gpu_memory relies on garbage collection -- confirm).
        del whisper_asr
        clear_gpu_memory()

## Testing with a sample model

In [None]:
#|eval: false
# Smoke test: run the full benchmark on one checkpoint. With the default
# arguments this runs the pipeline on device 0 and writes "test_file.parquet"
# to the working directory; the printed timing/WER/CER appear below the cell.
evaluate_whisper_model_common_voice("parambharat/whisper-tiny-ml")

Found cached dataset common_voice_11_0 (/home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0)
Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-374585c2877047e3.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-22670505c562e0d4.arrow


Total time taken: 65.49683594703674
The WER of model: 38.31
The CER of model: 21.93
The model size is: 37.76M
