## Common Voice benchmarking tool

In [2]:
#| default_exp commonvoice

In [3]:
#| hide
from nbdev.showdoc import *

In [10]:
#| export
import time

import pandas as pd
from datasets import load_dataset, Audio
from jiwer import wer, cer
from transformers import pipeline

from malayalam_asr_benchmarking.utils import is_target_text_in_range, get_text, normalise, data, get_model_size

In [15]:
#| export
def evaluate_whisper_model_common_voice(model_name: str)->None:
    """Benchmark a Whisper checkpoint on the Common Voice 11.0 `ml` test split.

    Loads `model_name` into a Hugging Face automatic-speech-recognition
    pipeline, transcribes every retained test sample, and reports word and
    character error rates.

    Args:
        model_name: Model identifier or path passed straight to
            `transformers.pipeline`.

    Returns:
        None. Results are reported through side effects:
          * prints the inference wall-clock time, corpus-level WER and CER
            (as percentages rounded to 2 decimals) and the model size;
          * writes `test_file.parquet` in the current working directory with
            one row per sample and the columns `predictions`,
            `ground_truth`, `model_name`, `wer`, `cer`, `total_time`,
            `model_size`.

    Notes:
        The pipeline is created with `device=0`, so a CUDA device is expected.
    """
    # Function-scope import: the original body called `whisper_norm` without it
    # ever being defined or imported in this notebook/module, which raises a
    # NameError on a fresh kernel and in the exported module.
    # NOTE(review): references are normalised by `normalise` from utils --
    # confirm it uses the same normaliser so both sides stay comparable.
    from transformers.models.whisper.english_normalizer import BasicTextNormalizer

    whisper_norm = BasicTextNormalizer()

    whisper_asr = pipeline(
            "automatic-speech-recognition", model=model_name, device=0
        )

    dataset = load_dataset(
            "mozilla-foundation/common_voice_11_0",
            "ml",
            split="test"
    )
    # Resample the audio column to 16 kHz before feeding it to the pipeline.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    # `normalise` adds the "norm_text" column that the filter below inspects.
    dataset = dataset.map(normalise)
    dataset = dataset.filter(is_target_text_in_range, input_columns=["norm_text"])
    predictions = []
    references = []

    # Only the inference loop is timed; model and dataset loading are excluded.
    start = time.time()
    for out in whisper_asr(data(dataset), batch_size=16):
        predictions.append(whisper_norm(out["text"]))
        # The pipeline hands the "reference" value back as a list; take its
        # single entry.
        references.append(out["reference"][0])
    end = time.time()
    total_time = end - start
    print(f"Total time taken: {total_time}")

    # Compute the model size once and reuse it for both the log line and the
    # results column.
    model_size = get_model_size(whisper_asr.model)

    df = pd.DataFrame({"predictions": predictions, "ground_truth": references})
    df["model_name"] = model_name
    # Per-sample error rates (fractions, not percentages).
    df["wer"] = [wer(ref, pred) for ref, pred in zip(references, predictions)]
    df["cer"] = [cer(ref, pred) for ref, pred in zip(references, predictions)]
    df["total_time"] = total_time

    # Corpus-level error rates, reported as percentages.
    rwer = wer(references, predictions)
    rwer = round(100 * rwer, 2)
    print(f"The WER of model: {rwer}")

    rcer = cer(references, predictions)
    rcer = round(100 * rcer, 2)
    print(f"The CER of model: {rcer}")

    print(f"The model size is: {model_size}")
    df["model_size"] = model_size
    df.to_parquet("test_file.parquet")

In [9]:
# Run the benchmark end to end on a whisper-tiny Malayalam checkpoint
# (the pipeline inside is created with device=0).
evaluate_whisper_model_common_voice("parambharat/whisper-tiny-ml")

Found cached dataset common_voice_11_0 (/home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0)


Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Filter:   0%|          | 0/112 [00:00<?, ? examples/s]



Total time taken: 66.36547708511353
The WER of model: 38.31
The CER of model: 21.93
The model size is: 37.76M


In [16]:
# Same call as the previous cell, re-run after the function cell was
# re-executed; the map/filter steps are loaded from the datasets cache here.
evaluate_whisper_model_common_voice("parambharat/whisper-tiny-ml")

Found cached dataset common_voice_11_0 (/home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0)
Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-374585c2877047e3.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-22670505c562e0d4.arrow


Total time taken: 63.78305149078369
The WER of model: 38.31
The CER of model: 21.93
The model size is: 37.76M
                                         predictions  \
0  ഇന ത ര വതത ത ന ത ടർന ന ണ ട യ സ ക ക വര ത ത കല പ...   
1  പ ന റ യ നൽക ന നത ക ണ ട ര ഗത ത ല ക ക ള ള മ റ റ ...   
2         എന ത ന ന ഒര ക ല ഇക കടകന ന ന ങ ങ ള ട ട ത ണ    
3  ന പ പയ ണ ട ഉറവ ട കണ ട ത ത ന വല യ സ ഥവ വൻ നടക ക...   
4                     അത ക ക അത ക ണ ട പ യ ക ക ട ക ക    

                                        ground_truth  \
0  ഇന ദ ര വധത ത ന ത ടര ന ന ണ ട യ സ ഖ വ ര ദ ധ കല പ...   
1  പ ന ത ണയ നല ക ന നത ക ണ ട ര ഗത ത ല ക ക ള ള മ റ ...   
2              എന ത ന ഒര ക ല ഈ കട തന ന ന ങ ങള ട ത ണ    
3       ന പയ ട ഉറവ ട കണ ട ത ത ന വല യ ശ രമ നടക ക ന ന    
4                          അത ക അത ക ണ ട പ യ ക ടക ക    

                    model_name       wer       cer  total_time model_size  
0  parambharat/whisper-tiny-ml  0.450980  0.308333   63.783051     37.76M  
1  parambharat/whisper-tiny-ml  0.256410  0.122222   63.783051  

In [17]:
import pandas as pd

In [18]:
# Read back the file written by evaluate_whisper_model_common_voice and
# preview the first rows as a sanity check.
df = pd.read_parquet("test_file.parquet")
df.head()

Unnamed: 0,predictions,ground_truth,model_name,wer,cer,total_time,model_size
0,ഇന ത ര വതത ത ന ത ടർന ന ണ ട യ സ ക ക വര ത ത കല പ...,ഇന ദ ര വധത ത ന ത ടര ന ന ണ ട യ സ ഖ വ ര ദ ധ കല പ...,parambharat/whisper-tiny-ml,0.45098,0.308333,63.783051,37.76M
1,പ ന റ യ നൽക ന നത ക ണ ട ര ഗത ത ല ക ക ള ള മ റ റ ...,പ ന ത ണയ നല ക ന നത ക ണ ട ര ഗത ത ല ക ക ള ള മ റ ...,parambharat/whisper-tiny-ml,0.25641,0.122222,63.783051,37.76M
2,എന ത ന ന ഒര ക ല ഇക കടകന ന ന ങ ങ ള ട ട ത ണ,എന ത ന ഒര ക ല ഈ കട തന ന ന ങ ങള ട ത ണ,parambharat/whisper-tiny-ml,0.4375,0.25,63.783051,37.76M
3,ന പ പയ ണ ട ഉറവ ട കണ ട ത ത ന വല യ സ ഥവ വൻ നടക ക...,ന പയ ട ഉറവ ട കണ ട ത ത ന വല യ ശ രമ നടക ക ന ന,parambharat/whisper-tiny-ml,0.277778,0.232558,63.783051,37.76M
4,അത ക ക അത ക ണ ട പ യ ക ക ട ക ക,അത ക അത ക ണ ട പ യ ക ടക ക,parambharat/whisper-tiny-ml,0.363636,0.208333,63.783051,37.76M


In [19]:
# Check row count, dtypes and non-null counts of the saved results.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   predictions   112 non-null    object 
 1   ground_truth  112 non-null    object 
 2   model_name    112 non-null    object 
 3   wer           112 non-null    float64
 4   cer           112 non-null    float64
 5   total_time    112 non-null    float64
 6   model_size    112 non-null    object 
dtypes: float64(3), object(4)
memory usage: 6.2+ KB
