# Evaluation on the Common Voice Malayalam subset

In [1]:
#| default_exp commonvoice

In [6]:
#| hide
from nbdev.showdoc import *

In [1]:
#| export
import time
from typing import List

import pandas as pd
from datasets import load_dataset, Audio
from faster_whisper import WhisperModel
from jiwer import wer, cer
from transformers import pipeline
from tqdm.notebook import tqdm
from whisper_normalizer.malayalam import MalayalamTextNormalizer

from malayalam_asr_benchmarking.utils import is_target_text_in_range, get_text, data, get_model_size, clear_gpu_memory

SyntaxError: invalid syntax (616949157.py, line 11)

In [3]:
#| export
def load_common_voice_malayalam_dataset():
    """Return the Common Voice 11.0 Malayalam (`ml`) test split prepared for evaluation.

    The `audio` column is cast to an `Audio` feature with a 16000 Hz sampling rate, and
    rows are kept only when `is_target_text_in_range` accepts their `norm_text` value.
    """
    test_split = load_dataset("mozilla-foundation/common_voice_11_0", "ml", split="test")
    resampled = test_split.cast_column("audio", Audio(sampling_rate=16000))
    return resampled.filter(is_target_text_in_range, input_columns=["norm_text"])

### Hugging Face Transformers Whisper models

In [4]:
#| export
# Shared text normalizer applied to both predictions and references before scoring.
normalizer = MalayalamTextNormalizer()
def evaluate_whisper_model_common_voice(
        model_name: str, # The model name
        werlist: List[float], # WER List
        cerlist: List[float],# CER list
        modelsizelist: List[str], # model size list
        timelist: List[float], # time(s) list
        bs:int =16, # batch size. Default value is 16.
)->None:
    """Evaluate a Hugging Face Whisper model on the Common Voice Malayalam test split.

    The corpus-level WER and CER (percentages rounded to two decimals), the model size and the
    inference time in seconds are appended to `werlist`, `cerlist`, `modelsizelist` and `timelist`,
    so results accumulate across successive calls. Per-utterance results are written to
    `<model name with "/" replaced by "_">_commonvoice.parquet` in the working directory.
    """
    whisper_asr = pipeline(
            "automatic-speech-recognition", model=model_name, device=0
        )
    dataset = load_common_voice_malayalam_dataset()

    predictions = []
    references = []

    # Only the inference loop is timed; model and dataset loading are excluded.
    start = time.time()
    for out in whisper_asr(data(dataset), batch_size=bs):
        predictions.append(normalizer(out["text"]))
        references.append(normalizer(out["reference"][0]))
    elapsed = time.time() - start
    print(f"Total time taken: {elapsed}")
    timelist.append(elapsed)

    df = pd.DataFrame({"predictions": predictions, "ground_truth": references})
    df["model_name"] = model_name
    # NOTE(review): the columns are already normalized; the second normalizer pass is kept
    # so the per-row scores stay exactly what earlier runs produced.
    df["wer"] = df.apply(lambda row: wer(normalizer(row["ground_truth"]), normalizer(row["predictions"])), axis=1)
    df["cer"] = df.apply(lambda row: cer(normalizer(row["ground_truth"]), normalizer(row["predictions"])), axis=1)
    df["total_time"] = elapsed

    rwer = round(100 * wer(references, predictions), 2)
    werlist.append(rwer)
    print(f"The WER of model: {rwer}")

    rcer = round(100 * cer(references, predictions), 2)
    cerlist.append(rcer)
    print(f"The CER of model: {rcer}")

    # Compute the size once and reuse it for the log line, the list and the dataframe column.
    model_size = get_model_size(whisper_asr.model)
    print(f"The model size is: {model_size}")
    modelsizelist.append(model_size)
    df["model_size"] = model_size

    # Build the file name from every non-empty path component so that names without a "/"
    # (e.g. a local directory) no longer raise IndexError after the whole evaluation has run.
    save_name = [part for part in model_name.split("/") if part]
    print(save_name)
    df.to_parquet(f"{'_'.join(save_name)}_commonvoice.parquet")

    clear_gpu_memory()

In [7]:
show_doc(evaluate_whisper_model_common_voice)

---

[source](https://github.com/kurianbenoy/malayalam_asr_benchmarking/blob/main/malayalam_asr_benchmarking/commonvoice.py#L38){target="_blank" style="float:right; font-size:smaller"}

### evaluate_whisper_model_common_voice

>      evaluate_whisper_model_common_voice (model_name:str, werlist:List[float],
>                                           cerlist:List[float],
>                                           modelsizelist:List[str],
>                                           timelist:List[float], bs:int=16)

A utility function for calculing WER in Common voice dataset provided a model name in huggingface.
You can store a WER, CER, ModelSize, TimeList to calculate results cumulatively over different epochs

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| model_name | str |  | The model name |
| werlist | List |  | WER List |
| cerlist | List |  | CER list |
| modelsizelist | List |  | model size list |
| timelist | List |  | time(s) list |
| bs | int | 16 | batch size. Default value is 16. |
| **Returns** | **None** |  |  |

## Testing with a sample model

In [None]:
#|eval: false
# Accumulator lists that the evaluation function appends its results to.
wer_list = []
cer_list = []
model_size_list = []
time_list = []
# Benchmark the `parambharat/whisper-tiny-ml` checkpoint with the default batch size.
evaluate_whisper_model_common_voice("parambharat/whisper-tiny-ml", wer_list, cer_list, model_size_list, time_list)

Found cached dataset common_voice_11_0 (/home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0)
Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-374585c2877047e3.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ml/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-22670505c562e0d4.arrow


Total time taken: 59.84694576263428
The WER of model: 38.76
The CER of model: 22.21
The model size is: 37.76M
['parambharat', 'whisper-tiny-ml']


In [None]:
#|eval: false
wer_list

[38.76]

In [None]:
#|eval: false
cer_list

[22.21]

In [None]:
#|eval: false
model_size_list

['37.76M']

In [None]:
#|eval: false
time_list

[59.84694576263428]

### Faster-Whisper models

In [12]:
# Load the faster-whisper checkpoint used for the single-sample check in the next cells.
model = WhisperModel("kurianbenoy/vegam-whisper-medium-ml-fp16")

In [9]:
# Take the first record of the filtered test split as a sample for a single transcription.
dataset = load_common_voice_malayalam_dataset()
t = dataset[0]

In [10]:
t

{'client_id': '82e6e487a5bf6fd5901946fc6e05ec523f0bc9fb574fbd7128b955f021a0831d629d8f6d61a7b107e8f1cbf9652d0cfb22c535bf999860db0ffdcd0f25f48261',
 'path': '/home/.cache/huggingface/datasets/downloads/extracted/2351c96984ecb73430e25907bb4c930df191ec6cd4e5d087deafed9b0ba1e99a/ml_test_0/common_voice_ml_32261077.mp3',
 'audio': {'path': '/home/.cache/huggingface/datasets/downloads/extracted/2351c96984ecb73430e25907bb4c930df191ec6cd4e5d087deafed9b0ba1e99a/ml_test_0/common_voice_ml_32261077.mp3',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          9.95938535e-06, -7.94161679e-06,  4.42485316e-06]),
  'sampling_rate': 16000},
 'sentence': 'ഇന്ദിരാവധത്തിനെ തുടര്ന്നുണ്ടായ സിഖ് വിരുദ്ധ കലാപമാണ് വിഭജനത്തിനുശേഷം സ്വതന്ത്ര ഇന്ത്യ കണ്ടതില് വെച്ചു ഏറ്റവും രൂക്ഷമായ വംശീയകലാപം.',
 'up_votes': 2,
 'down_votes': 0,
 'age': '',
 'gender': '',
 'accent': '',
 'locale': 'ml',
 'segment': '',
 'norm_text': 'ഇന ദ ര വധത ത ന ത ടര ന ന ണ ട യ സ ഖ വ ര ദ ധ കല പമ ണ വ ഭജനത ത ന ശ ഷ സ വതന

In [22]:
# Transcribe the sample's audio array with beam size 5 and report the language the model detected.
segments, info = model.transcribe(t["audio"]["array"], beam_size=5)
print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

Detected language 'ba' with probability 0.361816


In [23]:
" ".join([segment.text for segment in segments])

'ഇന്ദിര വധത്തിനെ തുടർന്നുണ്ടായ സിഖുവിരുദ്ധ കലാപമാണ് വിഭജനത്തിനു ശേഷം സ്വതന്ത്ര്യ ഇന്ത്യ കണ്ടെത്തിൽ വെച്ച'

In [52]:
#| export
def evaluate_faster_whisper_model_common_voice(
        model_name: str, # The model name
        werlist: List[float], # WER List
        cerlist: List[float],# CER list
        modelsizelist: List[str], # model size list (currently left untouched by this function)
        timelist: List[float], # time(s) list
        bs:int =16, # batch size. Currently unused: utterances are transcribed one at a time.
        compute_type:str="float16", # The compute type supported by faster-Whisper
        beam_size:int=1, # beam size
)->None:
    """Evaluate a faster-whisper model on the Common Voice Malayalam test split.

    The corpus-level WER and CER (percentages rounded to two decimals) and the inference time in
    seconds are appended to `werlist`, `cerlist` and `timelist`, so results accumulate across
    successive calls. Per-utterance results are written to
    `<model name with "/" replaced by "_">_commonvoice.parquet` in the working directory.
    """
    dataset = load_common_voice_malayalam_dataset()
    model = WhisperModel(model_name, device="cuda", compute_type=compute_type)

    predictions = []
    references = []

    # Only the transcription loop is timed; model and dataset loading are excluded.
    start = time.time()
    for x in tqdm(dataset):
        segments, _ = model.transcribe(x["audio"]["array"], beam_size=beam_size)
        # Build the transcript exactly once: faster-whisper returns `segments` as a lazy
        # generator, so iterating it a second time would yield an empty transcript.
        transcript = " ".join(segment.text for segment in segments)
        predictions.append(normalizer(transcript))
        references.append(normalizer(x["sentence"]))
    elapsed = time.time() - start
    print(f"Total time taken: {elapsed}")
    timelist.append(elapsed)

    df = pd.DataFrame({"predictions": predictions, "ground_truth": references})
    df["model_name"] = model_name
    # NOTE(review): the columns are already normalized; the second normalizer pass is kept
    # so the per-row scores stay exactly what earlier runs produced.
    df["wer"] = df.apply(lambda row: wer(normalizer(row["ground_truth"]), normalizer(row["predictions"])), axis=1)
    df["cer"] = df.apply(lambda row: cer(normalizer(row["ground_truth"]), normalizer(row["predictions"])), axis=1)
    df["total_time"] = elapsed

    rwer = round(100 * wer(references, predictions), 2)
    werlist.append(rwer)
    print(f"The WER of model: {rwer}")

    rcer = round(100 * cer(references, predictions), 2)
    cerlist.append(rcer)
    print(f"The CER of model: {rcer}")

    # NOTE(review): the model size is not recorded here, so `modelsizelist` and the dataframe
    # get no size entry and the list can end up shorter than the other result lists.
    # TODO: decide how to measure the size of a faster-whisper model and append it.

    # Build the file name from every non-empty path component so that names without a "/"
    # (e.g. a local directory) no longer raise IndexError after the whole evaluation has run.
    save_name = [part for part in model_name.split("/") if part]
    print(save_name)
    df.to_parquet(f"{'_'.join(save_name)}_commonvoice.parquet")

    clear_gpu_memory()

In [53]:
show_doc(evaluate_faster_whisper_model_common_voice)

---

### evaluate_faster_whisper_model_common_voice

>      evaluate_faster_whisper_model_common_voice (model_name:str,
>                                                  werlist:List[float],
>                                                  cerlist:List[float],
>                                                  modelsizelist:List[str],
>                                                  timelist:List[float],
>                                                  bs:int=16,
>                                                  compute_type:str='float16',
>                                                  beam_size=1)

A utility function for calculing WER in Common voice dataset provided a model name in huggingface.
You can store a WER, CER, ModelSize, TimeList to calculate results cumulatively over different epochs

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| model_name | str |  | The model name |
| werlist | List |  | WER List |
| cerlist | List |  | CER list |
| modelsizelist | List |  | model size list |
| timelist | List |  | time(s) list |
| bs | int | 16 | batch size. Default value is 16. |
| compute_type | str | float16 | The compute type supported by faster-Whisper |
| beam_size | int | 1 | beam size |
| **Returns** | **None** |  |  |

In [54]:
#|eval: false
# Fresh accumulator lists for the faster-whisper run with the default beam size (1).
wer_list = []
cer_list = []
model_size_list = []
time_list = []
evaluate_faster_whisper_model_common_voice("kurianbenoy/vegam-whisper-medium-ml-fp16", wer_list, cer_list, model_size_list, time_list)
# Show everything that was collected in one tuple.
wer_list, cer_list, model_size_list, time_list

  0%|          | 0/112 [00:00<?, ?it/s]

Total time taken: 91.5117712020874
The WER of model: 24.71
The CER of model: 18.57
['kurianbenoy', 'vegam-whisper-medium-ml-fp16']


([24.71], [18.57], [], [91.5117712020874])

In [49]:
#|eval: false
# Same evaluation as the previous cell, but with beam_size=5 for comparison.
wer_list = []
cer_list = []
model_size_list = []
time_list = []
evaluate_faster_whisper_model_common_voice("kurianbenoy/vegam-whisper-medium-ml-fp16", wer_list, cer_list, model_size_list, time_list, beam_size=5)
# Show everything that was collected in one tuple.
wer_list, cer_list, model_size_list, time_list

  0%|          | 0/112 [00:00<?, ?it/s]

Total time taken: 132.16357469558716
The WER of model: 20.5
The CER of model: 13.95
['kurianbenoy', 'vegam-whisper-medium-ml-fp16']


([20.5], [13.95], [], [132.16357469558716])