# Evaluation on the Malayalam Speech Corpus (MSC) dataset

In [None]:
#| default_exp msc

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import time
from typing import List

import pandas as pd
from datasets import load_dataset, Audio
from faster_whisper import WhisperModel
from jiwer import wer, cer
from transformers import pipeline
from tqdm.notebook import tqdm
from whisper_normalizer.indic_normalizer import MalayalamNormalizer

from malayalam_asr_benchmarking.utils import get_text, data, get_model_size, clear_gpu_memory, store_results_as_dataset

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/kurianbenoy/mambaforge/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


### Loading dataset and evaluating model

In [None]:
#| export
def load_malayalam_speech_corpus_dataset():
    """Return the train split of `thennal/msc` with audio resampled to 16 kHz."""
    msc_train = load_dataset("thennal/msc", split="train")
    # Cast the audio column so every clip is decoded at a 16000 Hz sampling rate.
    return msc_train.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
#| export
# Shared Malayalam text normalizer; the evaluation functions below apply it to
# both predictions and references before computing WER/CER.
normalizer = MalayalamNormalizer()

### Evaluating Whisper based model

In [None]:
#| export
def evaluate_whisper_model_msc(
        model_name: str, # The model name
        werlist: List[float], # WER List
        cerlist: List[float], # CER list
        modelsizelist: List[str], # model size list
        timelist: List[float], # time(s) list
        bs:int =16, # batch size
)->None:
    """Evaluate a Hugging Face Whisper model on the Malayalam Speech Corpus (MSC) dataset.

    The dataset is transcribed through a `transformers` automatic-speech-recognition
    pipeline on device 0. WER and CER are computed on normalized text and reported
    as percentages rounded to two decimals. The elapsed time, WER, CER and model
    size are appended to the lists passed in, and all results are handed to
    `store_results_as_dataset` under the "msc.parquet" name.
    """
    try:
        whisper_asr = pipeline(
            "automatic-speech-recognition", model=model_name, device=0
        )
        dataset = load_malayalam_speech_corpus_dataset()

        predictions = []
        references = []
        predictions_raw = []
        references_raw = []

        start = time.time()
        print("process of calculating predictions")
        for out in tqdm(whisper_asr(data(dataset), batch_size=bs)):
            raw_prediction = out["text"]
            raw_reference = out["reference"][0]
            predictions_raw.append(raw_prediction)
            references_raw.append(raw_reference)
            # Metrics are computed on normalized text; raw text is kept for storage.
            predictions.append(normalizer(raw_prediction))
            references.append(normalizer(raw_reference))

        print("completed getting predictions")
        elapsed = time.time() - start
        print(f"Total time taken: {elapsed}")
        timelist.append(elapsed)

        rwer = round(100 * wer(references, predictions), 2)
        werlist.append(rwer)
        print(f"The WER of model: {rwer}")

        rcer = round(100 * cer(references, predictions), 2)
        cerlist.append(rcer)
        print(f"The CER of model: {rcer}")

        # Compute the model size once and reuse it for printing, the list and storage.
        model_size = get_model_size(whisper_asr.model)
        print(f"The model size is: {model_size}")
        modelsizelist.append(model_size)

        store_results_as_dataset(
            predictions, predictions_raw, references, references_raw,
            model_name, elapsed, model_size, rwer, rcer, "msc.parquet",
        )
    finally:
        # Always run the GPU cleanup helper, even when evaluation fails or is
        # interrupted part-way (e.g. a KeyboardInterrupt during a long run).
        clear_gpu_memory()

In [None]:
# Render the documentation for `evaluate_whisper_model_msc`.
show_doc(evaluate_whisper_model_msc)

---

[source](https://github.com/kurianbenoy/malayalam_asr_benchmarking/blob/main/malayalam_asr_benchmarking/msc.py#L37){target="_blank" style="float:right; font-size:smaller"}

### evaluate_whisper_model_msc

>      evaluate_whisper_model_msc (model_name:str, werlist:List[float],
>                                  cerlist:List[float], modelsizelist:List[str],
>                                  timelist:List[float], bs:int=16)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| model_name | str |  | The model name |
| werlist | List |  | WER List |
| cerlist | List |  | CER list |
| modelsizelist | List |  | model size list |
| timelist | List |  | time(s) list |
| bs | int | 16 | batch size |
| **Returns** | **None** |  |  |

### Testing with a sample model

In [None]:
#|eval: false
# Accumulators that the evaluation functions append their results to,
# one entry per evaluated model.
wer_list = []
cer_list = []
model_size_list = []
time_list = []

In [None]:
#|eval: false
# Evaluate `openai/whisper-tiny` on MSC with the default batch size (16).
evaluate_whisper_model_msc("openai/whisper-tiny", wer_list, cer_list, model_size_list, time_list)

Downloading config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
#|eval: false
# Evaluate `anuragshas/whisper-large-v2-ml` with a reduced batch size of 4
# (presumably so the larger model fits in GPU memory; confirm).
evaluate_whisper_model_msc("anuragshas/whisper-large-v2-ml", wer_list, cer_list, model_size_list, time_list, bs=4)

### Evaluating Faster-whisper based models

In [None]:
#| export
def evaluate_faster_whisper_model_msc(
        model_name: str, # The model name
        werlist: List[float], # WER List
        cerlist: List[float],# CER list
        modelsizelist: List[str], # model size list
        timelist: List[float], # time(s) list
        bs:int =16, # batch size. Default value is 16.
        compute_type:str="float16", # The compute type supported by faster-Whisper
        beam_size=1, # beam size
)->None:
    """A utility function for calculating WER and CER on the Malayalam Speech Corpus (MSC) dataset for a given faster-whisper model name.
       You can pass in WER, CER, ModelSize and Time lists to accumulate results cumulatively over different runs.

       Note: `bs` is accepted for signature parity with `evaluate_whisper_model_msc`
       but is not used here, since clips are transcribed one at a time. `modelsizelist`
       is left untouched: no model size is computed and `None` is stored as the size.
    """
    try:
        dataset = load_malayalam_speech_corpus_dataset()
        model = WhisperModel(model_name, device="cuda", compute_type=compute_type)

        predictions = []
        references = []
        predictions_raw = []
        references_raw = []

        start = time.time()
        for x in tqdm(dataset):
            segments, _info = model.transcribe(x["audio"]["array"], beam_size=beam_size)
            # `segments` is a lazy, one-shot generator: join it exactly once and
            # reuse the text. Iterating it a second time yields nothing, which
            # would make every normalized prediction an empty string.
            transcription = " ".join(segment.text for segment in segments)
            predictions_raw.append(transcription)
            predictions.append(normalizer(transcription))
            references_raw.append(x["transcript"])
            references.append(normalizer(x["transcript"]))

        elapsed = time.time() - start
        print(f"Total time taken: {elapsed}")
        timelist.append(elapsed)

        rwer = round(100 * wer(references, predictions), 2)
        werlist.append(rwer)
        print(f"The WER of model: {rwer}")

        rcer = round(100 * cer(references, predictions), 2)
        cerlist.append(rcer)
        print(f"The CER of model: {rcer}")

        store_results_as_dataset(
            predictions, predictions_raw, references, references_raw,
            model_name, elapsed, None, rwer, rcer, "msc.parquet",
        )
    finally:
        # Always run the GPU cleanup helper, even when evaluation fails or is
        # interrupted part-way.
        clear_gpu_memory()

In [None]:
# Render the documentation for `evaluate_faster_whisper_model_msc`.
show_doc(evaluate_faster_whisper_model_msc)

---

[source](https://github.com/kurianbenoy/malayalam_asr_benchmarking/blob/main/malayalam_asr_benchmarking/msc.py#L97){target="_blank" style="float:right; font-size:smaller"}

### evaluate_faster_whisper_model_msc

>      evaluate_faster_whisper_model_msc (model_name:str, werlist:List[float],
>                                         cerlist:List[float],
>                                         modelsizelist:List[str],
>                                         timelist:List[float], bs:int=16,
>                                         compute_type:str='float16',
>                                         beam_size=1)

A utility function for calculating WER and CER on the Malayalam Speech Corpus (MSC) dataset for a given faster-whisper model name.
You can pass in WER, CER, ModelSize and Time lists to accumulate results cumulatively over different runs.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| model_name | str |  | The model name |
| werlist | List |  | WER List |
| cerlist | List |  | CER list |
| modelsizelist | List |  | model size list |
| timelist | List |  | time(s) list |
| bs | int | 16 | batch size. Default value is 16. |
| compute_type | str | float16 | The compute type supported by faster-Whisper |
| beam_size | int | 1 | beam size |
| **Returns** | **None** |  |  |

### Testing with a sample faster-whisper model

In [None]:
#|eval: false
# Start from fresh accumulators so this run's results are not mixed with earlier ones.
wer_list = []
cer_list = []
model_size_list = []
time_list = []
# Evaluate a faster-whisper model on MSC with the default compute type and beam size.
evaluate_faster_whisper_model_msc("kurianbenoy/vegam-whisper-medium-ml-fp16", wer_list, cer_list, model_size_list, time_list)
# Display the accumulated results as the cell output.
wer_list, cer_list, model_size_list, time_list