# Evaluation on the Malayalam Speech Corpus (MSC) dataset

In [None]:
#| default_exp msc

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import time
from pathlib import Path
from typing import List

import pandas as pd
from datasets import load_dataset, Audio
from jiwer import wer, cer
from transformers import pipeline
from tqdm.notebook import tqdm
from whisper_normalizer.basic import BasicTextNormalizer

from malayalam_asr_benchmarking.utils import is_target_text_in_range, get_text, normalise, data, get_model_size, clear_gpu_memory

## Loading the dataset and evaluating a model

In [None]:
#| export
def load_malayalam_speech_corpus_dataset():
    """Load the `train` split of `thennal/msc` prepared for evaluation.

    The `audio` column is cast to `Audio(sampling_rate=16000)`, `normalise`
    is mapped over every example, and only the rows whose `norm_text` column
    passes `is_target_text_in_range` are kept.
    """
    msc_train = load_dataset("thennal/msc", split="train")
    # Cast the audio column so it is decoded at a 16 kHz sampling rate.
    resampled = msc_train.cast_column("audio", Audio(sampling_rate=16000))
    normalised = resampled.map(normalise)
    return normalised.filter(is_target_text_in_range, input_columns=["norm_text"])

In [None]:
#| export
# Shared text normaliser applied to both predictions and references before scoring.
normalizer = BasicTextNormalizer()
def evaluate_whisper_model_msc(
        model_name: str, # The model name
        werlist: List[float], # WER List
        cerlist: List[float], # CER list
        modelsizelist: List[str], # model size list
        timelist: List[float], # time(s) list
        bs:int =16, # batch size
        output_dir: str = "/home/c", # directory the results parquet file is written to
)->None:
    """Evaluate an ASR model on the MSC dataset and record its metrics.

    Builds an `automatic-speech-recognition` pipeline for `model_name` on
    device 0, transcribes the dataset returned by
    `load_malayalam_speech_corpus_dataset` in batches of `bs`, and computes
    WER/CER (as percentages rounded to two decimals) on normalised text.

    One entry is appended in place to each of `werlist`, `cerlist`,
    `modelsizelist` and `timelist`. A per-utterance results table is written
    to `<output_dir>/<model name parts joined by "_">_msc.parquet`.

    `output_dir` is created (including parents) *before* inference starts, so
    an unusable output location fails immediately instead of after the whole
    dataset has been transcribed. The default keeps the previously hard-coded
    location. `clear_gpu_memory()` is called even when evaluation raises.
    """
    # Fail fast: make sure the results can be saved before doing any expensive work.
    results_dir = Path(output_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    whisper_asr = pipeline(
            "automatic-speech-recognition", model=model_name, device=0
        )
    try:
        dataset = load_malayalam_speech_corpus_dataset()

        predictions = []
        references = []

        start = time.time()
        print("process of calculating predictions")
        for out in tqdm(whisper_asr(data(dataset), batch_size=bs)):
            predictions.append(normalizer(out["text"]))
            references.append(normalizer(out["reference"][0]))

        print("completed getting predictions")
        end = time.time()
        print(f"Total time taken: {end - start}")
        timelist.append(end - start)

        # Per-utterance table: one row per sample with its own WER/CER.
        df = pd.DataFrame({"predictions": predictions, "ground_truth": references})
        df["model_name"] = model_name
        df["wer"] = df.apply(lambda row: wer(normalizer(row["ground_truth"]), normalizer(row["predictions"])), axis=1)
        df["cer"] = df.apply(lambda row: cer(normalizer(row["ground_truth"]), normalizer(row["predictions"])), axis=1)
        df["total_time"] = end-start

        # Corpus-level scores over all utterances, expressed as percentages.
        rwer = wer(references, predictions)
        rwer = round(100 * rwer, 2)
        df["total_wer"] = rwer
        werlist.append(rwer)
        print(f"The WER of model: {rwer}")

        rcer = cer(references, predictions)
        rcer = round(100 * rcer, 2)
        df["total_cer"] = rcer
        cerlist.append(rcer)
        print(f"The CER of model: {rcer}")

        # Compute the model size once and reuse it for the log, the list and the table.
        model_size = get_model_size(whisper_asr.model)
        print(f"The model size is: {model_size}")
        modelsizelist.append(model_size)
        df["model_size"] = model_size

        save_name = model_name.split("/")
        print(save_name)
        # Join every non-empty part so ids without a "/" (or with several) still
        # yield a valid file name; "org/model" maps to "org_model" as before.
        file_stem = "_".join(part for part in save_name if part)
        df.to_parquet(str(results_dir / f"{file_stem}_msc.parquet"))
    finally:
        # Always run the GPU cleanup, even if inference, scoring or saving failed.
        clear_gpu_memory()

## Testing with a sample model

In [None]:
#|eval: false
# Accumulators passed to `evaluate_whisper_model_msc`; every call appends
# one entry to each list (WER %, CER %, model size, elapsed seconds).
wer_list = []
cer_list = []
model_size_list = []
time_list = []

In [None]:
#|eval: false
# First evaluation run; uses the default batch size (bs=16).
evaluate_whisper_model_msc("parambharat/whisper-tiny-ml", wer_list, cer_list, model_size_list, time_list)

Found cached dataset parquet (/home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-30f1618974cdefce.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e4f860ca9b159c26.arrow


process of calculating predictions


0it [00:00, ?it/s]



completed getting predictions
Total time taken: 686.1272251605988
The WER of model: 43.96
The CER of model: 25.78
The model size is: 37.76M
['parambharat', 'whisper-tiny-ml']


OSError: Cannot save file into a non-existent directory: '/home/malayalam_msc_benchmarking'

In [None]:
#|eval: false
# Inspect the metrics collected so far: WER, CER, elapsed time (s), model size.
wer_list,cer_list, time_list, model_size_list

([43.96], [25.78], [686.1272251605988], ['37.76M'])

In [None]:
#|eval: false
# Same evaluation for the `openai/whisper-tiny` checkpoint (default batch size).
evaluate_whisper_model_msc("openai/whisper-tiny", wer_list, cer_list, model_size_list, time_list)

Found cached dataset parquet (/home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-30f1618974cdefce.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e4f860ca9b159c26.arrow


process of calculating predictions


0it [00:00, ?it/s]



completed getting predictions
Total time taken: 382.24998211860657
The WER of model: 139.63
The CER of model: 177.3
The model size is: 37.76M
['openai', 'whisper-tiny']


In [None]:
#|eval: false
# Same evaluation for the `openai/whisper-base` checkpoint (default batch size).
evaluate_whisper_model_msc("openai/whisper-base", wer_list, cer_list, model_size_list, time_list)

Found cached dataset parquet (/home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-30f1618974cdefce.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e4f860ca9b159c26.arrow


process of calculating predictions


0it [00:00, ?it/s]



completed getting predictions
Total time taken: 448.9504859447479
The WER of model: 155.97
The CER of model: 200.05
The model size is: 72.59M
['openai', 'whisper-base']


In [None]:
#|eval: false
# Same evaluation for the `openai/whisper-small` checkpoint (default batch size).
evaluate_whisper_model_msc("openai/whisper-small", wer_list, cer_list, model_size_list, time_list)

Found cached dataset parquet (/home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-30f1618974cdefce.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e4f860ca9b159c26.arrow


process of calculating predictions


0it [00:00, ?it/s]



completed getting predictions
Total time taken: 479.73656272888184
The WER of model: 111.57
The CER of model: 123.7
The model size is: 241.73M
['openai', 'whisper-small']


In [None]:
#|eval: false
# bs=4 overrides the default batch size of 16 -- presumably so the larger
# model fits in GPU memory (TODO confirm).
evaluate_whisper_model_msc("anuragshas/whisper-large-v2-ml", wer_list, cer_list, model_size_list, time_list, bs=4)

Found cached dataset parquet (/home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-30f1618974cdefce.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e4f860ca9b159c26.arrow


process of calculating predictions


0it [00:00, ?it/s]



completed getting predictions
Total time taken: 10467.876322746277
The WER of model: 23.57
The CER of model: 12.33
The model size is: 1.54B
['anuragshas', 'whisper-large-v2-ml']


In [None]:
#|eval: false
# bs=4 overrides the default batch size of 16 -- presumably so the larger
# model fits in GPU memory (TODO confirm).
evaluate_whisper_model_msc("DrishtiSharma/whisper-large-v2-malayalam", wer_list, cer_list, model_size_list, time_list, bs=4)

Found cached dataset parquet (/home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-30f1618974cdefce.arrow
Loading cached processed dataset at /home/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e4f860ca9b159c26.arrow


process of calculating predictions


0it [00:00, ?it/s]

