<a href="https://colab.research.google.com/github/harrykeeran12/cs4040-miniproject/blob/main/CS4040_Research_Methods_Miniproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A zero-shot evaluation of small speech-to-text systems

The models evaluated were chosen because they have similar parameter counts:
- the small Fairseq model,
- the tiny Whisper model,
- the medium Conformer-CTC model.

A small subset of the dataset is loaded first.
The work for each model is then split into:
- Loading the model.
- Converting the audio into the input format the model expects, using the model's processor.
- Running the model on the dataset to check that it transcribes the audio correctly.

Finally, the transcriptions are evaluated using the word error rate (WER) and character error rate (CER), and the results are plotted.

**This will require the T4 GPU to be enabled on Google Colab.**


In [None]:
# !huggingface-cli login
!pip3 install torch torchvision torchaudio datasets transformers google.colab  datasets[audio] evaluate
!pip3 install wget matplotlib>=3.3.2 text-unidecode

!apt-get update && apt-get install -y libsndfile1 ffmpeg
!pip install Cython packaging
!pip install "nemo_toolkit[all]"
!pip install nemo_toolkit['asr']

## Install NeMo
# BRANCH = 'main'
# !python -m pip install --upgrade --user git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[all]

## Imports

In [None]:
from torch.utils.data import DataLoader
from pprint import pprint
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration, WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset, IterableDataset, Audio
import pandas as pd
import numpy as np

from evaluate import load
from IPython.display import display
import re
import matplotlib.pyplot as plt

import nemo.collections.asr as nemo_asr

## Load Dataset (Mozilla Common Voice v17)

In [None]:
# Sampling configuration shared by the cells below.
RANDOMSEED = 42
SAMPLE_NUMBER = 15
SAMPLING_RATE = 16000

# Stream the validated English split rather than downloading it in full.
cv_17: IterableDataset = load_dataset(
    "mozilla-foundation/common_voice_17_0",
    "en",
    split="validated",
    streaming=True,
    trust_remote_code=True,
)

# Request the audio column at SAMPLING_RATE.
cv_17 = cv_17.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))

# Pull the first SAMPLE_NUMBER examples from a seeded shuffle of the stream.
iterdataset = iter(cv_17.shuffle(seed=RANDOMSEED, buffer_size=SAMPLE_NUMBER))
random10List: list[dict] = [next(iterdataset) for _ in range(SAMPLE_NUMBER)]

## Fairseq Small

In [None]:
# Model and processor are loaded from the same checkpoint.
fairseq_checkpoint = "facebook/s2t-small-librispeech-asr"
modelOne = Speech2TextForConditionalGeneration.from_pretrained(fairseq_checkpoint)
processorOne = Speech2TextProcessor.from_pretrained(fairseq_checkpoint)

# Maps each reference sentence to the Fairseq transcription of its clip.
fairseqOutput: dict[str, str] = {}
for sample in random10List:
  reference = sample["sentence"]

  # Turn the raw waveform into the feature tensor the model consumes.
  encoded = processorOne(
      sample["audio"]["array"],
      sampling_rate=SAMPLING_RATE,
      return_tensors="pt",
      padding=True,
  )
  token_ids = modelOne.generate(input_features=encoded.input_features)

  # One clip is processed at a time, so only the first decoded string is needed.
  decoded = processorOne.batch_decode(token_ids, skip_special_tokens=True)
  fairseqOutput[reference] = decoded[0]
  print(f'{reference} : {decoded[0]}')

## Whisper Tiny

In [None]:
# Load the Whisper processor and model.
processorTwo = WhisperProcessor.from_pretrained("openai/whisper-tiny")
modelTwo = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
modelTwo.config.forced_decoder_ids = None

# Whisper requires the audio to be split into 30-second chunks.
# The chunk size is 30 (seconds) times the sampling rate (samples per second).
chunk_size = 30 * SAMPLING_RATE

# Maps each reference sentence to the Whisper transcription of its clip.
whisperOutput: dict[str, str] = {}
for audio in random10List:
  audioArray = audio["audio"]["array"]

  # Split the waveform into consecutive windows of at most chunk_size samples.
  chunks = [audioArray[i:i + chunk_size] for i in range(0, len(audioArray), chunk_size)]
  # Zero-pad every chunk up to chunk_size. Chunks that are already full-length get a
  # pad width of 0 and pass through unchanged; they must NOT be filtered out, otherwise
  # clips longer than 30 seconds lose everything except their final partial chunk.
  padded_chunks = [
      np.pad(chunk, (0, chunk_size - len(chunk)), 'constant', constant_values=0)
      for chunk in chunks
  ]
  # NOTE: Chunking may result in word errors at the boundary of the word.

  transcriptions = []
  for chunk in padded_chunks:
    input_features = processorTwo(
        chunk,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",
        padding=True,
    ).input_features
    predicted_ids = modelTwo.generate(input_features)
    transcription = processorTwo.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    transcriptions.append(transcription)

  # Joins each part of the chunked transcription together.
  full_transcription = " ".join(transcriptions)
  whisperOutput[audio["sentence"]] = full_transcription
  print(f'{audio["sentence"]} : {full_transcription}')


## Medium Conformer Model

In [None]:
# Pretrained medium Conformer-CTC checkpoint, loaded through NeMo.
modelThree = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_conformer_ctc_medium")

# Maps each reference sentence to the Conformer-CTC transcription of its clip.
CTCOutput: dict[str, str] = {}
for sample in random10List:
  reference = sample["sentence"]
  # transcribe() returns one result per input; a single clip is passed, so take the first.
  hypotheses = modelThree.transcribe(audio=sample["audio"]["array"])
  CTCOutput[reference] = hypotheses[0]
  print(f'{reference} :  {hypotheses[0]}')

## Evaluation:


In [None]:
def _normalise_text(text: str) -> str:
  """Strip every character that is neither a word character nor whitespace, then lower-case.

  The same normalisation is applied to the ground-truth sentence and to every model
  transcription, so that punctuation and casing differences are not counted as errors
  for one side only.
  """
  return re.sub(r'[^\w\s]', '', text).lower()


# Normalised ground truth and normalised transcriptions, one entry per sample.
sentenceList = []
fairseqList = []
whisperList = []
CTCList = []

# Per-sample word error rates.
fairseqWERList = []
whisperWERList = []
CTCWERList = []

# Per-sample character error rates.
fairseqCERList = []
whisperCERList = []
CTCCERList = []

# Loads the Word Error Rate Metric + the Character Error Rate Metric from HuggingFace.
wer = load("wer", module_type="metric")
cer = load("cer", module_type="metric")

for audio in random10List:
  # Gets the corresponding output for each sentence.
  sentence = audio["sentence"]

  filteredSentence = _normalise_text(sentence)

  fairseqTranscription = _normalise_text(fairseqOutput[sentence])
  whisperTranscription = _normalise_text(whisperOutput[sentence])
  CTCTranscription = _normalise_text(CTCOutput[sentence])

  # Computes the WER for each model.
  # The ground-truth sentence is the reference and the model output is the prediction:
  # both metrics are normalised by the reference, so swapping them changes the score.
  fairseqWER = wer.compute(predictions=[fairseqTranscription], references=[filteredSentence])
  whisperWER = wer.compute(predictions=[whisperTranscription], references=[filteredSentence])
  CTCWER = wer.compute(predictions=[CTCTranscription], references=[filteredSentence])

  # Computes the CER for each model.
  fairseqCER = cer.compute(predictions=[fairseqTranscription], references=[filteredSentence])
  whisperCER = cer.compute(predictions=[whisperTranscription], references=[filteredSentence])
  CTCCER = cer.compute(predictions=[CTCTranscription], references=[filteredSentence])

  # Appends each sentence + transcribed sentence to a list to be added to the dataframes later.
  sentenceList.append(filteredSentence)
  fairseqList.append(fairseqTranscription)
  whisperList.append(whisperTranscription)
  CTCList.append(CTCTranscription)

  # Appends the WER to a list + rounds the number.
  fairseqWERList.append(round(fairseqWER, 2))
  whisperWERList.append(round(whisperWER, 2))
  CTCWERList.append(round(CTCWER, 2))

  # Appends the CER to a list + rounds the number.
  fairseqCERList.append(round(fairseqCER, 2))
  whisperCERList.append(round(whisperCER, 2))
  CTCCERList.append(round(CTCCER, 2))


# One row per sample; the default integer index is used as the sentence number when plotting.
transcriptionDF = pd.DataFrame({"Sentence(after preprocessing)": sentenceList, "Fairseq Transcription": fairseqList, "Whisper Transcription": whisperList, "CTC-Conformer Transcription" : CTCList})

WERDF = pd.DataFrame({"Sentence(after preprocessing)": sentenceList, "Fairseq WER": fairseqWERList,"Whisper WER" : whisperWERList, "CTC-Conformer WER": CTCWERList})

CERDF = pd.DataFrame({"Sentence(after preprocessing)": sentenceList, "Fairseq CER": fairseqCERList,"Whisper CER" : whisperCERList, "CTC-Conformer CER": CTCCERList})

display(WERDF)
display(CERDF)

In [None]:
# Grouped bar chart of WER: one cluster per sentence, one bar per model.
bar_width = 0.25
sentence_positions = WERDF.index

fig, ax = plt.subplots(figsize=(10, 6))

# Each model's bars are shifted right by one bar width relative to the previous model.
wer_series = [
    ("Fairseq WER", "Fairseq"),
    ("Whisper WER", "Whisper"),
    ("CTC-Conformer WER", "CTC-Conformer"),
]
for slot, (column, legend_label) in enumerate(wer_series):
  ax.bar(sentence_positions + slot * bar_width, WERDF[column], label=legend_label, width=bar_width)

ax.set_xlabel("Sentence Number")
ax.set_ylabel("WER")
ax.set_title("WER Rates for Each Sentence")

# Centre each tick under the middle bar of its cluster and label it with the sentence index.
ax.set_xticks(sentence_positions + bar_width)
ax.set_xticklabels(sentence_positions)

ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart of CER: one cluster per sentence, one bar per model.
bar_width = 0.25
sentence_positions = CERDF.index

fig, ax = plt.subplots(figsize=(10, 6))

# Each model's bars are shifted right by one bar width relative to the previous model.
cer_series = [
    ("Fairseq CER", "Fairseq"),
    ("Whisper CER", "Whisper"),
    ("CTC-Conformer CER", "CTC-Conformer"),
]
for slot, (column, legend_label) in enumerate(cer_series):
  ax.bar(sentence_positions + slot * bar_width, CERDF[column], label=legend_label, width=bar_width)

ax.set_xlabel("Sentence Number")
ax.set_ylabel("CER")
ax.set_title("CER Rates for Each Sentence")

# Centre each tick under the middle bar of its cluster and label it with the sentence index.
ax.set_xticks(sentence_positions + bar_width)
ax.set_xticklabels(sentence_positions)

ax.legend()
fig.tight_layout()
plt.show()