## **Evaluate Fine-Tuned Noise-Augmented Whisper Model**

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
!pip uninstall -y torch torchvision torchaudio triton nvidia-cublas-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nccl-cu12

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: triton 3.2.0
Uninstalling triton-3.2.0:
  Successfully uninstalled triton-3.2.0
Found existing installation: nvidia-cublas-cu12 12.5.3.2
Uninstalling nvidia-cublas-cu12-12.5.3.2:
  Successfully uninstalled nvidia-cublas-cu12-12.5.3.2
Found existing installation: nvidia-cuda-runtime-cu12 12.5.82
Uninstalling nvidia-cuda-runtime-cu12-12.5.82:
  Successfully uninstalled nvidia-cuda-runtime-cu12-12.5.82
Found existing installation: nvidia-cudnn-cu12 9.3.0.75
Uninstalling nvidia-cudnn-cu12-9.3.0.75:
  Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75
Found exi

In [3]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu125

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu125
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6

In [4]:
import torch
import triton
import torch.backends.cudnn as cudnn

print("Torch Version:", torch.__version__)
print("Torch CUDA Version:", torch.version.cuda)
print("CUDA Available:", torch.cuda.is_available())
print("Triton Version:", triton.__version__)
print("cuDNN Version:", cudnn.version())

Torch Version: 2.6.0+cu124
Torch CUDA Version: 12.4
CUDA Available: True
Triton Version: 3.2.0
cuDNN Version: 90100


### **GPU Setup**

In [5]:
!pip install datasets evaluate jiwer pydub

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downl

In [6]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from google.colab import drive
import time
import warnings
import numpy as np
from datasets import load_dataset
from itertools import islice
from torch.utils.data import IterableDataset
import json
import os
import wave
import threading
import textwrap
from IPython.display import clear_output
import evaluate
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
import collections
import pickle
import random
from pydub import AudioSegment
from IPython.display import Audio

In [7]:
import os
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
num_test_examples = 100

In [9]:
# Tell the program to use the GPU allocated to us by setting the env variable used by CUDA
# Use the first GPU on your machine
# NOTE(review): torch was already imported and torch.cuda.is_available() already called in
# earlier cells. CUDA_VISIBLE_DEVICES is normally only honoured when set before CUDA is first
# initialised, so this assignment may have no effect at this point — TODO confirm / move earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

**User Action Required**
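- Select whisper version (`whisper_ver` in the next cell)
- Select checkpoint number (`checkpoint_num` in the next cell)
- Select number of test examples (`num_test_examples`, set in the cell above)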

In [10]:
# Google Drive Env Setup
# whisper_ver and checkpoint_num together select which fine-tuned checkpoint folder is loaded.
whisper_ver = 'whisper-base'
checkpoint_num = '2600'
# The checkpoint lives on Google Drive, so Drive must be mounted before the path below is read.
drive.mount('/content/drive')
checkpoint_path = f'/content/drive/My Drive/{whisper_ver}-noiseaugmented-minieval-checkpoints/checkpoint-{checkpoint_num}'

# Model setup code for fine-tuned whisper
# Note: this re-assigns `device`, replacing the value set in an earlier cell ("cuda" -> "cuda:0").
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Model weights come from the fine-tuned checkpoint directory.
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_path).to(device)
model.config.use_cache = True
# The processor (feature extractor + tokenizer) is loaded from the base "openai/<whisper_ver>"
# repo rather than from the checkpoint directory.
processor = WhisperProcessor.from_pretrained(f"openai/{whisper_ver}", language="en", task="transcribe")

Mounted at /content/drive


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

### **Load Dataset**

In [11]:
# Test split of the noise-augmented dataset, loaded in streaming mode (the result is an
# IterableDataset, so examples are produced lazily rather than indexed).
# trust_remote_code=True lets `datasets` execute the loading script shipped in the dataset repo;
# only keep this enabled for repositories you trust.
dataset_repo_test = "johnlohjy/imda_nsc_p3_test_noiseaugmented_mini"
dataset_test = load_dataset(dataset_repo_test, split='test', streaming=True, trust_remote_code=True)

imda_nsc_p3_test_noiseaugmented_mini.py:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

In [12]:
dataset_test

IterableDataset({
    features: ['path', 'audio', 'sentence'],
    num_shards: 1
})

### **Initialise Fine-tuned Whisper Model**

In [13]:
!pip install -q bitsandbytes accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25h

### **Prepare Dataset for Whisper**

In [14]:
def prepare_dataset(batch):
    """Attach Whisper inputs and target label ids to one dataset example.

    Parameters
    ----------
    batch
        A single example. The code reads ``batch["audio"]["array"]``,
        ``batch["audio"]["sampling_rate"]`` and ``batch["sentence"]``.

    Returns
    -------
    The same ``batch`` object, with ``"input_features"`` and ``"labels"`` added.
    """
    clip = batch["audio"]

    # Feature extraction: turn the 1D audio array into log-Mel input features.
    # The extractor returns a batch of features; keep the first (only) entry.
    extracted = processor.feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Tokenization: encode the reference transcription into label ids.
    encoded = processor.tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [15]:
dataset_test.column_names

['path', 'audio', 'sentence']

In [16]:
dataset_test_processed = dataset_test.map(prepare_dataset)

### **Define Generation**

In [17]:
def transcribe(model, example):
    """Generate a transcription for one prepared example.

    Parameters
    ----------
    model
        Model exposing ``generate`` and a ``device`` attribute (as Hugging Face
        ``PreTrainedModel`` instances, such as the Whisper model loaded in this notebook, do).
    example
        Mapping with an ``"input_features"`` entry for a single clip, as added by
        ``prepare_dataset``.

    Returns
    -------
    str
        The decoded prediction with special tokens removed.
    """
    # Add a leading batch dimension and place the features on the device of the model that was
    # actually passed in, instead of relying on the notebook-global `device` staying in sync.
    input_features = torch.tensor(example["input_features"]).unsqueeze(0).to(model.device)
    # Generate token IDs
    generated_ids = model.generate(input_features)
    # Decode token IDs to text
    predicted_transcription = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return predicted_transcription

### **Define Evaluation Metrics**

In [18]:
!pip install evaluate



In [19]:
!pip install jiwer



In [20]:
import evaluate

In [21]:
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [22]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
normalizer = BasicTextNormalizer()

def calculate_wer(model, dataset):
    """Transcribe every sample in ``dataset`` and score the predictions with WER.

    Parameters
    ----------
    model
        Passed through unchanged to ``transcribe`` for each sample.
    dataset
        Iterable of samples. Each sample must provide ``"sentence"``, ``"path"`` and
        ``"audio"`` (with ``"array"`` and ``"sampling_rate"``), plus whatever
        ``transcribe`` reads from it.

    Returns
    -------
    dict
        ``"wer"``: 100 times the value returned by ``metric.compute`` over the normalized
        predictions and references.
        ``"wav_fnames"``: per-sample file name taken from ``"path"`` (last ``/`` component,
        extension removed), kept so SNR values can be looked up later.
        ``"test_audio_len"``: running sum of ``len(array) / sampling_rate`` over all samples.
    """
    hyp_texts, ref_texts = [], []
    clip_names = []
    audio_total = 0

    for sample in dataset:
        hypothesis = transcribe(model, sample)

        # Both sides go through the same text normalizer before scoring.
        hyp_texts.append(normalizer(hypothesis))
        ref_texts.append(normalizer(sample["sentence"]))

        # File stem: last path component without its extension.
        base_name = sample["path"].rsplit('/', 1)[-1]
        stem, _extension = os.path.splitext(base_name)
        clip_names.append(stem)

        # Duration of this clip = number of samples / sampling rate.
        clip = sample["audio"]
        audio_total += len(clip["array"]) / clip["sampling_rate"]

    # metric.compute returns a fraction; report it as a percentage.
    wer_fraction = metric.compute(predictions=hyp_texts, references=ref_texts)

    return {"wer": 100 * wer_fraction, "wav_fnames": clip_names, "test_audio_len": audio_total}

### **Slice Test Set**

In [23]:
# NOTE(review): iter(...) yields a one-shot iterator over the processed stream. Anything that
# iterates this object more than once continues from where the previous pass stopped instead of
# restarting at the first example, so re-running a consumer cell changes which examples it sees.
dataset_test_processed_iter = iter(dataset_test_processed)

In [24]:
from itertools import islice
from torch.utils.data import IterableDataset

class SlicedDataset(IterableDataset):
    """Iterable view that yields at most ``num_examples`` items from ``dataset``.

    Each call to ``__iter__`` asks ``dataset`` for an iterator and truncates it. If
    ``dataset`` is re-iterable, every pass starts from its first item. If ``dataset`` is
    itself a one-shot iterator, successive passes continue from where the previous one
    stopped.
    """

    def __init__(self, dataset, num_examples):
        """Remember the wrapped ``dataset`` and the maximum number of items to yield."""
        self.num_examples = num_examples
        self.dataset = dataset

    def __len__(self):
        """Return the configured limit (not a count of items actually available)."""
        return self.num_examples

    def __iter__(self):
        """Return an iterator over the first ``num_examples`` items of the wrapped dataset."""
        source = iter(self.dataset)
        limit = self.num_examples
        return islice(source, limit)

# Wrap the re-iterable streaming dataset rather than a one-shot iterator over it. SlicedDataset
# calls iter() on whatever it wraps for every pass, so this way each evaluation run starts from
# the first test example and always scores the same `num_test_examples` clips, even when the
# evaluation cell is re-run or `num_test_examples` is changed.
dataset_test_processed_iter_reduced = SlicedDataset(dataset_test_processed, num_examples=num_test_examples)

### **Evaluate the fine-tuned model**

In [25]:
res = calculate_wer(model, dataset_test_processed_iter_reduced)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [26]:
res

{'wer': 15.275625415098517,
 'wav_fnames': ['3148-2_1',
  '3148-2_2',
  '3148-2_3',
  '3148-2_4',
  '3148-2_5',
  '3148-2_6',
  '3148-2_7',
  '3148-2_8',
  '3148-2_9',
  '3148-2_10',
  '3148-2_11',
  '3148-2_12',
  '3147-1_1',
  '3147-1_2',
  '3147-1_3',
  '3147-1_4',
  '3147-1_5',
  '3147-1_6',
  '3147-1_7',
  '3147-1_8',
  '3147-1_9',
  '3147-1_10',
  '3147-1_11',
  '3147-1_12',
  '3146-2_1',
  '3146-2_2',
  '3146-2_3',
  '3146-2_4',
  '3146-2_5',
  '3146-2_6',
  '3146-2_7',
  '3146-2_8',
  '3146-2_9',
  '3146-2_10',
  '3146-2_11',
  '3146-2_12',
  '3145-1_1',
  '3145-1_2',
  '3145-1_3',
  '3145-1_4',
  '3145-1_5',
  '3145-1_6',
  '3145-1_7',
  '3145-1_8',
  '3145-1_9',
  '3145-1_10',
  '3145-1_11',
  '3145-1_12',
  '3144-2_1',
  '3144-2_2',
  '3144-2_3',
  '3144-2_4',
  '3144-2_5',
  '3144-2_6',
  '3144-2_7',
  '3144-2_8',
  '3144-2_9',
  '3144-2_10',
  '3144-2_11',
  '3144-2_12',
  '3143-1_1',
  '3143-1_2',
  '3143-1_3',
  '3143-1_4',
  '3143-1_5',
  '3143-1_6',
  '3143-1_7',
  '31

In [None]:
res['wer']

16.319746597514705

In [None]:
test_audio_len_mins = res['test_audio_len']/60

In [None]:
test_audio_len_mins

402.2383260416658

In [None]:
test_audio_len_hours = res['test_audio_len']/3600

In [None]:
test_audio_len_hours

6.70397210069443

**User Action Required**
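- Upload `waves_snr.txt`, the file of test wav SNR values, to the working directory (one space-separated `<wav file name> <SNR value>` pair per line)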

In [None]:
# Parse "waves_snr.txt" into a dict mapping wav file name -> SNR label.
# Values are kept as the strings found in the file (e.g. '5', '10', '15', 'none').
snr_wavfile_values = {}

with open("waves_snr.txt", "r") as file:
    for line in file:
        # Split on any run of whitespace so repeated spaces or tabs do not produce empty fields.
        fields = line.split()
        if not fields:
            # Blank line (for example a trailing newline at the end of the file): nothing to record.
            continue
        # A line that does not hold exactly two fields still raises ValueError, as before.
        key, value = fields
        snr_wavfile_values[key] = value

In [None]:
snr_wavfile_values

{'3000-1_11': '5',
 '3000-1_17': '15',
 '3000-1_26': '10',
 '3000-1_28': '15',
 '3000-1_29': '15',
 '3000-1_34': 'none',
 '3000-1_35': '15',
 '3000-1_36': '10',
 '3000-1_38': '15',
 '3000-1_39': '10',
 '3000-1_43': '15',
 '3000-1_52': 'none',
 '3000-1_55': 'none',
 '3000-1_56': '5',
 '3000-1_7': '15',
 '3000-1_74': '5',
 '3000-1_77': 'none',
 '3000-1_84': '15',
 '3000-2_21': '5',
 '3000-2_23': 'none',
 '3000-2_24': 'none',
 '3000-2_27': '15',
 '3000-2_28': 'none',
 '3000-2_30': '10',
 '3000-2_38': '10',
 '3000-2_45': 'none',
 '3000-2_51': '5',
 '3000-2_53': '10',
 '3000-2_55': '5',
 '3000-2_7': '15',
 '3000-2_71': 'none',
 '3000-2_8': '10',
 '3000-2_84': '15',
 '3000-2_85': 'none',
 '3001-1_1': 'none',
 '3001-1_101': '10',
 '3001-1_105': '5',
 '3001-1_11': '15',
 '3001-1_110': '15',
 '3001-1_113': '10',
 '3001-1_115': '5',
 '3001-1_118': '5',
 '3001-1_119': '5',
 '3001-1_12': 'none',
 '3001-1_126': 'none',
 '3001-1_127': 'none',
 '3001-1_13': '5',
 '3001-1_130': '5',
 '3001-1_131': '15

In [None]:
import collections
snr_wavfile_counts = collections.defaultdict(int)

In [None]:
# Tally how many evaluated files fall under each SNR label.
# snr_wavfile_values is a plain dict, so a file name missing from it raises KeyError here.
for fname in res['wav_fnames']:
    snr_wavfile_counts[snr_wavfile_values[fname]] += 1

In [None]:
snr_wavfile_counts

defaultdict(int, {'5': 259, '15': 235, '10': 257, 'none': 249})

In [None]:
len(res['wav_fnames'])

1000

In [None]:
len(snr_wavfile_values)

37338