## **Evaluate Fine-Tuned Whisper**

### **GPU Setup**

In [1]:
import os

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Nov 18 17:28:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0              50W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# Tell the progam to use the GPU allocated to us by setting the env variable used by CUDA
# Use the first GPU on your machine
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### **GoogleDrive Environment Setup**

- Get stored model checkpoints

In [4]:
from google.colab import drive
google_drive_folder = 'whisper-small-checkpoints'
google_drive_path = f'/content/drive/My Drive/{google_drive_folder}'
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Load Dataset**

In [5]:
!pip install datasets



In [6]:
from datasets import load_dataset
from IPython.display import Audio

In [7]:
dataset_repo_test = "johnlohjy/imda_nsc_p3_same_closemic_test"
dataset_test = load_dataset(dataset_repo_test, split='test', streaming=True, trust_remote_code=True)

### **Initialise Fine-tuned Whisper Model**

In [8]:
!pip install -q bitsandbytes accelerate

In [9]:
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor

In [10]:
checkpoint_path = '/content/drive/My Drive/whisper-small-checkpoints/checkpoint-400'

In [11]:
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_path)
model.config.use_cache = True
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

### **Prepare Dataset for Whisper**

In [12]:
def prepare_dataset(batch):
    # load audio data
    audio = batch["audio"]

    # Perform feature extraction: Compute log-Mel input features from input audio array
    # Use feature extractor to compute log-Mel spectrogram input features from 1D audio array
    # Pre-process raw audio-inputs
    batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    # Perform tokenization: Encode target text to label ids
    # Encode transcriptions to label ids through use of tokenizer
    # Post-process model outputs to text format
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

In [13]:
dataset_test_processed = dataset_test.map(prepare_dataset, remove_columns=dataset_test.column_names)

### **Define Data Collator**

In [14]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

In [15]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Data collator takes pre-processed data and prepares PyTorch tensors ready for the model
        # Treat input_features and labels independently.
        # input_features are handled by the feature extractor
        # labels are handled by the tokenizer

        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        # By replacing padding tokens with -100, they are not taken into account
        # when computing the loss
        # Error: The attention mask is not set and cannot be inferred from input because pad token is same as eos token.
        # eos_token_id and pad_token_id are actually both 50257
        # but we replace it with -100 in this line of code
        # https://discuss.huggingface.co/t/finetuning-whisper-attention-mask-not-set-and-canot-be-inferred/97456
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        # beginning of sentence token
        # Cut the BOS token from the start of the label seq as it is appended later during training
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [16]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### **Define Evaluation Metrics**

In [17]:
!pip install evaluate



In [18]:
!pip install jiwer



In [19]:
import evaluate

In [20]:
metric = evaluate.load("wer")

In [21]:
import re
# https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
def normalize_wer(token):
    token = token.lower()
    token = re.sub(r'[^\w\s]', '', token)
    return token.strip()

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id in label_ids
    # Undoing the step in the data collator to ignore padded tokens correctly to calculate loss
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    # Decode the predicted and label ids to strings
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)


    #print('Pred str before')
    #print(pred_str)
    #print('Pred str after')
    pred_str = [normalize_wer(token) for token in pred_str]
    #print(pred_str)
    #print("")
    #print('Label str before')
    #print(label_str)
    #print('Label str after')
    label_str = [normalize_wer(token) for token in label_str]
    #print(label_str)
    #print("")
    #print("")

    # Compute WER between predictions and reference labels
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

### **Evaluate the fine-tuned model**

In [22]:
from itertools import islice
from torch.utils.data import IterableDataset

class SlicedDataset(IterableDataset):
    def __init__(self, dataset, num_examples):
        self.dataset = dataset
        self.num_examples = num_examples

    def __iter__(self):
        return islice(iter(self.dataset), self.num_examples)

    def __len__(self):
        return self.num_examples

dataset_test_processed_reduced = SlicedDataset(dataset_test_processed, num_examples=500)

In [23]:
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl

training_args = Seq2SeqTrainingArguments(
    output_dir = './output',
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=True
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    eval_dataset=dataset_test_processed_reduced,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [24]:
results = trainer.evaluate()

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [25]:
results

{'eval_loss': 1.0822489261627197,
 'eval_model_preparation_time': 0.0065,
 'eval_wer': 11.689510688161352,
 'eval_runtime': 192.6784,
 'eval_samples_per_second': 2.595,
 'eval_steps_per_second': 0.327}