## **Compare Models**

### **GPU Setup**

In [1]:
import os
import torch

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Nov 19 13:53:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              42W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# Tell the progam to use the GPU allocated to us by setting the env variable used by CUDA
# Use the first GPU on your machine
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

**User Action Required**
- Select whisper version
- Select checkpoint number
- Select number of test examples

In [5]:
whisper_ver = 'whisper-base'

In [6]:
checkpoint_num = '2100'

In [7]:
num_test_examples = 600

### **GoogleDrive Environment Setup**

- Get stored model checkpoints

In [8]:
from google.colab import drive
google_drive_path = f'/content/drive/My Drive/{whisper_ver}-checkpoints'
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Load Dataset**

In [9]:
!pip install datasets



In [10]:
from datasets import load_dataset
from IPython.display import Audio

In [11]:
dataset_repo_test = "johnlohjy/imda_nsc_p3_same_closemic_train"
dataset_train = load_dataset(dataset_repo_test, split='train', streaming=True, trust_remote_code=True)

In [12]:
dataset_train_iter = iter(dataset_train)

In [13]:
train_example_1 = next(dataset_train_iter)

In [14]:
train_example_1

{'path': '../../dataset/train/waves/3000-1_11.wav',
 'audio': {'path': '../../dataset/train/waves/3000-1_11.wav',
  'array': array([-9.15527344e-05, -1.22070312e-04, -9.15527344e-05, ...,
          3.05175781e-04,  3.05175781e-04,  2.44140625e-04]),
  'sampling_rate': 16000},
 'sentence': "you know yeah she was from france so that's she shared with me about her culture lah so you know err she would show me around when she would when i would visit lah when i travel to france looking forward to that it's really not interesting generation gap what's the biggest similarity or difference between your generation and your parent's generation okay"}

In [15]:
dataset_repo_test = "johnlohjy/imda_nsc_p3_same_closemic_test"
dataset_test = load_dataset(dataset_repo_test, split='test', streaming=True, trust_remote_code=True)

### **Initialise Whisper Models**

In [16]:
!pip install -q bitsandbytes accelerate

In [17]:
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor

In [18]:
checkpoint_path = f'/content/drive/My Drive/{whisper_ver}-checkpoints/checkpoint-{checkpoint_num}'

In [19]:
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_path).to(device)
model.config.use_cache = True
processor = WhisperProcessor.from_pretrained(f"openai/{whisper_ver}", language="en", task="transcribe")

In [20]:
model_org = WhisperForConditionalGeneration.from_pretrained(f"openai/{whisper_ver}", device_map="auto").to(device)
model_org.config.use_cache = True

In [32]:
model_org_en = WhisperForConditionalGeneration.from_pretrained(f"openai/{whisper_ver}.en", device_map="auto").to(device)
model_org_en.config.use_cache = True
model_org_en.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="English", task="transcribe"
)

In [36]:
!pip install whisper



In [40]:
from transformers import pipeline
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device='cuda')

### **Prepare Dataset for Whisper**

In [22]:
def prepare_dataset(batch):
    # load audio data
    audio = batch["audio"]

    # Perform feature extraction: Compute log-Mel input features from input audio array
    # Use feature extractor to compute log-Mel spectrogram input features from 1D audio array
    # Pre-process raw audio-inputs
    batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    # Perform tokenization: Encode target text to label ids
    # Encode transcriptions to label ids through use of tokenizer
    # Post-process model outputs to text format
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

In [23]:
dataset_test_processed = dataset_test.map(prepare_dataset, remove_columns=dataset_test.column_names)

### **Run Generation**

In [24]:
import torch
from IPython.display import Audio

In [25]:
def transcribe(model, example):
    input_features = torch.tensor(example["input_features"]).unsqueeze(0).to(device)
    # Generate token IDs
    generated_ids = model.generate(input_features)
    # Decode token IDs to text
    predicted_transcription = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    original_transcription = processor.tokenizer.decode(example["labels"], skip_special_tokens=True)
    print(f'The predicted transcription is: {predicted_transcription}')
    print(f'The original transcription is: {original_transcription}')

In [26]:
dataset_test_processed_iter_finetuned = iter(dataset_test_processed)

In [41]:
dataset_test_iter_original_en = iter(dataset_test)

In [27]:
dataset_test_processed_iter_original = iter(dataset_test_processed)

In [33]:
dataset_test_processed_iter_original_en = iter(dataset_test_processed)

In [29]:
for i in range(10):
  example = next(dataset_test_processed_iter_finetuned)
  transcribe(model, example)
  print("")


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The predicted transcription is: mm you can go first you guys are gonna stand here they're like wow this is a weird topic um singapore and malaysia are like you know brothers but not really brothers brothers on a on a tricky relationship you know what let's skip this topic next do i go do i go next okay heng suay what's the best worst thing best or worst thing that can happen to you in singapore
The original transcription is: um you can go first you guys are going to stand here ah they are like wow this is a weird topic um singapore and malaysia are like you know brothers but not really brothers brothers on a on a tricky relationship you know what let's skip this topic next do i go do i go next okay heng suay what's the best worst thing best or worst thing that can happen to you in singapore

The predicted transcription is: um when was last time i interacted with a foreigner last week i guess so just a person i met through an event and then i mean so i i brought her around lah like in s

In [42]:
for i in range(10):
  example = next(dataset_test_iter_original_en)
  predicted_transcription = pipe(example['audio']['array'])
  original_transcription = example['sentence']
  print(f'The predicted transcription is: {predicted_transcription["text"]}')
  print(f'The original transcription is: {original_transcription}')
  print("")



The predicted transcription is:  You can go first. You guys are gonna stand here. They're like, wow, this is a weird topic. Singapore and Malaysia are like, you know, brothers, but not really brothers. Brothers on a tricky relationship. You know what, let's skip this topic. Next. Do I go next? Okay, Hing Sui, what's the best or worst thing that can happen to you in Singapore?
The original transcription is: um you can go first you guys are going to stand here ah they are like wow this is a weird topic um singapore and malaysia are like you know brothers but not really brothers brothers on a on a tricky relationship you know what let's skip this topic next do i go do i go next okay heng suay what's the best worst thing best or worst thing that can happen to you in singapore

The predicted transcription is:  When was last time I interacted with her for the last week I guess? So just a person I met through an event and then I mean so I brought her around like in Singapore to you know like 

In [30]:
for i in range(10):
  example = next(dataset_test_processed_iter_original)
  transcribe(model_org, example)
  print("")

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


The predicted transcription is:  Mereka boleh pergi? Mereka akan berhenti sini. Mereka suka, wow, ini adalah untuk berhenti. Singapore dan Malaysia berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhenti berhen

In [34]:
for i in range(10):
  example = next(dataset_test_processed_iter_original_en)
  transcribe(model_org_en, example)
  print("")

The predicted transcription is: az G It dis.azano H νalth didn. Is There very, Harvey, your S a athletober.анд g landed H very,ad down,��,ink So An��.keep re a�ulation.az down part, thankel ved yourober. replace. key T It ver? translated, ring reasoni, partel s 10 can yo에ac G�orad lанд?
The original transcription is: um you can go first you guys are going to stand here ah they are like wow this is a weird topic um singapore and malaysia are like you know brothers but not really brothers brothers on a on a tricky relationship you know what let's skip this topic next do i go do i go next okay heng suay what's the best worst thing best or worst thing that can happen to you in singapore

The predicted transcription is:  8ill longass T 팔�ifberos s long пр T ب? supportence a inv Tç� to state gign T seencause T obsberities very lандorad down very ש veryert op g r времени traditional g g isol wentir support
The original transcription is: um when was the last time i interacted with a foreigner 