## **USER ACTION REQUIRED**

- Upload `silero_vad.onnx`
- Upload `waves_snr.txt`

In [None]:
# Number of examples to take from the front of the streamed test split;
# used as the cap when building dataset_test_reduced further down.
num_test_examples = 80

## **Define ASR Model**

In [None]:
# Show the CUDA compiler (nvcc) version available in this runtime.
!nvcc --version

In [None]:
# Remove any installed PyTorch packages, Triton and the listed NVIDIA CUDA 12
# runtime wheels (-y skips the confirmation prompt) before reinstalling torch.
!pip uninstall -y torch torchvision torchaudio triton nvidia-cublas-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nccl-cu12

In [None]:
# Reinstall torch / torchvision / torchaudio, adding the PyTorch wheel index as
# an extra package source.
# NOTE(review): "cu125" does not appear to be a published PyTorch wheel tag
# (published tags include cu118 / cu121 / cu124 / cu126). Because this is
# --extra-index-url, pip can still resolve the packages from PyPI, so the
# install may succeed without using this index at all. Confirm the intended
# CUDA build, and consider pinning versions for reproducibility.
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu125

In [None]:
# Install the remaining Python dependencies quietly (-q). Versions are not
# pinned, so results may change as new releases are published.
!pip install -q datasets bitsandbytes accelerate onnxruntime evaluate jiwer

In [None]:
import torch
import triton
import torch.backends.cudnn as cudnn

# Sanity check of the GPU software stack: print the torch, CUDA, Triton and
# cuDNN versions and whether torch can see a CUDA device.
print("Torch Version:", torch.__version__)
print("Torch CUDA Version:", torch.version.cuda)
print("CUDA Available:", torch.cuda.is_available())
print("Triton Version:", triton.__version__)
print("cuDNN Version:", cudnn.version())

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from google.colab import drive
import time
import warnings
import numpy as np
import onnxruntime
from datasets import load_dataset
from itertools import islice
from torch.utils.data import IterableDataset
import json
import os
import wave
import threading
import textwrap
from IPython.display import clear_output
import evaluate
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
import collections
import pickle
import random

<br/>
<br/>
<br/>

**User Action Required**
- Define the model to use: set `whisper_ver` and `checkpoint_num` in the cell below

In [None]:
# Google Drive Env Setup
# whisper_ver names both the Drive checkpoint folder and the base
# openai/<whisper_ver> repo used for the processor below; checkpoint_num
# selects the checkpoint-<N> subfolder inside that Drive folder.
whisper_ver = 'whisper-base'
checkpoint_num = '5400'
# Mount Drive first: checkpoint_path points inside the mounted directory.
drive.mount('/content/drive')
checkpoint_path = f'/content/drive/My Drive/{whisper_ver}-noiseaugmented-continued-checkpoints/checkpoint-{checkpoint_num}'

# Model setup code for fine-tuned whisper
# Use the first CUDA device when one is available, otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Weights come from the fine-tuned checkpoint on Drive and are moved to `device`.
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_path).to(device)
# Turn on the config's use_cache flag (decoder key/value caching in generation).
model.config.use_cache = True
# The processor (feature extractor + tokenizer) is loaded from the base
# openai/<whisper_ver> repo, not from the checkpoint directory.
processor = WhisperProcessor.from_pretrained(f"openai/{whisper_ver}", language="en", task="transcribe")

## **Load HF Datasets**

**User Action Required**

- Define the number of test samples (`num_test_examples`, set in the first code cell at the top of this notebook)

In [None]:
# Hugging Face Hub repo holding the noise-augmented test set.
dataset_repo_test = "johnlohjy/imda_nsc_p3_test_noiseaugmented"
# streaming=True iterates the 'test' split lazily instead of downloading it in full.
# NOTE(review): trust_remote_code=True lets the dataset repo's loading script
# run locally; only keep it for repositories you trust.
dataset_test = load_dataset(dataset_repo_test, split='test', streaming=True, trust_remote_code=True)

In [None]:
class SlicedDataset(IterableDataset):
    """Iterable view exposing at most the first ``num_examples`` items of ``dataset``.

    Args:
        dataset: Any iterable; a fresh iterator is requested from it on every
            pass over this object.
        num_examples: Upper bound on the number of items yielded per pass.
    """

    def __init__(self, dataset, num_examples):
        self.num_examples = num_examples
        self.dataset = dataset

    def __len__(self):
        # Reports the configured cap, not a count of the items actually
        # available in the wrapped dataset.
        return self.num_examples

    def __iter__(self):
        source = iter(self.dataset)
        return islice(source, self.num_examples)

# Cap the streamed test split at the first num_test_examples examples.
dataset_test_reduced = SlicedDataset(dataset_test, num_examples=num_test_examples)

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

## **Transcribe chunks of 20s, 15s, 10s**

In [None]:
# Hand-picked recordings to benchmark. Each entry is compared for exact
# equality against sample["audio"]["path"] when the clips are collected, so
# the strings must match the dataset's path values character for character.
file_paths = [
    "waves/3000-1_11.wav",
    "waves/3000-1_28.wav",
    "waves/3000-1_52.wav",
    "waves/3001-1_101.wav",
    "waves/3001-1_105.wav",
    "waves/3001-1_11.wav",
    "waves/3001-1_110.wav",
    "waves/3001-1_113.wav",
    "waves/3001-1_127.wav",
    "waves/3001-1_13.wav",
    "waves/3001-1_130.wav",
    "waves/3001-1_142.wav",
    "waves/3001-1_197.wav",
    "waves/3001-1_75.wav",
    "waves/3001-1_83.wav",
    "waves/3001-1_96.wav"
]

In [None]:
sample_rate = 16000

# One bucket per clip length: the leading 20 s, 15 s and 10 s of each selected recording.
samples_20s, samples_15s, samples_10s = [], [], []

for sample in dataset_test_reduced:
  audio_path = sample["audio"]["path"]
  if audio_path not in file_paths:
    continue  # only the hand-picked recordings are benchmarked

  # Cast the waveform to float32 before slicing.
  audio_sample = sample["audio"]["array"].astype(np.float32)

  # Slice the same waveform once per target duration (seconds * samples per second).
  for seconds, bucket in ((20, samples_20s), (15, samples_15s), (10, samples_10s)):
    bucket.append(audio_sample[:seconds * sample_rate])

In [None]:
def transcribe_audio(input_bytes):
    """Transcribe one audio clip with the notebook's Whisper model.

    Args:
        input_bytes: Audio samples handed straight to the processor's feature
            extractor at a 16000 Hz sampling rate. Despite the name, callers
            in this notebook pass float32 NumPy waveforms.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    extracted = processor.feature_extractor(input_bytes, sampling_rate=16000)
    # Take the single clip's features and add a leading batch dimension.
    batch = torch.tensor(extracted.input_features[0]).unsqueeze(0).to(device)
    token_ids = model.generate(batch)[0]
    return processor.tokenizer.decode(token_ids, skip_special_tokens=True)

In [None]:
# Per-clip transcription durations in seconds (end time minus start time),
# one list per clip length; filled in by the timing loops.
samples_20s_transcribe_time = []
samples_15s_transcribe_time = []
samples_10s_transcribe_time = []

In [None]:
# Time the transcription of every clip in samples_20s.
# The result list is rebuilt here so that re-running this cell replaces the
# measurements instead of appending duplicates to the previous run's values.
samples_20s_transcribe_time = []
for audio_clip in samples_20s:
  # perf_counter is a monotonic, high-resolution clock meant for measuring
  # intervals; time.time() follows the wall clock, which can be adjusted.
  start_time = time.perf_counter()
  res = transcribe_audio(audio_clip)
  end_time = time.perf_counter()
  print('')
  print(res)
  samples_20s_transcribe_time.append(end_time - start_time)

In [None]:
# Time the transcription of every clip in samples_15s.
# The result list is rebuilt here so that re-running this cell replaces the
# measurements instead of appending duplicates to the previous run's values.
samples_15s_transcribe_time = []
for audio_clip in samples_15s:
  # perf_counter is a monotonic, high-resolution clock meant for measuring
  # intervals; time.time() follows the wall clock, which can be adjusted.
  start_time = time.perf_counter()
  res = transcribe_audio(audio_clip)
  end_time = time.perf_counter()
  print('')
  print(res)
  samples_15s_transcribe_time.append(end_time - start_time)

In [None]:
# Time the transcription of every clip in samples_10s.
# The result list is rebuilt here so that re-running this cell replaces the
# measurements instead of appending duplicates to the previous run's values.
samples_10s_transcribe_time = []
for audio_clip in samples_10s:
  # perf_counter is a monotonic, high-resolution clock meant for measuring
  # intervals; time.time() follows the wall clock, which can be adjusted.
  start_time = time.perf_counter()
  res = transcribe_audio(audio_clip)
  end_time = time.perf_counter()
  print('')
  print(res)
  samples_10s_transcribe_time.append(end_time - start_time)