In [None]:
# Print the version of the CUDA compiler (nvcc) found on this runtime's PATH.
!nvcc --version

In [None]:
!pip uninstall -y torch torchvision torchaudio triton nvidia-cublas-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nccl-cu12

In [None]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu125

In [None]:
!pip install -q datasets bitsandbytes accelerate onnxruntime evaluate jiwer

In [None]:
!pip install pydub

In [None]:
import torch
import triton
import torch.backends.cudnn as cudnn

# Report the versions of the libraries that were just imported, one per line.
print(f"Torch Version: {torch.__version__}")
print(f"Torch CUDA Version: {torch.version.cuda}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Triton Version: {triton.__version__}")
print(f"cuDNN Version: {cudnn.version()}")

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from google.colab import drive
import time
import warnings
import numpy as np
import onnxruntime
from datasets import load_dataset
from itertools import islice
from torch.utils.data import IterableDataset
import json
import os
import wave
import threading
import textwrap
from IPython.display import clear_output
import evaluate
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
import collections
import pickle
import random
from pydub import AudioSegment

In [None]:
# Base identifiers; every identifier is expanded into eight file names below.
fnumbers = [
    '3148-2',
    '3147-1',
    'conf_2690_2690_00862467',
    'conf_2688_2688_00862561',
]

# Build "<identifier>_<n>.wav" for n = 1..8, keeping the identifier order of `fnumbers`.
wav_fnames = [
    f'{base_name}_{part_no}.wav'
    for base_name in fnumbers
    for part_no in range(1, 9)
]

In [None]:
# Display the generated list of wav file names for a visual check.
wav_fnames

In [None]:
# Dataset repository identifier passed to `load_dataset`.
dataset_repo_test = "johnlohjy/imda_nsc_p3_test_noiseaugmented_mini"

# Open the 'test' split in streaming mode.
# NOTE(review): trust_remote_code=True allows code shipped with the dataset repository
# to run locally - only keep it for repositories you trust.
dataset_test = load_dataset(
    dataset_repo_test,
    split='test',
    streaming=True,
    trust_remote_code=True,
)

In [None]:
RATE = 16000  # samples per second used to convert sample counts into seconds
chunk = 4096
no_voice_activity_threshold = 20
SILENCE_SEED = 42  # fixed seed so the random silence gaps are the same on every run
references = []

# Running totals in seconds. They have to be initialised here: the loop below
# accumulates into them with `+=`, which raises NameError on a fresh kernel otherwise.
total_audio_len = 0.0                 # selected audio only
total_audio_len_extra_silences = 0.0  # selected audio plus every inserted silence


##################### CREATE CONTINUOUS AUDIO ARRAY ONCE SO THAT IT IS REPRODUCIBLE ############
# Pieces (audio, silence, audio, silence, ...) that are concatenated after the loop
all_audio_samples = []

# Set lookup instead of scanning the list for every streamed sample
wanted_fnames = set(wav_fnames)

# Dedicated seeded generator: keeps the silence lengths deterministic without
# touching the state of the global `random` module.
silence_rng = random.Random(SILENCE_SEED)

for sample in dataset_test:
    # Keep the sample only if the second '/'-separated component of its path is a wanted file name
    if sample['path'].split('/')[1] not in wanted_fnames:
        continue

    # Convert the sample's audio array to float32 and queue it
    audio_sample = sample["audio"]["array"].astype(np.float32)
    all_audio_samples.append(audio_sample)

    # Add the sample's duration to both totals
    # NOTE(review): assumes the audio is sampled at RATE Hz - confirm against the dataset
    sample_seconds = len(audio_sample) / RATE
    total_audio_len += sample_seconds
    total_audio_len_extra_silences += sample_seconds

    # Insert a random length of silence (1s to 5s) after the current sample
    silence = np.zeros(silence_rng.randint(1, 5) * RATE, dtype=np.float32)
    all_audio_samples.append(silence)

    # Only the "with silences" total grows by the inserted gap
    total_audio_len_extra_silences += len(silence) / RATE

    # Add the reference
    references.append(sample["sentence"])

# Fail with a clear message instead of np.concatenate's generic error on an empty list
if not all_audio_samples:
    raise ValueError("No sample in dataset_test matched any name in wav_fnames")

# Concatenate all audio samples (with silences)
audio_samples = np.concatenate(all_audio_samples)

# Append (no_voice_activity_threshold + 5) chunks of silence to the concatenated audio
# so the latest segment of audio will be saved
silence = np.zeros(chunk * (no_voice_activity_threshold + 5), dtype=np.float32)
audio_samples = np.concatenate((audio_samples, silence))
total_audio_len_extra_silences += len(silence) / RATE
##################### CREATE CONTINUOUS AUDIO ARRAY ONCE SO THAT IT IS REPRODUCIBLE ############

In [None]:
###################### SAVE CONTINUOUS ARRAYS ######################
# Each (file name, object) pair is pickled to its own file in the working directory.
# The audio file name embeds the total audio length in minutes.
pickle_targets = [
    (f"test_audio_{total_audio_len/60}_mins.pkl", audio_samples),
    ("test_references.pkl", references),
    ("test_wavfnames.pkl", wav_fnames),
]

for pickle_path, payload in pickle_targets:
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)
###################### SAVE CONTINUOUS ARRAYS ######################

In [None]:
# Report both running totals (accumulated as sample count / 16000), one per line.
print(
    f'The total audio length is {total_audio_len}',
    f'The total audio length with silences is {total_audio_len_extra_silences}',
    sep='\n',
)

In [None]:
# Map each wav file name to the SNR value listed for it in data_test_waves_snr.txt.
# Every non-blank line is expected to hold exactly two whitespace-separated fields:
# "<file name> <snr value>". Values are kept as strings.
snr_wavfile_values = {}

with open("data_test_waves_snr.txt", "r") as file:
    for line in file:
        # split() with no argument tolerates tabs and repeated spaces
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on the unpack below
            continue
        # Still raises ValueError when a line does not have exactly two fields
        key, value = fields
        snr_wavfile_values[key] = value

# Count how many of the selected wav files fall under each SNR value.
# Raises KeyError if a name in wav_fnames has no entry in the SNR file.
snr_wavfile_counts = collections.defaultdict(int)

for fname in wav_fnames:
    snr_wavfile_counts[snr_wavfile_values[fname]] += 1

In [None]:
# Display the number of selected wav files per SNR value.
snr_wavfile_counts