## Import libraries

In [96]:
# Optional setup: uncomment the lines below to install these packages in a
# fresh environment before running the rest of the notebook.
# !pip install transformers
# !pip install datasets
# !pip install soundfile
# !pip install librosa
# !pip install numba

In [1]:
import os, json, time
import gc
from IPython.display import display, Markdown
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import transformers
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, TrainingArguments
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain_community.document_loaders import TextLoader # new version
from langchain.prompts.prompt import PromptTemplate
from langchain_core.runnables import ConfigurableField
from langchain_community.vectorstores import FAISS, Chroma

# Text chunk splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter , SentenceTransformersTokenTextSplitter# Text Splitter
from langchain.embeddings import HuggingFaceEmbeddings
from datasets import Dataset, DatasetDict, load_dataset

# for evaluating the LLM
import evaluate
import pytest
import trulens
import wandb # for LLM  logging




## Configure

In [2]:
class CFG:
    """Notebook-wide settings: runtime switches and Hugging Face model ids."""

    # --- Runtime switches ---------------------------------------------------
    # Offline flag, reserved for a later on-premises environment test.
    OFFLINE = False
    # Turn on Weights & Biases for LLM evaluation and tracking.
    USE_WANDB = False

    # --- Speech recognition -------------------------------------------------
    # Checkpoint used for automatic speech recognition (ASR).
    ASR_MODEL = "openai/whisper-large-v3"

    # --- Candidate LLM checkpoints ------------------------------------------
    # Gemma instruction-tuned models (2B and 7B).
    LLM_MODEL1 = "google/gemma-2b-it"
    LLM_MODEL2 = "google/gemma-7b-it"
    # Newer Gemma 2 model (9B).
    LLM_MODEL3 = "google/gemma-2-9b"
    # Llama 3 8B.
    LLM_MODEL4 = "meta-llama/Meta-Llama-3-8B"

In [3]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
# Load the speech-to-text checkpoint named in CFG.ASR_MODEL with half-precision
# weights, then move it onto the selected device.
asr = AutoModelForSpeechSeq2Seq.from_pretrained(
    CFG.ASR_MODEL,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
asr = asr.to(device)

# The matching processor supplies the tokenizer and feature extractor that the
# pipeline cell uses later.
audioProc = AutoProcessor.from_pretrained(CFG.ASR_MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Test ASR model

In [5]:
# Hub ids of the speech corpora used to exercise the ASR model.
testspeechDataset1 = "hf-internal-testing/librispeech_asr_dummy"
testspeechDataset2 = "distil-whisper/librispeech_long"
testspeechDataset3 = "librispeech_asr"
testspeechDataset4 = "Nexdata/chinese_dialect"  # Cantonese sample dataset

# Small dummy corpus: "clean" configuration, validation split.
audioDataset1 = load_dataset(
    path=testspeechDataset1,
    name="clean",
    split="validation",
)

# Streamed from testspeechDataset3 (testspeechDataset2 is not loaded in this cell).
audioDataset2 = load_dataset(
    path=testspeechDataset3,
    split="train.clean.100",
    streaming=True,
    trust_remote_code=True,
)

# Loaded without a split argument; later cells index it with ["train"].
audioDataset3 = load_dataset(path=testspeechDataset4)

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

In [6]:
# Build the speech-recognition pipeline around the already-loaded model.
#
# `asr` is rebound below from the raw model to the pipeline. On a re-run of this
# cell `asr` is therefore already a Pipeline, so unwrap its underlying model
# instead of passing a pipeline as `model=`.
asr_model = asr.model if isinstance(asr, transformers.Pipeline) else asr

asr = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    tokenizer=audioProc.tokenizer,
    feature_extractor=audioProc.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,  # long audio is processed in 30-second chunks
    batch_size=16,
    return_timestamps=True,
    # Follow the dtype the model was actually loaded with rather than
    # hard-coding it, so inputs are cast to match the weights.
    torch_dtype=asr_model.dtype,
    device=device,
)

In [7]:
# Look at the `audio` field of the first row (path, sample array, sampling rate).
audioDataset1[0]["audio"]

{'path': '1272-128104-0000.flac',
 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 'sampling_rate': 16000}

In [8]:
# Left disabled: audioDataset2 is loaded with streaming=True, and a streamed
# dataset is read by iteration (see the next(iter(...)) cell) rather than by index.
# audioDataset2[0]["audio"]

In [9]:
# Pull the first record from each dataset (the third one from its "train" split).
example1, example2, example3 = (
    next(iter(source))
    for source in (audioDataset1, audioDataset2, audioDataset3["train"])
)

Play a sample clip from each audio dataset

In [10]:
from IPython.display import  Audio as IPythonAudio


In [11]:
# `audio` field of the first record taken from audioDataset2.
example2["audio"]

{'path': '374-180298-0000.flac',
 'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
        -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
 'sampling_rate': 16000}

In [12]:
# `audio` field of the first record taken from audioDataset1.
example1["audio"]

{'path': '1272-128104-0000.flac',
 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 'sampling_rate': 16000}

In [13]:
# Entire `audio` column of the train split.
# NOTE(review): this displays every clip in the split; fine for a tiny sample
# set, but consider slicing first if the dataset grows.
audioDataset3["train"]["audio"]

[{'path': '/home/johnsonhk88/.cache/huggingface/datasets/downloads/d44604004879fc607c568b95045344dda961da10209bf5cbc359ae1561e2985d',
  'array': array([ 0.00015259,  0.0015564 ,  0.0015564 , ..., -0.00097656,
         -0.00085449, -0.00134277]),
  'sampling_rate': 16000},
 {'path': '/home/johnsonhk88/.cache/huggingface/datasets/downloads/60a1888fafa7fac0848c6b2b2413a73af57479d7e6e7cd94a9fb4e56d77f9e0e',
  'array': array([0.00350952, 0.00305176, 0.00088501, ..., 0.00054932, 0.00073242,
         0.00024414]),
  'sampling_rate': 16000}]

In [14]:
# Full first record from audioDataset3's train split.
example3

{'audio': {'path': '/home/johnsonhk88/.cache/huggingface/datasets/downloads/d44604004879fc607c568b95045344dda961da10209bf5cbc359ae1561e2985d',
  'array': array([ 0.00015259,  0.0015564 ,  0.0015564 , ..., -0.00097656,
         -0.00085449, -0.00134277]),
  'sampling_rate': 16000}}

In [15]:
# Inline player for the audioDataset2 sample, played at its own sampling rate.
IPythonAudio(
    data=example2["audio"]["array"],
    rate=example2["audio"]["sampling_rate"],
)

In [16]:
# Inline player for the audioDataset1 sample, played at its own sampling rate.
IPythonAudio(
    data=example1["audio"]["array"],
    rate=example1["audio"]["sampling_rate"],
)

In [17]:
# Inline player for the audioDataset3 sample, played at its own sampling rate.
IPythonAudio(
    data=example3["audio"]["array"],
    rate=example3["audio"]["sampling_rate"],
)

In [18]:
# Sampling rate configured on the pipeline's feature extractor.
asr.feature_extractor.sampling_rate

16000

In [19]:
# Sampling rate stored with example1's clip, for comparison with the
# feature extractor's rate shown in the previous cell.
example1["audio"]["sampling_rate"]

16000

In [20]:
# Show every field of example1 (the audio entry plus its accompanying metadata).
example1

{'file': '/Users/sanchitgandhi/.cache/huggingface/datasets/downloads/extracted/aad76e6f21870761d7a8b9b34436f6f8db846546c68cb2d9388598d7a164fa4b/dev_clean/1272/128104/1272-128104-0000.flac',
 'audio': {'path': '1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [21]:
# Transcribe the sample from the Cantonese dataset.
# The sampling rate is passed together with the samples so the pipeline does not
# have to assume the bare array is already at the feature extractor's rate.
# A fresh dict is built so that example3 itself is never modified by the call.
asr(
    {
        "array": example3["audio"]["array"],
        "sampling_rate": example3["audio"]["sampling_rate"],
    }
)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


{'text': '算你個死人頭,我們一起共過患難',
 'chunks': [{'timestamp': (0.0, 4.0), 'text': '算你個死人頭,我們一起共過患難'}]}