In [None]:
from torchaudio import transforms
from datasets import DatasetDict
import jiwer
from jiwer import wer
from functools import reduce
from pathlib import Path
import torchaudio
import torch
import torch
import torch.quantization
import torch.nn as nn

from transformers import WhisperProcessor, WhisperForConditionalGeneration

def predict_audio_from_file(file_path, model):
    """Transcribe one audio file with a Whisper-style model.

    The file is loaded with torchaudio, resampled to 16 kHz when needed,
    mixed down to a single channel, turned into input features by the
    notebook-level ``processor`` and decoded with ``model.generate``.

    Args:
        file_path: Path of the audio file, handed to ``torchaudio.load``.
        model: Object exposing ``generate(input_features)`` whose output the
            notebook-level ``processor`` can ``batch_decode``.

    Returns:
        The first decoded transcription, with special tokens stripped.
    """
    target_rate = 16000  # sampling rate required by the model

    speech_array, sampling_rate = torchaudio.load(file_path)

    # resample to 16000 hz (required by model)
    if sampling_rate != target_rate:
        speech_array = transforms.Resample(sampling_rate, target_rate)(speech_array)

    # torchaudio returns (channels, frames). Averaging over the channel axis
    # gives a 1-D signal for mono AND multi-channel files; squeeze(0) would
    # leave a stereo file 2-D, which the feature extractor would then treat
    # as a batch of two clips.
    waveform = speech_array.mean(dim=0)

    input_features = processor(
        waveform.numpy(), sampling_rate=target_rate, return_tensors="pt"
    ).input_features
    # NOTE(review): the features stay on CPU; move them to the model's device
    # here if the model is not on CPU.

    # generate predicted token ids, then decode them to text
    predicted_ids = model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

In [None]:
from transformers import AutoConfig
# Rebuild the quantized model in three steps: architecture -> quantized module
# layout -> saved weights.
# Step 1: build the architecture from the config stored under
# "whisper-quantized-config". Only the config is passed, so no checkpoint
# weights are loaded at this point.
config = AutoConfig.from_pretrained("whisper-quantized-config")
dummy_model = WhisperForConditionalGeneration(config)

# Step 2: apply dynamic int8 quantization to every nn.Linear layer.
# NOTE(review): presumably this mirrors how "whisper-quantized.pt" was produced
# so that the state-dict keys match -- confirm against the export code.
reconstructed_quantized_model = torch.quantization.quantize_dynamic(
    dummy_model, {torch.nn.Linear}, dtype=torch.qint8
)
# Step 3: load the saved weights into the quantized layout and switch to eval mode.
reconstructed_quantized_model.load_state_dict(torch.load("whisper-quantized.pt"))
reconstructed_quantized_model.eval()

# Processor (feature extractor + tokenizer) comes from the public
# "openai/whisper-small" checkpoint; predict_audio_from_file reads this global.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

In [1]:
import jsonlines
import torchaudio
from datasets import Dataset, load_metric, DatasetDict
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from pathlib import Path
import torch
import librosa
import IPython.display as ipd
import jiwer
import base64


# Define the path to the directory
current_directory = Path.cwd()
file_path = current_directory / '..' / '..' / 'novice'
data_dir = file_path.resolve()
print(data_dir, current_directory)

# Read the first MAX_SAMPLES records of the jsonl manifest, column by column.
MAX_SAMPLES = 10
data = {'key': [], 'audio': [], 'transcript': []}
with jsonlines.open(data_dir / "asr.jsonl") as reader:
    for obj in reader:
        # Stop reading once enough records are collected instead of scanning
        # (and discarding) the rest of the file.
        if len(data['key']) >= MAX_SAMPLES:
            break
        # Pull exactly the expected fields so the three columns stay aligned:
        # extra fields are ignored, a missing one raises KeyError immediately.
        for field in data:
            data[field].append(obj[field])

print(data)

# Pair every record with the base64-encoded bytes of its audio file.
data2 = {"instances": []}
for key, audio_name, transcript in zip(data['key'], data['audio'], data['transcript']):
    audio_bytes = (data_dir / 'audio' / audio_name).read_bytes()
    data2['instances'].append({
        "key": key,
        "b64": base64.b64encode(audio_bytes).decode("ascii"),
        "transcript": transcript,
    })


# Convert to a Hugging Face dataset: a single "instances" column holding one
# {key, b64, transcript} record per row.
dataset = Dataset.from_dict(data2)

/home/jupyter/novice /home/jupyter/til-24-base/asr
{'key': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'audio': ['audio_0.wav', 'audio_1.wav', 'audio_2.wav', 'audio_3.wav', 'audio_4.wav', 'audio_5.wav', 'audio_6.wav', 'audio_7.wav', 'audio_8.wav', 'audio_9.wav'], 'transcript': ['Heading is one five zero, target is green commercial aircraft, tool to deploy is electromagnetic pulse.', 'Heading is two six zero, target is black, white, and yellow commercial aircraft, tool to deploy is surface-to-air missiles.', 'Heading is one zero five, target is silver, green, and yellow light aircraft, tool to deploy is anti-air artillery.', 'Heading is two niner zero, target is brown and blue cargo aircraft, tool to deploy is electromagnetic pulse.', 'Heading is zero one five, target is yellow camouflage drone, tool to deploy is EMP.', 'Heading is two seven five, target is purple, orange, and blue cargo aircraft, tool to deploy is interceptor jets.', 'Heading is one seven five, target is black, blue, and grey figh

In [None]:
from pathlib import Path

# Locate the 'novice' data folder, two levels above the current working directory.
current_directory = Path.cwd()
file_path = current_directory.joinpath('..', '..', 'novice')
data_dir = file_path.resolve()
print(data_dir, current_directory)

In [None]:
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def run_inference(rank, world_size, batch_size=None):
    """Worker entry point: join the process group and walk this rank's data shard.

    Meant to be launched through ``mp.spawn``, which passes ``rank`` as the
    first positional argument. Reads the notebook-level
    ``reconstructed_quantized_model`` and ``dataset``.

    Args:
        rank: Index of this process inside the group.
        world_size: Total number of processes in the group.
        batch_size: Examples per DataLoader batch. When omitted, a
            notebook-level ``batch_size`` is used if one exists, else 1.
    """
    import os

    if batch_size is None:
        batch_size = globals().get("batch_size", 1)

    print(rank, world_size)

    # The default env:// rendezvous needs an address and a port; provide local
    # defaults without overriding values the launcher may already have set.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    try:
        # The model was produced by dynamic int8 quantization, which runs on
        # CPU, so it is deliberately NOT moved to a CUDA device here.
        model = reconstructed_quantized_model
        model.eval()

        # Give every rank its own slice of the dataset. With the default
        # drop_last=False the sampler pads so all ranks see the same number
        # of examples.
        sampler = torch.utils.data.DistributedSampler(
            dataset, num_replicas=world_size, rank=rank, shuffle=False
        )
        loader = torch.utils.data.DataLoader(dataset=dataset,
                                             batch_size=batch_size,
                                             sampler=sampler,
                                             shuffle=False,
                                             num_workers=4)
        print(loader)

        # iterate over this rank's partition
        # TODO: run `model` on each batch; for now the batches are only printed.
        for idx, batch in enumerate(loader):
            print(batch)
    finally:
        # Always release the process group, even when the loop above fails.
        dist.destroy_process_group()
    
def main():
    """Spawn one ``run_inference`` worker per visible CUDA device, at least one."""
    # device_count() is 0 on a CPU-only machine, which would launch no worker
    # at all; the workers use the CPU-capable "gloo" backend, so fall back to
    # a single process in that case.
    world_size = max(torch.cuda.device_count(), 1)
    mp.spawn(run_inference,
        args=(world_size,),
        nprocs=world_size,
        join=True)

# Launch the workers when this code runs as the main module. Inside a notebook
# kernel __name__ is "__main__" as well, so executing the cell calls main().
# NOTE(review): mp.spawn starts fresh interpreter processes; functions defined
# interactively in a notebook may not be importable there -- confirm, or move
# run_inference into a .py module.
if __name__=="__main__":
    main()

In [None]:
!pip install packaging
!pip install ninja
!pip install flash-attn --no-build-isolation

In [16]:
!pip install noisereduce

Collecting noisereduce
  Using cached noisereduce-3.0.2-py3-none-any.whl.metadata (14 kB)
Using cached noisereduce-3.0.2-py3-none-any.whl (22 kB)
Installing collected packages: noisereduce
Successfully installed noisereduce-3.0.2


In [18]:
import noisereduce as nr 
import librosa 
import soundfile as sf

# Load a single clip at its native sampling rate (sr=None disables resampling).
audio_path = data_dir.joinpath('audio', 'audio_8.wav')
audio, rate = librosa.load(audio_path, sr=None)

print(rate)

# Treat the first half second of the clip as the noise profile and
# subtract it from the whole signal.
noisy_part = audio[:int(rate * 0.5)]
reduced_noise_audio = nr.reduce_noise(y=audio, sr=rate, y_noise=noisy_part)

# Write the denoised signal to the working directory at the original rate.
clean_audio_path = 'cleaned_audio.wav'
sf.write(clean_audio_path, reduced_noise_audio, samplerate=rate)


16000


In [13]:
from io import BytesIO
import base64

# Decoded waveforms, filled as a side effect of the map() call below.
dataset2 = []


def retrieve_audio(batch):
    """Decode one row's base64 audio into a 16 kHz waveform and store it in ``dataset2``.

    Nothing is returned, so ``Dataset.map`` leaves the rows themselves unchanged.
    """
    raw_bytes = base64.b64decode(batch['instances']['b64'])
    waveform, _sr = librosa.load(BytesIO(raw_bytes), sr=16000)
    dataset2.append(waveform)


dataset = dataset.map(retrieve_audio)
dataset

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['instances'],
    num_rows: 10
})

In [14]:
dataset2

[array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -9.1552734e-05, -6.1035156e-05, -3.0517578e-05], dtype=float32),
 array([ 0.0000000e+00,  9.1552734e-05,  5.4931641e-04, ...,
        -4.2724609e-04, -1.0375977e-03, -1.3732910e-03], dtype=float32),
 array([ 0.0000000e+00, -3.0517578e-05, -6.1035156e-05, ...,
        -1.3427734e-03, -5.7983398e-04,  2.7465820e-04], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([ 0.0000000e+00, -3.0517578e-05, -9.1552734e-05, ...,
        -2.2888184e-03, -2.4108887e-03, -1.5258789e-03], dtype=float32),
 array([ 0.0000000e+00, -3.0517578e-05, -1.2207031e-04, ...,
         1.3732910e-03,  1.4038086e-03,  8.5449219e-04], dtype=float32),
 array([ 0.0000000e+00,  0.0000000e+00,  9.1552734e-05, ...,
        -2.3803711e-03, -3.1738281e-03, -1.2817383e-03], dtype=float32),
 array([ 0.0000000e+00,  3.0517578e-05,  2.1362305e-04, ...,
        -2.4414062e-04, -3.

In [16]:

import time 

# Wall-clock reference for the "loading time" printed at the end of this cell.
# It is taken before the imports below, so their cost is part of that figure.
start = time.time()

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
from transformers import AutoConfig
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio
from torchaudio import transforms
import librosa
from io import BytesIO
import base64


# First GPU in half precision when CUDA is present, otherwise CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Local checkpoint directory, relative to the notebook's working directory.
model_id = "../asr_2/src/checkpoint-1000"

model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
)
# Drop any decoder prompt ids stored in the checkpoint's config.
model.config.forced_decoder_ids = None

model.to(device)

# Tokenizer and feature extractor come from the public whisper-small checkpoint.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="English", task="transcribe"
)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
    batch_size=16,
    chunk_length_s=30,
    max_new_tokens=128,
    return_timestamps=False,
)

# Everything before this point counts as "loading", everything after as "inference".
mid = time.time()

# Transcribe all waveforms collected in dataset2 (loaded at 16 kHz) in one
# batched pipeline call.
result = pipe(dataset2,generate_kwargs={"language": "english"})
print(result)

end = time.time()

# ".2f" prints two decimals; the previous "2f" only set a minimum field width.
print(f'loading time: {mid-start:.2f}')
print(f'inference time: {end-mid:.2f}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[{'text': 'Heading is one five zero, target is green commercial aircraft, tool to deploy is electromagnetic pulse.'}, {'text': 'Heading is two six zero, target is black, white, and yellow commercial aircraft, tool to deploy is surface-to-air missiles.'}, {'text': 'Heading is one zero five, target is silver, green, and yellow light aircraft, tool to deploy is anti-air artillery.'}, {'text': 'Heading is two niner zero, target is brown and blue cargo aircraft, tool to deploy is electromagnetic pulse.'}, {'text': 'Heading is zero one five, target is yellow camouflage drone, tool to deploy is EMP.'}, {'text': 'Heading is two seven five, target is purple, orange, and blue cargo aircraft, tool to deploy is interceptor jets.'}, {'text': 'Heading is one seven five, target is black, blue, and grey fighter jet, tool to deploy is machine gun.'}, {'text': 'Heading is three two zero, target is purple and brown cargo aircraft, tool to deploy is surface-to-air missiles.'}, {'text': 'Heading is one zer

In [32]:
# KeyDataset is a util that will just output the item we're interested in.
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset

# NOTE: this cell rebinds the notebook-level `pipe` and `dataset` to a tiny
# randomly initialised test model and a dummy LibriSpeech slice; re-run the
# earlier cells before using the Whisper pipeline or the jsonl dataset again.
pipe = pipeline(
    model="hf-internal-testing/tiny-random-wav2vec2",
    # GPU 0 when available, otherwise CPU (-1) so the cell also runs without CUDA.
    device=0 if torch.cuda.is_available() else -1,
)
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]")

# KeyDataset hands only the "audio" field of each row to the pipeline.
for out in pipe(KeyDataset(dataset, "audio")):
    print(out)

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/829k [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at hf-internal-testing/tiny-random-wav2vec2 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.feature_extractor.conv_layers.1.layer_norm.bias', 'wav2vec2.feature_extractor.conv_layers.1.layer_norm.weight', 'wav2vec2.feature_extractor.conv_layers.2.layer_norm.bias', 'wav2vec2.feature_extractor.conv_layers.2.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

{'text': "EYB  ZB COE C BEZCYCZ HO MOWWB EM BWOB ZMEG  B COEB BE BEC B U OB BE BCB BEWUBB BXYWBESWYCB SBBB SSEZ C Z WH UB F IGVB SB Z<unk> XOES CZ BBXOXFBB  OBY W B VM OFOWUONFWB ZCX B M WZ Q S C Q BC CQBF FOMB BOT ZWYBZ WB  B CM B C B WZCWWW BHU EOYTO YWB BZ SHZBGEM Q OO T B BM XZ QW C OFBZMSEHB BE ZZBX M Q XB<unk> CEVWZ FOHSB W B O Z ZW S ZB O VM <s> D EUCKH XNC D Q BG B O BW U  U  MBE CBYE  WB HFQUBQBUWZ B MW BMPY F ZB  EB B WBOF S XFOBB ZB X B MOT W B CEO WBM   BBXBBEOBECB B UM C BP FMBWB BZ WFCED Z B B FXB Z OZ OBBZ NVD UBZC W B WYCWY X CE CW B WB MWU BWN B DECF GEF'C WZS CS BYWB<s>FZ'Z<s>ZGBU ECFEY BF ZOZ O UWBSSZBBBBW   O O DBB BZWFUW ZWOZYCGOYCOT WC O CZ BD BBBBBBX X W T B BC BZC FWYBFO FBCE X Z PEZ CE B WEDBMBO BN B BY Y  W B BMCB XOXQ  BSZES Z M CF S FB BBXBB B C CSZ EF SEQF S BEC BNO BN  SU EH  WRFBS WB  W B OEZ WS X B F B X ZBBE BBEHB B BU BECBSXHB BSQWFW BSZXH BWSEG W VQETZMCZ UCXW Z DBE<s> O SXZX MB W RX YYOBSUBWOCFYEF O B O B C Z UBEZBE BTB C   CBFCB V W B BF W ZBBESBBEC