In [None]:
import torch.nn.utils.prune as prune
from transformers import WhisperProcessor, WhisperForConditionalGeneration


model = WhisperForConditionalGeneration.from_pretrained("./whisper-small-hi/checkpoint-1000")

# Apply pruning to the model's first convolutional layer
prune.l1_unstructured(model.conv1, name="weight", amount=0.3)
prune.remove(model.conv1, 'weight')  # Make pruning permanent

In [None]:
import torch
import torch.quantization
import torch.nn as nn

from transformers import WhisperProcessor, WhisperForConditionalGeneration


model = WhisperForConditionalGeneration.from_pretrained("./src/checkpoint-9000")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

model.config.forced_decoder_ids = None


model.eval()  # Ensure the model is in evaluation mode for quantization
quantized_model = torch.quantization.quantize_dynamic(model, {nn.Linear, nn.Conv2d, nn.EmbeddingBag, nn.LSTM, nn.GRU}, dtype=torch.qint8)

# save config
quantized_model.config.save_pretrained("whisper-quantized-config")
# save state dict
quantized_state_dict = quantized_model.state_dict()
torch.save(quantized_state_dict, "whisper-quantized.pt")





# quantized_model.save_model('./whisper-small-hi-quantized')




In [None]:
from transformers import AutoConfig
# load config and dummy model
config = AutoConfig.from_pretrained("whisper-quantized-config")
dummy_model = WhisperForConditionalGeneration(config)

reconstructed_quantized_model = torch.quantization.quantize_dynamic(
    dummy_model, {torch.nn.Linear}, dtype=torch.qint8
)
reconstructed_quantized_model.load_state_dict(quantized_state_dict)


In [None]:
from torchaudio import transforms
from datasets import DatasetDict
import jiwer
from jiwer import wer
from functools import reduce
from pathlib import Path
import torchaudio
import torch
import torch
import torch.quantization
import torch.nn as nn

from transformers import WhisperProcessor, WhisperForConditionalGeneration

def predict_audio_from_file(file_path, model):
 
    speech_array, sampling_rate = torchaudio.load(file_path)
    
    # resample to 16000 hz (required by model)
    if sampling_rate != 16000:
        transform = transforms.Resample(sampling_rate, 16000)
        speech_array = transform(speech_array)
        
        
    sample_audio = DatasetDict({
        'array': speech_array.squeeze(0),
        'sampling_rate': 16000
    })
    
    input_features = processor(sample_audio["array"], sampling_rate=sample_audio["sampling_rate"], return_tensors="pt").input_features
    # input_features = input_features.to(device)
    
    # generate predicted token ids
    predicted_ids = model.generate(input_features)
    # decode predicted token ids to text
    prediction = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    return prediction

In [None]:
import time

start = time.time()

model = WhisperForConditionalGeneration.from_pretrained("./src/checkpoint-9000")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

model.config.forced_decoder_ids = None

end = time.time()
print(f"Normal Time Taken (Load Model): {end - start:.2f}s")
start = time.time()

prediction = predict_audio_from_file('audio_2.m4a', model)
transcript = "Heading is one seven zero, target is purple, blue, grey fighter jet, tool to deploy is electromagnetic pulse."
print(f"Actual: {transcript}\n")
print(f"Prediction: {prediction}\n")
print(f"WER%: {100* wer(transcript, prediction)}\n")

end = time.time()
print(f"Normal Time Taken (Inference): {end - start:.2f}s")




In [None]:
import time 

start = time.time()

model = WhisperForConditionalGeneration.from_pretrained("./src/checkpoint-9000")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

model.config.forced_decoder_ids = None


model.eval()  # Ensure the model is in evaluation mode for quantization
quantized_model = torch.quantization.quantize_dynamic(model, {nn.Linear, nn.Conv2d, nn.EmbeddingBag, nn.LSTM, nn.GRU}, dtype=torch.qint8)

end = time.time()
print(f"Quantized Time Taken (Load Model): {end - start:.2f}s")
start = time.time()

prediction = predict_audio_from_file('audio_2.m4a', quantized_model)
transcript = "Heading is one seven zero, target is purple, blue, grey fighter jet, tool to deploy is electromagnetic pulse."
print(f"Actual: {transcript}\n")
print(f"Prediction: {prediction}\n")
print(f"WER%: {100* wer(transcript, prediction)}\n")

end = time.time()
print(f"Quantized Time Taken (Inference): {end - start:.2f}s")

In [1]:
import jsonlines
import torchaudio
from datasets import Dataset, load_metric, DatasetDict
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from pathlib import Path
import torch
import librosa
import IPython.display as ipd
import jiwer
import base64


# Define the path to the directory
current_directory = Path.cwd()
file_path = current_directory / '..' / '..' / 'novice'
data_dir = file_path.resolve()
print(data_dir, current_directory)

# Read data from a jsonl file and reformat it
data = {'key': [], 'audio': [], 'transcript': []}
with jsonlines.open(data_dir / "asr.jsonl") as reader:
    for obj in reader:
        if len(data['key']) < 10: 
            for key, value in obj.items():
                data[key].append(value)
                
print(data)

/home/jupyter/novice /home/jupyter/til-24-base/asr
{'key': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'audio': ['audio_0.wav', 'audio_1.wav', 'audio_2.wav', 'audio_3.wav', 'audio_4.wav', 'audio_5.wav', 'audio_6.wav', 'audio_7.wav', 'audio_8.wav', 'audio_9.wav'], 'transcript': ['Heading is one five zero, target is green commercial aircraft, tool to deploy is electromagnetic pulse.', 'Heading is two six zero, target is black, white, and yellow commercial aircraft, tool to deploy is surface-to-air missiles.', 'Heading is one zero five, target is silver, green, and yellow light aircraft, tool to deploy is anti-air artillery.', 'Heading is two niner zero, target is brown and blue cargo aircraft, tool to deploy is electromagnetic pulse.', 'Heading is zero one five, target is yellow camouflage drone, tool to deploy is EMP.', 'Heading is two seven five, target is purple, orange, and blue cargo aircraft, tool to deploy is interceptor jets.', 'Heading is one seven five, target is black, blue, and grey figh

In [3]:
import time 

start = time.time()


from torchaudio import transforms
from datasets import DatasetDict
# import jiwer
# from jiwer import wer
# from functools import reduce
# from pathlib import Path
import torchaudio
import torch
import torch.quantization
import torch.nn as nn
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import AutoConfig

def predict_audio_from_file(file_path, model):
 
    speech_array, sampling_rate = torchaudio.load(file_path)
    
    # resample to 16000 hz (required by model)
    if sampling_rate != 16000:
        transform = transforms.Resample(sampling_rate, 16000)
        speech_array = transform(speech_array)
        
        
    sample_audio = DatasetDict({
        'array': speech_array.squeeze(0),
        'sampling_rate': 16000
    })
    
    input_features = processor(sample_audio["array"], sampling_rate=sample_audio["sampling_rate"], return_tensors="pt").input_features
    # input_features = input_features.to(device)
    
    # generate predicted token ids
    predicted_ids = model.generate(input_features)
    # decode predicted token ids to text
    prediction = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    return prediction

# load config and dummy model
config = AutoConfig.from_pretrained("whisper-quantized-config")
dummy_model = WhisperForConditionalGeneration(config)

reconstructed_quantized_model = torch.quantization.quantize_dynamic(
    dummy_model,  {nn.Linear, nn.Conv2d, nn.EmbeddingBag, nn.LSTM, nn.GRU}, dtype=torch.qint8
)
reconstructed_quantized_model.load_state_dict(torch.load("whisper-quantized.pt"))
reconstructed_quantized_model.eval()

processor = WhisperProcessor.from_pretrained("openai/whisper-small")


end = time.time()
print(f"Reconstructed Quantized Time Taken (Load Model): {end - start:.2f}s")
start = time.time()
for i in data['audio']:
    prediction = predict_audio_from_file(data_dir / 'audio' / i, reconstructed_quantized_model)
    print(prediction)
transcript = "Heading is one seven zero, target is purple, blue, grey fighter jet, tool to deploy is electromagnetic pulse."
print(f"Actual: {transcript}\n")
print(f"Prediction: {prediction}\n")
# print(f"WER%: {100* wer(transcript, prediction)}\n")

end = time.time()
print(f"Reconstructed Quantized Time Taken (Inference): {end - start:.2f}s")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Reconstructed Quantized Time Taken (Load Model): 9.09s
Heading is one five zero, target is green commercial aircraft, tool to deploy is electromagnetic pulse.
Heading is two six zero, target is black, white, and yellow commercial aircraft, tool to deploy is surface-to-air missiles.
Heading is one zero five, target is silver, green, and yellow light aircraft, tool to deploy is anti-air artillery.
Heading is two niner zero, target is brown and blue cargo aircraft, tool to deploy is electromagnetic pulse.
Heading is zero one five, target is yellow camouflage drone, tool to deploy is EMP.
Heading is two seven five, target is purple, orange, and blue cargo aircraft, tool to deploy is interceptor jets.
Heading is one seven five, target is black, blue, and grey fighter jet, tool to deploy is machine gun.
Heading is three two zero, target is purple and brown cargo aircraft, tool to deploy is surface-to-air missiles.
Heading is one zero zero, target is blue, brown, and grey commercial aircraft,

In [None]:
import jsonlines
import torchaudio
from datasets import Dataset, load_metric, DatasetDict
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from pathlib import Path
import torch
import librosa
import IPython.display as ipd
import jiwer
import base64


# Define the path to the directory
current_directory = Path.cwd()
file_path = current_directory / '..' / '..' / 'novice'
data_dir = file_path.resolve()
print(data_dir, current_directory)

# Read data from a jsonl file and reformat it
data = {'key': [], 'audio': [], 'transcript': []}
with jsonlines.open(data_dir / "asr.jsonl") as reader:
    for obj in reader:
        if len(data['key']) < 10: 
            for key, value in obj.items():
                data[key].append(value)

data2 = {"instances": []}                
for j, i in enumerate(data['key']):
    with open(data_dir / 'audio' / data['audio'][j], "rb") as file:
        audio_bytes = file.read()
        instance = {
            "key": i,
            "b64": base64.b64encode(audio_bytes).decode("ascii"),
            "transcript": data['transcript'][j]
        }
        data2['instances'].append(instance)
        

# Convert to a Hugging Face dataset
dataset = Dataset.from_dict(data2) # converts it into a dataset object which has in-built helper functions to help us later on when we need to do operations on it


In [None]:
dataset[1]['instances']['transcript']

In [None]:
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def run_inference(rank, world_size):
    # create default process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    
    # load a model 
    model = reconstructed_quantized_model
    # model.load_state_dict(PATH)
    model.eval()
    model.to(rank)

    # create a dataloader
    loader = torch.utils.data.DataLoader(dataset=dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=4)
    print(loader)

    # iterate over the loaded partition and run the model
    for idx, data in enumerate(loader):
        print(data)
        pass
    
def main():
    world_size = 4
    mp.spawn(run_inference,
        args=(world_size,),
        nprocs=world_size,
        join=True)

main()

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [None]:
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# load fine-tuned model and tokenizer
model_ckpt = "google/pegasus-cnn_dailymail"
model = PegasusForConditionalGeneration.from_pretrained(model_ckpt)
# quantize model
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)