In [1]:
!pip install --upgrade --q datasets transformers accelerate soundfile librosa evaluate jiwer tensorboard gradio

In [2]:
import jsonlines
import torchaudio
from datasets import Dataset, load_metric, DatasetDict
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from pathlib import Path
import torch
import librosa
import IPython.display as ipd
import jiwer
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [3]:
# Select the compute device: the first CUDA GPU when one is available, CPU otherwise.
device = "cpu" if not torch.cuda.is_available() else "cuda:0"
device

'cuda:0'

In [20]:
# Locate the shared `novice` data directory, two levels above the working directory.
current_directory = Path.cwd()
file_path = current_directory / '..' / '..' / 'novice'
data_dir = file_path.resolve()
print(data_dir, current_directory)

# Flatten vlm.jsonl into one row per annotation: a running integer key, the
# record's image entry, the annotation caption and its bounding box.
MAX_IMAGE_ROWS = 3  # debugging cap: stop once at least this many rows were collected
data = {'key': [], 'image': [], 'caption': [], 'bbox': []}
counter = 0
with jsonlines.open(data_dir / "vlm.jsonl") as reader:
    for obj in reader:
        # The cap is checked once per record, so every annotation of the record
        # that crosses the threshold is still kept (same rows as before).
        if len(data['image']) >= MAX_IMAGE_ROWS:
            # Previously the loop kept parsing (and discarding) the rest of the file.
            break
        for item in obj['annotations']:
            data['key'].append(counter)
            data['image'].append(obj['image'])
            data['caption'].append(item['caption'])
            data['bbox'].append(item['bbox'])
            counter += 1

/home/jupyter/novice /home/jupyter/til-24-base/vlm


In [21]:
# Wrap the column dict in a Hugging Face Dataset and shuffle it with a fixed seed.
dataset = Dataset.from_dict(data).shuffle(seed=42)

# 80% train, 10% validation, whatever remains goes to test.
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
print(train_size, val_size, test_size)

# Carve three contiguous index ranges out of the shuffled rows.
val_end = train_size + val_size
train_dataset = dataset.select(range(0, train_size))
val_dataset = dataset.select(range(train_size, val_end))
test_dataset = dataset.select(range(val_end, val_end + test_size))

dataset = DatasetDict({'train': train_dataset, 'test': test_dataset, 'val': val_dataset})

dataset

2 0 1


DatasetDict({
    train: Dataset({
        features: ['key', 'image', 'caption', 'bbox'],
        num_rows: 2
    })
    test: Dataset({
        features: ['key', 'image', 'caption', 'bbox'],
        num_rows: 1
    })
    val: Dataset({
        features: ['key', 'image', 'caption', 'bbox'],
        num_rows: 0
    })
})

In [22]:
from PIL import Image
from datasets import DatasetDict
import numpy as np

def replace_image(batch):
    """Swap the row's image filename for a dict describing the opened image.

    Opens ``data_dir / "images" / <filename>`` with PIL, prints the image size,
    and stores the PIL image (under ``'array'``), the original filename, the
    size and the mode in ``batch['image']``. Returns the updated row.
    """
    filename = batch['image']
    opened = Image.open(data_dir / "images" / filename)
    batch['image'] = dict(
        array=opened,
        path=filename,
        size=opened.size,
        mode=opened.mode,
    )
    print(opened.size)
    return batch

# Run the replacement over the dataset (one call per row).
dataset = dataset.map(replace_image)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

(1520, 870)
(1520, 870)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

(1520, 870)


In [24]:
# Show the first training row as a quick sanity check of the mapped schema.
print("\nFirst Item in the Dataset:", dataset['train'][0], sep="\n")


First Item in the Dataset:
{'key': 2, 'image': {'array': {'bytes': None, 'path': '/home/jupyter/novice/images/image_0.jpg'}, 'mode': 'RGB', 'path': 'image_0.jpg', 'size': [1520, 870]}, 'caption': 'blue and white commercial aircraft', 'bbox': [800, 320, 128, 36]}


In [32]:
from typing import List
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection 
import io

import json
import torch
from datasets import load_dataset
from PIL import Image
from torchvision.io import ImageReadMode, read_image
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
from torchvision.transforms.functional import InterpolationMode
from transformers import (
    Trainer,
    TrainingArguments,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    AutoTokenizer,
    AutoImageProcessor
)
from transformers import BertConfig, BertModel
from transformers import AutoTokenizer

# Zero-shot object detection checkpoint and the processor published with it.
model_id = "IDEA-Research/grounding-dino-tiny"
device = "cpu" if not torch.cuda.is_available() else "cuda"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)

# Disabled alternative kept for reference (a bare string literal, not executed as code).
'''
model = VisionTextDualEncoderModel.from_vision_text_pretrained(
    "openai/clip-vit-base-patch32", "FacebookAI/roberta-base"
)
'''
# tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
# image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
# processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)

# BERT encoder loaded without its pooling layer, plus the matching tokenizer.
config = BertConfig.from_pretrained("bert-base-uncased")
nlp_model = BertModel.from_pretrained(
    "bert-base-uncased",
    add_pooling_layer=False,
    config=config,
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [38]:
# Rebind `tokenizer` to the tokenizer published under `model_id`; any earlier binding of the name is replaced.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [37]:
!pip --q install trl
from trl.commands.cli_utils import SftScriptArguments, TrlParser

parser = TrlParser((SftScriptArguments, TrainingArguments))
args, training_args = parser.parse_args_and_config()

Collecting trl
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.4-py3-none-any.whl.metadata (7.9 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading tyro-0.8.4-py3-none-any.whl (102 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m542.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading shtab-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, trl
Successfully installed shtab-1.7.1 trl-0.8.6 tyro-0.8.4


usage: ipykernel_launcher.py [-h] [--dataset_name DATASET_NAME]
                             [--dataset_text_field DATASET_TEXT_FIELD]
                             [--dataset_train_name DATASET_TRAIN_NAME]
                             [--dataset_test_name DATASET_TEST_NAME]
                             [--max_seq_length MAX_SEQ_LENGTH]
                             [--packing [PACKING]] [--config CONFIG]
                             [--gradient_checkpointing_use_reentrant [GRADIENT_CHECKPOINTING_USE_REENTRANT]]
                             --output_dir OUTPUT_DIR
                             [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
                             [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]
                             [--do_predict [DO_PREDICT]]
                             [--eval_strategy {no,steps,epoch}]
                             [--prediction_loss_only [PREDICTION_LOSS_ONLY]]
                             [--per_device_train_batch_size PER_DEVICE_TRAIN_BATC

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [34]:
# Bind each split to its own name; the 'val' split is exposed as `eval_dataset`.
train_dataset, test_dataset, eval_dataset = (
    dataset[split] for split in ('train', 'test', 'val')
)

In [35]:
# initialize Trainer
training_args = TrainingArguments(
    learning_rate=5e-5,
    warmup_steps=0,
    weight_decay=0.1,
    per_device_train_batch_size=16,
    logging_steps=5,
    save_steps=5,
    remove_unused_columns=False,
    output_dir="clip-finetune",
    report_to='none', # disable wandb
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
)
train_result = trainer.train()

NameError: name 'collate_fn' is not defined

In [7]:
from transformers import WhisperProcessor

# Whisper-small processor; `language` and `task` are passed through to `from_pretrained` (English transcription).
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="English", task="transcribe")

import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

# put together a list of samples into a mini training batch, https://www.youtube.com/watch?v=-RPeakdlHYo
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech-to-text samples into one padded training batch.

    Each sample must provide ``"input_features"`` and ``"labels"``. The two are
    padded separately — features through ``processor.feature_extractor.pad``,
    label ids through ``processor.tokenizer.pad`` — and padded label positions
    are set to -100.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        audio_inputs = []
        label_inputs = []
        for sample in features:
            audio_inputs.append({"input_features": sample["input_features"]})
            label_inputs.append({"input_ids": sample["labels"]})

        # Audio side: the feature extractor pads and returns torch tensors.
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # Text side: pad the label sequences, then overwrite the padded
        # positions with -100 so they are ignored by the loss.
        padded_labels = self.processor.tokenizer.pad(label_inputs, return_tensors="pt")
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every row starts with the BOS token, drop that column here;
        # it is appended again later anyway.
        starts_with_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance built around the module-level `processor`.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation pass.

    Args:
        pred: prediction object exposing ``predictions`` (predicted token ids)
            and ``label_ids`` (reference token ids, with -100 at ignored
            positions).

    Returns:
        ``{"wer": <percentage>}``.

    Note:
        ``pred.label_ids`` is modified in place: -100 entries are overwritten
        with the pad token id so they can be decoded.
    """
    # Decode with the tokenizer that belongs to the speech `processor` instead
    # of whatever unrelated object the global name `tokenizer` is bound to.
    text_tokenizer = processor.tokenizer

    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = text_tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = text_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = text_tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer_percent = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_percent}

In [12]:
from transformers import WhisperForConditionalGeneration

# Fine-tune openai/whisper-small: load the pretrained checkpoint, configure
# Seq2Seq training, build the trainer and run it.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.to(device)

# Clear the forced decoder ids and the suppressed-token list in the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-hi",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=10,
    max_steps=4000,  # the recorded log notes this overrides num_train_epochs
    gradient_checkpointing=True,
    fp16=True,
    # NOTE(review): newer transformers releases appear to call this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    # Best checkpoint = lowest "wer" as reported by `compute_metrics`.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # use_cpu=False
)

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    # NOTE(review): the 'test' split drives evaluation and best-checkpoint
    # selection here — confirm that is intended.
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not a tokenizer) is passed under `tokenizer` —
    # presumably so it is saved alongside checkpoints; verify.
    tokenizer=processor.feature_extractor,
)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Wer
1000,0.0,0.011708,1.169591
2000,0.0,0.013134,1.169591
3000,0.0,0.014207,1.169591
4000,0.0,0.01441,1.169591


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=4000, training_loss=0.009893727838207269, metrics={'train_runtime': 14378.5323, 'train_samples_per_second': 4.451, 'train_steps_per_second': 0.278, 'total_flos': 1.846946562048e+19, 'train_loss': 0.009893727838207269, 'epoch': 800.0})

In [14]:
# Save the trainer's model into './whisper-small-hi'.
trainer.save_model('./whisper-small-hi')

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


In [7]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
from jiwer import wer
from functools import reduce


# Inference setup: stock whisper-small processor plus the fine-tuned weights
# stored in the step-1000 checkpoint directory.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("./whisper-small-hi/checkpoint-1000")
model.config.forced_decoder_ids = None

# Resolve the `novice` data directory two levels above the working directory.
current_directory = Path.cwd()
file_path = current_directory.joinpath('..', '..', 'novice')
data_dir = file_path.resolve()
print(data_dir, current_directory)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/home/jupyter/novice /home/jupyter/til-24-base/asr


In [None]:
def predict_audio(sample):
    """Transcribe one dataset row and attach its word error rate.

    Reads ``sample['audio']`` (``"array"`` and ``"sampling_rate"``) and
    ``sample['transcript']``, runs the module-level ``processor`` and
    ``model``, and stores ``wer(reference, hypothesis)`` under ``sample['wer']``.

    Returns:
        The same ``sample`` mapping with the added ``'wer'`` entry.
    """
    audio = sample['audio']
    reference = sample['transcript']

    # Waveform -> model input features.
    encoded = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    )

    # Generate token ids, then decode the first (only) result back to text.
    token_ids = model.generate(encoded.input_features)
    hypothesis = processor.batch_decode(token_ids, skip_special_tokens=True)[0]

    sample['wer'] = wer(reference, hypothesis)
    return sample
    
# Score every validation row with `predict_audio`, then report the mean WER.
prediction = dataset['val'].map(predict_audio)
print(prediction)

# `val_wer` stays a fraction (the scale `wer` returns); an empty split yields
# NaN instead of raising ZeroDivisionError.
val_wer = (
    sum(row['wer'] for row in prediction) / len(prediction)
    if len(prediction)
    else float('nan')
)

# The label says percent, so scale by 100 (the value was printed as a raw fraction before).
print(f"WER%: {100 * val_wer}")

In [15]:
from torchaudio import transforms

def predict_audio_from_file(file_path):
    """Transcribe an audio file with the module-level ``processor`` and ``model``.

    Args:
        file_path: path of an audio file readable by ``torchaudio.load``.

    Returns:
        The decoded transcript (special tokens skipped) as a string.
    """
    target_rate = 16000  # sampling rate required by the model

    speech_array, sampling_rate = torchaudio.load(file_path)

    # resample to 16000 hz (required by model)
    if sampling_rate != target_rate:
        speech_array = transforms.Resample(sampling_rate, target_rate)(speech_array)

    # Collapse the leading channel axis. Averaging leaves a single-channel
    # recording unchanged and down-mixes multi-channel audio to mono, where the
    # former `squeeze(0)` left a 2-D tensor behind.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)

    # A plain 1-D NumPy waveform is all the processor needs; the values were
    # previously wrapped in a `DatasetDict`, which is a container for dataset
    # splits rather than a general-purpose dict.
    input_features = processor(
        speech_array.numpy(), sampling_rate=target_rate, return_tensors="pt"
    ).input_features

    # generate predicted token ids
    predicted_ids = model.generate(input_features)
    # decode predicted token ids to text
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

In [16]:
import time

# perf_counter() is the monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and can jump.
start = time.perf_counter()

# prediction = predict_audio_from_file(data_dir / 'audio' / 'audio_1000.wav')
prediction = predict_audio_from_file('audio_2.m4a')
# NOTE(review): confirm this reference transcript belongs to 'audio_2.m4a' and
# not to the commented-out 'audio_1000.wav' sample above.
transcript = "Heading is one niner five, target is yellow missile, tool to deploy is surface-to-air missiles."
print(f"Actual: {transcript}\n")
print(f"Prediction: {prediction}\n")
print(f"WER%: {100* wer(transcript, prediction)}\n")

end = time.perf_counter()
print(f"Time Taken: {end - start:.2f}s")

Actual: Heading is one niner five, target is yellow missile, tool to deploy is surface-to-air missiles.

Prediction: Heading is one seven zero, target is purple, blue, grey fighter jet, tool to deploy is electromagnetic pulse.

WER%: 60.0

Time Taken: 7.30s


In [3]:
# Load the clip; the bare tuple on the last line displays the loaded waveform and its sampling rate.
speech_array, sampling_rate = torchaudio.load('audio_2.m4a')
speech_array, sampling_rate

(tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.0002, -0.0002, -0.0002]]), 48000)

In [14]:
from torchaudio import functional




# Load the clip and bring it to 16 kHz.
speech_array, sampling_rate = torchaudio.load('audio_2.m4a')

# `functional.resample` returns the resampled waveform directly. Unlike the
# `transforms.Resample` module it is not a callable to apply afterwards, so
# calling its result (as the previous `transform(speech_array)` did) fails.
speech_array = functional.resample(speech_array, sampling_rate, 16000)

print(speech_array)

tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.0001, -0.0002, -0.0002]])
