# Fine-tuning CLIP
Run the following cell if running this Jupyter Notebook on Google Colab to install additional necessary libraries before you begin. If you are running this on your Vertex AI Workbench Instance, you will likely already have installed these libraries.

In [1]:
%%capture
# for google Colab
!pip install accelerate transformers==4.37.0 datasets
!pip install --upgrade --q datasets transformers accelerate soundfile librosa evaluate jiwer tensorboard gradio

## Initialize CLIP Model
Here we initialize the CLIP model as well as a particular tokenizer; here we've chosen the RoBERTa tokenizer.

In [1]:
import torch
from datasets import load_dataset
from PIL import Image
from torchvision.io import ImageReadMode, read_image
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
from torchvision.transforms.functional import InterpolationMode
from transformers import (
    Trainer,
    TrainingArguments,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    AutoTokenizer,
    AutoImageProcessor
)

from typing import List
import io
from transformers import AutoProcessor, Owlv2ForObjectDetection
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"
name = "google/owlv2-base-patch16-ensemble"
processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
model.to(device)
image_processor = AutoImageProcessor.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

config = model.config
image_processor

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f191f7cb1c0>>
Traceback (most recent call last):
  File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
  warn(f"Failed to load image Python extension: {e}")


Owlv2ImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_pad",
    "do_resize",
    "size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Owlv2ImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "Owlv2Processor",
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 960,
    "width": 960
  }
}

Now we load our datasets. Here we're loading a small dummy COCO dataset.

# Example of data (image, text pairs)

In [24]:
import jsonlines
import torchaudio
from datasets import Dataset, load_metric, DatasetDict
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from pathlib import Path
import torch
import librosa
import IPython.display as ipd
import jiwer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Define the path to the directory
current_directory = Path.cwd()
file_path = current_directory / '..' / '..' / 'novice'
data_dir = file_path.resolve()
print(data_dir, current_directory)

/home/jupyter/novice /home/jupyter/til-24-base/vlm


In [26]:
import jsonlines
import torchaudio
from datasets import Dataset, load_metric, DatasetDict
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from pathlib import Path
import torch
import librosa
import IPython.display as ipd
import jiwer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Define the path to the directory
current_directory = Path.cwd()
file_path = current_directory / '..' / '..' / 'novice'
data_dir = file_path.resolve()
print(data_dir, current_directory)

# Read data from a jsonl file and reformat it
data = {'key': [], 'image': [], 'caption': [], 'bbox': []}
counter = 0
with jsonlines.open(data_dir / "vlm.jsonl") as reader:
    for i, obj in enumerate(reader):
        if len(data['image']) < 10:
            for item in obj['annotations']:
                data['key'].append(counter)
                data['caption'].append(item['caption'])
                data['image'].append(obj['image'])
                data['bbox'].append(item['bbox'])
                counter += 1
                
# Convert to a Hugging Face dataset
dataset = Dataset.from_dict(data) # converts it into a dataset object which has in-built helper functions to help us later on when we need to do operations on it
# think of it as a special pandas library :)

# Shuffle the dataset
dataset = dataset.shuffle(seed=42) # shuffle the dataset (one of the in-built helper functions of the Hugging Face dataset)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
print(train_size, val_size, test_size)

train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'val': val_dataset})

dataset

/home/jupyter/novice /home/jupyter/til-24-base/vlm
8 1 2


DatasetDict({
    train: Dataset({
        features: ['key', 'image', 'caption', 'bbox'],
        num_rows: 8
    })
    test: Dataset({
        features: ['key', 'image', 'caption', 'bbox'],
        num_rows: 2
    })
    val: Dataset({
        features: ['key', 'image', 'caption', 'bbox'],
        num_rows: 1
    })
})

In [20]:
dataset['train'][0]

{'key': 6,
 'image': 'image_2.jpg',
 'caption': 'blue and yellow fighter jet',
 'bbox': [836, 464, 36, 36]}

# Preprocess the data

We need to pre-process our dataset such that our model will be able to recognize it. So first we define our image preprocessing logic (e.g. resizing, converting to the correct datatype, normalization, etc.), as well as our text preprocessing logic (i.e. tokenization), then apply it to our datasets, both train and eval.

In [29]:
# We use torchvision for faster image pre-processing. The transforms are implemented as nn.Module,
# so we jit it to be faster.
class Transform(torch.nn.Module):
    def __init__(self, image_size, mean, std):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x) -> torch.Tensor:
        """`x` should be an instance of `PIL.Image.Image`"""
        with torch.no_grad():
            x = self.transforms(x)
        return x

# For preprocessing the datasets.
# Initialize torchvision transforms and jit it for faster processing.
image_transformations = Transform(
    config.vision_config.image_size, image_processor.image_mean, image_processor.image_std
)
image_transformations = torch.jit.script(image_transformations)

960

In [30]:
def preprocess_dataset(dataset, split):
    # Preprocessing the datasets.
    data = dataset[split]
    # We need to tokenize inputs and targets.
    column_names = data.column_names

    # 6. Get the column names for input/target.
    image_column = "image_path"
    caption_column = "caption"
    dataset_columns = (image_column, caption_column)

    # Preprocessing the datasets.
    # We need to tokenize input captions and transform the images.
    def tokenize_captions(examples):
        captions = list(examples[caption_column])
        text_inputs = tokenizer(captions, padding="max_length", truncation=True)
        examples["input_ids"] = text_inputs.input_ids
        examples["attention_mask"] = text_inputs.attention_mask
        return examples

    def transform_images(examples):
        images = [read_image(image_file, mode=ImageReadMode.RGB) for image_file in examples[image_column]]
        examples["pixel_values"] = [image_transformations(image) for image in images]
        return examples

    data = data.map(
        function=tokenize_captions,
        batched=True,
        remove_columns=[col for col in column_names if col != image_column],
        desc=f"Running tokenizer on {split} dataset",
    )

    # Transform images on the fly as doing it on the whole dataset takes too much time.
    data.set_transform(transform_images)
    return data

In [34]:
train_dataset = preprocess_dataset(dataset, "train")
eval_dataset = preprocess_dataset(dataset, "val")

Running tokenizer on train dataset:   0%|          | 0/8 [00:00<?, ? examples/s]

Running tokenizer on val dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Finally we need to write a small function to handle the batching logic for our training. This collates all passed training items in the batch together such that we can pass it to the model for training, along with the kwarg `return_loss=True` such that the model will return its loss for backpropagation.

In [35]:
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    input_ids = torch.tensor([example["input_ids"] for example in examples], dtype=torch.long)
    attention_mask = torch.tensor([example["attention_mask"] for example in examples], dtype=torch.long)
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "return_loss": True,
    }

# Train

Now we're ready to actually train our CLIP model!

In [36]:
# initialize Trainer
training_args = TrainingArguments(
    learning_rate=5e-5,
    warmup_steps=0,
    weight_decay=0.1,
    per_device_train_batch_size=16,
    logging_steps=5,
    save_steps=5,
    remove_unused_columns=False,
    output_dir="clip-finetune",
    report_to='none', # disable wandb
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
)
train_result = trainer.train()

KeyError: 'image_path'

In [None]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 2.0890517234802246, 'eval_runtime': 2.916, 'eval_samples_per_second': 27.435, 'eval_steps_per_second': 3.429, 'epoch': 3.0}


Once the model is trained, we can save it to our defined `output_dir` (in this case `clip-finetune`) so we can import it into our applications later.

In [None]:
trainer.save_model("clip-finetune")
tokenizer.save_pretrained("clip-finetune")
image_processor.save_pretrained("clip-finetune")

['clip-finetune/preprocessor_config.json']

## Resources
* [HF Transformers on training CLIP](https://github.com/huggingface/transformers/tree/main/examples/pytorch/contrastive-image-text)