In [1]:
# Change the kernel's working directory to the parent of the notebook's folder.
# NOTE(review): presumably this is the repository root, so that the `SemCLIP`
# package imports and relative data paths used later resolve — confirm.
%cd ..

/scratch/hb2546/dphil-svt/SemCLIP


In [2]:
# !pip3 install -r requirements.txt
# !pip3 install supervision torch tqdm fire datasets opencv-python openai-clip huggingface-hub torch python-dotenv clip torchvision Pillow pandas numpy matplotlib transformers
# !pip install --upgrade notebook
# !pip install --upgrade ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [3]:
# !bash sam_model_setup.sh
# !pip3 install -q 'git+https://github.com/facebookresearch/segment-anything.git'

In [4]:
from SemCLIP.semclip import SemCLIP
from SemCLIP.image_utils import DEVICE, create_batches, pil_to_cv2
from SemCLIP.model_utils import convert_models_to_fp32, convert_models_to_fp16


# Instantiate the SemCLIP wrapper around the CLIP ViT-B/32 checkpoint, using
# attention pooling and a 512-dimensional projection, on the configured device.
semclip = SemCLIP(
    model_name="openai/clip-vit-base-patch32",
    pool_type='attention',
    projection_dim=512,
    device=DEVICE,
)



In [5]:
import torch

from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

# Fetch the fine-tuning dataset from the Hugging Face Hub.
dataset = load_dataset(
    path="hunarbatra/CLIP-LLaVA-Instruct-COCO-13k",
)

## Download Data Files Locally [Not required to run]

In [17]:
import os
import requests
from datasets import load_dataset


# Reload the dataset and drop the `downloaded_img` column, since the images are
# fetched to disk below instead.
# NOTE: this rebinds `dataset` without `downloaded_img`, which the
# "Finetune Model without downloading images locally" cell reads; re-run the
# dataset-loading cell before running that one.
dataset = load_dataset("hunarbatra/CLIP-LLaVA-Instruct-COCO-13k").remove_columns("downloaded_img")

# Destination folder for the downloaded image files (created if missing).
save_dir = "data/CLIP-LLaVA-Instruct-COCO-13k/"
os.makedirs(save_dir, exist_ok=True)

def download_image(record, timeout=30):
    """Download one record's image into `save_dir` and record where it landed.

    Args:
        record: A dataset row providing `coco_url` (source URL) and `image`
            (file name used for the local copy).
        timeout: Seconds to wait for the server before giving up, so a single
            stalled connection cannot hang a `map` worker indefinitely.

    Returns:
        The same record with `local_path` set to the saved file's path, or to
        `None` when the download failed (network error or non-200 response).
    """
    url = record['coco_url']
    file_name = os.path.join(save_dir, f"{record['image']}")
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                # Nothing was written, so do not advertise a local path.
                print(f"Failed to download {url}: HTTP {response.status_code}")
                record['local_path'] = None
                return record
            with open(file_name, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        record['local_path'] = file_name
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        # Remove a partially written file so a truncated image is never
        # mistaken for a complete one.
        if os.path.exists(file_name):
            os.remove(file_name)
        record['local_path'] = None
    return record

# Download every image using 8 worker processes; each row gains a `local_path`
# column set by `download_image`.
dataset = dataset.map(
    function=download_image,
    num_proc=8,
)

# dataset.save_to_disk("CLIP-LLaVA-Instruct-COCO-13k-no-img")


Map (num_proc=8):   0%|          | 0/13430 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/3365 [00:00<?, ? examples/s]

## Finetune Model with local data/ files [Optional: run only if the images were downloaded to data/]

In [None]:
import cv2
import numpy as np
import clip
import torch
import torch.nn as nn
from tqdm import tqdm
from PIL import Image


# Fine-tune SemCLIP using image files stored on local disk.
# Each batch supplies image file names (`image`) and captions (`caption`);
# the model resolves the files inside `data_dir`.
semclip.model.to(DEVICE)

if DEVICE == "cpu":
    semclip.model.float() # convert the model params to float if using CPU

optimizer = torch.optim.Adam(semclip.model.parameters(), lr=1e-5)

loss = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 1
batch_size = 64
# NOTE(review): the download cell saves into "data/CLIP-LLaVA-Instruct-COCO-13k/";
# confirm that `images_folder` is resolved relative to "data/".
data_dir = 'CLIP-LLaVA-Instruct-COCO-13k'

# The scheduler must be created in this cell: the loop below calls
# `scheduler.step()`, and relying on a later cell to define it raises a
# NameError on a fresh kernel.
steps_per_epoch = len(dataset['train']) // batch_size
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, steps_per_epoch * num_epochs)

for epoch in range(num_epochs):
    semclip.model.train()
    # Rebuild the batch iterator every epoch so each epoch sees the full data.
    train_loader = create_batches(dataset['train'], batch_size)

    pbar = tqdm(train_loader, total=steps_per_epoch, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in pbar:
        optimizer.zero_grad()

        image_batch = batch['image']
        text_batch = batch['caption']

        if DEVICE != "cpu":
            # Weights were cast to fp16 at the end of the previous step.
            convert_models_to_fp32(semclip.model)

        # Forward pass through the model
        logits_per_image, logits_per_text = semclip.get_semclip_embeddings(images=image_batch, captions=text_batch, images_folder=data_dir)

        # Compute the loss: the i-th image matches the i-th caption, so the
        # targets are the diagonal indices. Sized from the actual batch so a
        # short final batch still works.
        ground_truth = torch.arange(len(image_batch), dtype=torch.long).to(DEVICE)
        logits_per_image.requires_grad_()
        logits_per_text.requires_grad_()
        total_loss = (loss(logits_per_image, ground_truth) + loss(logits_per_text, ground_truth)) / 2

        # Backward pass
        total_loss.backward()

        # if the device is CPU, directly update the model
        if DEVICE == "cpu":
            optimizer.step()
            scheduler.step()
        else:
            # Step the optimizer on fp32 weights, then cast back to fp16.
            convert_models_to_fp32(semclip.model)
            optimizer.step()
            scheduler.step()
            convert_models_to_fp16(semclip.model)

        # Update the progress bar with the current loss
        pbar.set_postfix(Loss=total_loss.item())

## Finetune Model without downloading images locally [with HF data]

In [7]:
import cv2
import numpy as np
import clip
import torch

import torch.nn as nn

from tqdm import tqdm
from PIL import Image


# Fine-tune SemCLIP directly on the images embedded in the Hugging Face dataset
# (`downloaded_img` column), without reading image files from disk.
semclip.model.to(DEVICE)

if DEVICE == "cpu":
    semclip.model.float() # convert the model params to float if using CPU

# optimizer = torch.optim.AdamW(semclip.model.parameters(), lr=5e-5, betas=(0.9, 0.98), eps=1e-6, weight_decay=0.2) # weight decay adds L2 regularization to the optimizer
optimizer = torch.optim.Adam(semclip.model.parameters(), lr=1e-5)

loss = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 1
batch_size = 64

# Cosine learning-rate decay over the total number of optimizer steps.
steps_per_epoch = len(dataset['train']) // batch_size
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, steps_per_epoch * num_epochs)

for epoch in range(num_epochs):
    semclip.model.train()
    # Rebuild the batch iterator every epoch: a one-shot iterator created once
    # outside the loop would be empty from the second epoch onwards.
    train_loader = create_batches(dataset['train'], batch_size)

    pbar = tqdm(train_loader, total=steps_per_epoch, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in pbar:
        optimizer.zero_grad()

        image_batch_pil = batch['downloaded_img']
        text_batch = batch['caption']

        # Convert the batch of PIL images to OpenCV images.
        # Every image is normalised to 3-channel RGB first.
        # NOTE(review): the previously saved run stopped with "IndexError: too
        # many indices for array: array is 2-dimensional, but 3 were indexed",
        # which is consistent with a grayscale image reaching channel
        # indexing — confirm against the full traceback.
        image_batch_cv2 = [pil_to_cv2(img.convert("RGB")) for img in image_batch_pil]

        if DEVICE != "cpu":
            # Weights were cast to fp16 at the end of the previous step.
            convert_models_to_fp32(semclip.model)

        # Forward pass through the model
        logits_per_image, logits_per_text = semclip.get_semclip_embeddings_direct_img(images=image_batch_cv2, captions=text_batch)

        # Compute the loss: the i-th image matches the i-th caption. Targets are
        # sized from the actual batch (not `batch_size`) so a short final batch
        # does not cause a shape mismatch.
        ground_truth = torch.arange(len(image_batch_cv2), dtype=torch.long).to(DEVICE)
        logits_per_image.requires_grad_()
        logits_per_text.requires_grad_()
        total_loss = (loss(logits_per_image, ground_truth) + loss(logits_per_text, ground_truth)) / 2

        # Backward pass
        total_loss.backward()

        # if the device is CPU, directly update the model
        if DEVICE == "cpu":
            optimizer.step()
            scheduler.step()
        else:
            # Step the optimizer on fp32 weights, then cast back to fp16.
            convert_models_to_fp32(semclip.model)
            optimizer.step()
            scheduler.step()
            convert_models_to_fp16(semclip.model)

        # Update the progress bar with the current loss
        pbar.set_postfix(Loss=total_loss.item())


Epoch 1/1:   0%|          | 0/209 [00:00<?, ?it/s]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63


Epoch 1/1:   0%|          | 1/209 [03:02<10:32:47, 182.54s/it, Loss=6.26]


IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed

In [None]:
# Publish the fine-tuned weights to the Hugging Face Hub.
# The training loops update `semclip.model` in place, so it is uploaded as-is.
# The previous `semclip.model = model` line was removed because `model` is
# never defined in this notebook and raised a NameError.
semclip.upload_model_to_hf_hub(model_name='semclip-v1', hf_name='hunarbatra')