In [7]:
import pandas as pd
import torch
import open_clip
from PIL import Image
from pathlib import Path

#For Parsing json data
import json

#For Loading Images
from PIL import Image

#For displaying loadbar
from tqdm import tqdm

#Importing pytorch to finetune our clip
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

#Here, we import clip model developed by meta
import clip

#Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor.
from transformers import CLIPProcessor, CLIPModel

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


# Define Constants

In [24]:
INDO_FASHION_FOLDER_PATH = Path('images/indo_fashion')
JSON_PATH = INDO_FASHION_FOLDER_PATH / 'train_data.json'
IMAGE_PATH = INDO_FASHION_FOLDER_PATH / 'train'
BATCH_SIZE = 100
SHUFFLE = True

# Load Images

In [11]:
input_data = []
with open(JSON_PATH, 'r') as f:
	for line in f:
		obj = json.loads(line)
		input_data.append(obj)


In [13]:
# Load the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [19]:
# Choose computation device
device = "cuda:0" if torch.cuda.is_available() else "cpu" 

# Define a custom dataset
from torch.utils.data import Dataset
from transformers import CLIPProcessor
from PIL import Image

class ImageTitleDataset(Dataset):
    def __init__(self, list_image_path, list_txt, processor):
        """
        Args:
            list_image_path (list): List of image file paths.
            list_txt (list): List of corresponding text titles.
            processor (CLIPProcessor): Processor for preprocessing images and text.
        """
        self.image_path = list_image_path
        self.title = list_txt
        self.processor = processor  # Hugging Face CLIP processor

    def __len__(self):
        """Returns the length of the dataset."""
        return len(self.title)

    def __getitem__(self, idx):
        """Returns a preprocessed image and its corresponding tokenized text."""
        image = Image.open(self.image_path[idx]).convert("RGB")  # Ensure image is in RGB
        text = self.title[idx]

        # Preprocess image and tokenize text using the processor
        inputs = self.processor(text=[text], images=image, return_tensors="pt", padding=True)

        # Return preprocessed image and text
        return inputs["pixel_values"].squeeze(0), inputs["input_ids"].squeeze(0)

In [21]:
# use your own data
list_image_path = []
list_txt = []
for item in input_data:
  img_path = IMAGE_PATH / item['image_path'].split('/')[-1]
  
  #As we have image text pair, we use product title as description.
  caption = item['product_title'][:40]
  list_image_path.append(img_path)
  list_txt.append(caption)

In [26]:
dataset = ImageTitleDataset(list_image_path, list_txt, processor=processor)
train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE) 
#Define your own dataloader

# Function to convert model's parameters to FP32 format
#This is done so that our model loads in the provided memory.
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 

# Check if the device is set to CPU
if device == "cpu":
    model.float()  # Convert the model's parameters to float if using CPU

# Prepare the optimizer
optimizer = torch.optim.Adam(
    model.parameters(), lr=5e-5, betas=(0.9, 0.98), eps=1e-6 ,weight_decay=0.2) 
    
# Adam optimizer is used with specific hyperparameters
# lr (learning rate) is set to 5e-5, which is considered safe for fine-tuning to a new dataset
# betas are used for the optimization algorithm
# eps is a small value to prevent division by zero
# weight_decay adds L2 regularization to the optimizer

# Specify the loss function for images
loss_img = nn.CrossEntropyLoss()

# Specify the loss function for text
loss_txt = nn.CrossEntropyLoss()

In [27]:
num_epochs = 4 # Number of training epochs
for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader, total=len(train_dataloader))
    
    # Iterate through the batches in the training data
    for batch in pbar:
        optimizer.zero_grad()  # Zero out gradients for the optimizer
        
        # Extract images and texts from the batch
        images, texts = batch
        
        # Print the current device (CPU or GPU)
        print(device)
        
        # Move images and texts to the specified device (CPU or GPU)
        images = images.to(device)
        texts = texts.to(device)

        # Forward pass through the model
        logits_per_image, logits_per_text = model(images, texts)

        # Compute the loss
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
        total_loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2

        # Backward pass and update the model's parameters
        total_loss.backward()
        
        # If the device is CPU, directly update the model
        if device == "cpu":
            optimizer.step()
        else:
            # Convert model's parameters to FP32 format, update, and convert back
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)

        # Update the progress bar with the current epoch and loss
        pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {total_loss.item():.4f}")

  0%|          | 0/912 [00:00<?, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [10] at entry 0 and [12] at entry 1