<a href="https://colab.research.google.com/github/hmin27/2023_DL_Clip/blob/main/CLIP(Finetune).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CLIP Fine tuning
- Food image classification
- Baseline of Fine Tuned CLIP model


In [1]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-7xe5obbp
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-7xe5obbp
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369500 sha256=80bc16280ec3dc621072ef9bce3cd8a9a2495ab3cfe20a845c26

In [14]:
import os
import clip
import torch
from torch import nn, optim
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.datasets import ImageFolder

%matplotlib inline
BATCH_SIZE = 32
EPOCH = 3
LR = 1e-5

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


# Prepare the Model and Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
model = model.to(torch.float32)

torch.manual_seed(42)

100%|███████████████████████████████████████| 338M/338M [00:05<00:00, 66.6MiB/s]


<torch._C.Generator at 0x7883636a5910>

In [7]:
# Creating image path, text list
import pandas as pd

data_folder = '/content/drive/MyDrive/Study/DL_CLIP/Food'

image_paths = []
text_descriptions = []
class_folders = os.listdir(data_folder)

for class_folder in class_folders:
    class_folder_path = os.path.join(data_folder, class_folder)
    image_files = os.listdir(class_folder_path)

    for image_file in image_files:
        image_path = os.path.join(class_folder_path, image_file)
        image_paths.append(image_path)

        # Create text description using class label
        text_description = f"a photo of {class_folder.replace('_',' ')}"
        text_descriptions.append(text_description)

print(text_descriptions)
len(text_descriptions)  # 21091개


['a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut', 'a photo of donut',

21091

In [11]:
class MyDataset(Dataset):
    def __init__(self, image_paths, text_descriptions, preprocess):
        self.image_paths = image_paths
        self.text_descriptions = text_descriptions
        self.preprocess = preprocess

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx])
        image = self.preprocess(image)
        text = clip.tokenize(self.text_descriptions[idx])
        return image, text


dataset = MyDataset(image_paths, text_descriptions, preprocess)

# train : validation : test = 7:2:1
total_size = len(dataset)
train_size = int(0.7 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])


trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
testloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [12]:
from numpy.lib import shape_base
for batch in trainloader:
    images, texts = batch
    # Print the first batch
    print("Image Path:", images[0].shape)
    print("Text Description:", texts[0].shape)
    break

Image Path: torch.Size([3, 224, 224])
Text Description: torch.Size([1, 77])


# Training

In [15]:
def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        p.grad.data = p.grad.data.float()

loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(trainloader)*EPOCH)

In [16]:
from tqdm.notebook import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def compute_accuracy(logits, ground_truth):
    _, predicted = logits.max(1)
    total = ground_truth.size(0)
    correct = (predicted == ground_truth).sum().item()
    return correct / total

for epoch in range(EPOCH):
    print(f"Epoch: {epoch+1}")

    # Training loop
    model.train()
    train_total, train_correct = 0, 0
    pbar = tqdm(trainloader, total=len(trainloader))
    for batch in pbar:
        optimizer.zero_grad()

        images, texts = batch
        texts = texts.squeeze(1)
        images = images.to(device)
        texts = texts.to(device)

        logits_per_image, logits_per_text = model(images, texts)

        # Compute loss
        actual_batch_size = images.size(0)
        ground_truth = torch.arange(actual_batch_size).to(device)
        total_loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2

        # Compute train accuracy
        train_correct += (logits_per_image.argmax(dim=1) == ground_truth).float().sum().item()
        train_total += images.size(0)

        total_loss.backward()

        if device == "cpu":
            optimizer.step()
        else :
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)

        train_accuracy = 100 * train_correct / train_total
        pbar.set_description(f"Epoch {epoch+1}/{EPOCH}, Loss: {total_loss.item():.4f}, Train Acc: {train_accuracy:.2f}%")

    # Validation loop
    model.eval()
    val_total, val_correct = 0, 0
    with torch.no_grad():
        for batch in valloader:
            images, texts = batch
            texts = texts.squeeze(1)
            images = images.to(device)
            texts = texts.to(device)

            logits_per_image, _ = model(images, texts)

            actual_batch_size = logits_per_image.size(0)
            ground_truth = torch.arange(actual_batch_size).to(device)
            # ground_truth = torch.arange(BATCH_SIZE).to(device)

            val_correct += (logits_per_image.argmax(dim=1) == ground_truth).float().sum().item()
            val_total += images.size(0)

    val_accuracy = 100 * val_correct / val_total
    print(f"Validation Accuracy: {val_accuracy:.2f}%")



Epoch: 1


  0%|          | 0/462 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
torch.save(model.state_dict(), 'CLIP_v1.pth')