In [None]:
# Mount Google Drive inside the Colab VM; the dataset is read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/My Drive/Applied CV Project/freiburg_groceries_dataset

/content/drive/.shortcut-targets-by-id/1i0IgUBpr8uyXxhLQgE5RaQK7Qf6agpOh/Applied CV Project/freiburg_groceries_dataset


In [None]:
ls

[0m[01;34mBEANS[0m/  [01;34mCEREAL[0m/     [01;34mCOFFEE[0m/  [01;34mFLOUR[0m/  [01;34mJUICE[0m/  [01;34mOIL[0m/    [01;34mSODA[0m/    [01;34mTEA[0m/            train_data.json
[01;34mCAKE[0m/   [01;34mCHIPS[0m/      [01;34mCORN[0m/    [01;34mHONEY[0m/  [01;34mMILK[0m/   [01;34mPASTA[0m/  [01;34mSPICES[0m/  test_data.json  [01;34mVINEGAR[0m/
[01;34mCANDY[0m/  [01;34mCHOCOLATE[0m/  [01;34mFISH[0m/    [01;34mJAM[0m/    [01;34mNUTS[0m/   [01;34mRICE[0m/   [01;34mSUGAR[0m/   [01;34mTOMATO_SAUCE[0m/   [01;34mWATER[0m/


In [None]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-dhr1i7om
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-dhr1i7om
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import json
from PIL import Image

from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import clip
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the pre-trained ViT-B/32 CLIP checkpoint (non-JIT) together with its
# matching image preprocessing transform.
model, preprocess = clip.load("ViT-B/32", jit=False, device=device)

In [None]:
class image_title_dataset(Dataset):
    """Dataset of (preprocessed image, tokenized caption) pairs for CLIP.

    Image files that do not exist when the dataset is built are reported and
    dropped together with their caption, so that every remaining index yields
    a real sample and the default DataLoader collation (which cannot batch
    ``None``) keeps working.

    Relies on the notebook-level ``preprocess`` transform and on
    ``clip.tokenize``.

    Args:
        list_image_path: Sequence of image file paths.
        list_txt: Sequence of caption strings, parallel to ``list_image_path``.
    """

    def __init__(self, list_image_path, list_txt):
        import os  # local import: only needed for the existence check below

        kept_paths = []
        kept_txt = []
        for path, txt in zip(list_image_path, list_txt):
            if os.path.isfile(path):
                kept_paths.append(path)
                kept_txt.append(txt)
            else:
                print(f"File not found: {path}. Skipping...")

        self.image_path = kept_paths
        # Tokenize all captions once up front; row i belongs to image_path[i].
        self.title = clip.tokenize(kept_txt)

    def __len__(self):
        """Return the number of (image, caption) pairs that were kept."""
        return len(self.title)

    def __getitem__(self, idx):
        """Return ``(image, title)`` for ``idx``.

        If the file vanished after construction, the previous fallback of
        ``(None, None)`` is kept; batching such a sample needs a custom
        ``collate_fn``.
        """
        try:
            # The context manager closes the file handle once the transform
            # has consumed the image.
            with Image.open(self.image_path[idx]) as img:
                image = preprocess(img)
        except FileNotFoundError:
            print(f"File not found: {self.image_path[idx]}. Skipping...")
            return None, None

        title = self.title[idx]
        return image, title

In [None]:
def load_data(json_path):
    """Read an annotation JSON file into parallel image-path and caption lists.

    Args:
        json_path: Path to a JSON file containing a sequence of records. Each
            record must provide ``'file_path'`` and a non-empty ``'labels'``
            sequence.

    Returns:
        ``(list_image_path, list_txt)``: two lists of equal length, where
        ``list_txt[i]`` is the first label of record ``i``.

    Raises:
        KeyError / IndexError: if a record lacks ``'file_path'`` or has no labels.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(json_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    list_image_path = [item['file_path'] for item in json_data]
    # Only the first label of each record is used as the caption.
    list_txt = [item['labels'][0] for item in json_data]
    return list_image_path, list_txt

In [None]:
# Annotation files, resolved relative to the current working directory.
train_path = './train_data.json'
test_path = './test_data.json'

# load_data returns an (image paths, captions) pair for each split.
train_data = load_data(train_path)
test_data = load_data(test_path)

# Unpack each pair straight into the dataset constructor.
train_set = image_title_dataset(*train_data)
test_set = image_title_dataset(*test_data)

# Only the training split is shuffled.
train_loader = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=64, shuffle=False)

# len() of a DataLoader is its number of batches, not its number of samples.
print(f"Size of train_loader: {len(train_loader)}")
print(f"Size of test_loader: {len(test_loader)}")

Size of train_loader: 63
Size of test_loader: 16


In [None]:
# Function to convert model's parameters to FP32 format
def convert_models_to_fp32(model):
    """Cast every parameter of ``model`` (and its gradient, if any) to float32 in place.

    Parameters without a gradient (frozen, or not reached by the last backward
    pass) have ``grad is None``; only their data is converted.

    Args:
        model: A ``torch.nn.Module`` whose parameters should become float32.
    """
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad is not None:
            p.grad.data = p.grad.data.float()


# On the CPU path, cast the whole model to float32.
if device == "cpu":
    model.float()

# Adam with a deliberately small learning rate, which is safer when
# fine-tuning a pre-trained model on a new dataset.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=5e-5,
    betas=(0.9, 0.98),
    eps=1e-6,
    weight_decay=0.2,
)

# One cross-entropy criterion per direction (image->text and text->image).
loss_img, loss_txt = nn.CrossEntropyLoss(), nn.CrossEntropyLoss()

In [None]:
# Early-stopping state: stop once `patience` consecutive epochs fail to beat
# the best accuracy seen so far.
# NOTE(review): the accuracy used here is measured on the training batches,
# not on a held-out split -- confirm that this is the intended criterion.
best_accuracy = 0
patience = 1  # Number of epochs to wait for improvement
wait_count = 0

# Train the model
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    pbar = tqdm(train_loader, total=len(train_loader))
    total_correct = 0
    total_samples = 0

    for batch in pbar:
        optimizer.zero_grad()

        images, texts = batch

        images = images.to(device)
        texts = texts.to(device)

        # Forward pass: similarity logits in both directions
        logits_per_image, logits_per_text = model(images, texts)

        # Symmetric contrastive loss: sample i is expected to match caption i.
        # NOTE(review): captions come from load_data's labels[0], so a batch
        # may contain identical captions, which makes the "i matches i" target
        # ambiguous -- verify this objective is what is wanted.
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
        total_loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2

        # Compute accuracy
        _, predicted_img = torch.max(logits_per_image, 1)
        _, predicted_text = torch.max(logits_per_text, 1)
        total_correct += (predicted_img == ground_truth).sum().item() + (predicted_text == ground_truth).sum().item()
        # Two predictions (image->text and text->image) are scored per pair,
        # so both must be counted in the denominator to keep accuracy in [0, 1].
        total_samples += 2 * ground_truth.size(0)

        # Backward pass
        total_loss.backward()
        if device == "cpu":
            # The model is already float32 on CPU; step directly.
            optimizer.step()
        else:
            # Step in float32, then convert the weights back with
            # clip.model.convert_weights for the next forward pass.
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)

        pbar.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss.item():.4f}")

    # Print accuracy at the end of each epoch (0.0 if the loader was empty)
    accuracy = total_correct / total_samples if total_samples else 0.0
    print(f"Accuracy at epoch {epoch+1}: {accuracy:.4f}")

    # Check for improvement in accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        wait_count = 0
        # Save the model
        torch.save(model.state_dict(), "clip_freiburg.pth")
        print("Model saved.")
    else:
        wait_count += 1
        if wait_count >= patience:
            print("Early stopping.")
            break

Epoch 1/3, Loss: 1.8066: 100%|██████████| 63/63 [00:36<00:00,  1.72it/s]


Accuracy at epoch 1: 0.5322
Model saved.


Epoch 2/3, Loss: 1.8525: 100%|██████████| 63/63 [00:37<00:00,  1.68it/s]


Accuracy at epoch 2: 0.6035
Model saved.


Epoch 3/3, Loss: 2.1914: 100%|██████████| 63/63 [00:37<00:00,  1.67it/s]

Accuracy at epoch 3: 0.5655
Early stopping.





In [None]:
# Set the model to evaluation mode
model.eval()

test_correct = 0
test_total = 0

# Candidate captions: one row per distinct tokenized label in the test set.
# Every image is classified against all of them, so the result does not
# depend on which other samples happen to share its batch.
class_tokens = torch.unique(test_loader.dataset.title, dim=0).to(device)

# Initialize tqdm progress bar
pbar = tqdm(test_loader, total=len(test_loader), desc="Testing")

# Iterate over the test data
for batch in pbar:
    images, texts = batch
    images = images.to(device)
    texts = texts.to(device)

    with torch.no_grad():  # No need to track gradients during inference
        # Score each image against every candidate caption
        logits_per_image, _ = model(images, class_tokens)
        predicted_class = logits_per_image.argmax(dim=1)

        # Ground truth: index of the candidate row identical to the sample's
        # own tokenized caption (exactly one row matches, since the candidates
        # are the unique captions of this same dataset).
        is_own_class = (texts.unsqueeze(1) == class_tokens.unsqueeze(0)).all(dim=2)
        true_class = is_own_class.long().argmax(dim=1)

        # Compare predictions with the ground truth, not with each other
        test_correct += (predicted_class == true_class).sum().item()
        test_total += images.size(0)

    # Update tqdm progress bar description
    pbar.set_postfix({'Test Accuracy': test_correct / test_total})

# Calculate accuracy (0.0 if the loader was empty)
test_accuracy = test_correct / test_total if test_total else 0.0
print(f"Test Accuracy: {test_accuracy:.4f}")

Testing: 100%|██████████| 16/16 [02:57<00:00, 11.11s/it, Test Accuracy=0.00294]

Test Accuracy: 0.0029



