# Adapting CLIP model

In [1]:
######## Mount the drive ########

from google.colab import drive
drive.mount('/content/drive/')

######## Install the dependencies ########
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
%cd /content/drive/MyDrive/adapting-CLIP
#%cd ../../adapting-CLIP

Mounted at /content/drive/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-g25ioto1
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-g25ioto1
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for cl

In [2]:
#import argparse
#import os.path as osp
from tqdm import tqdm
import numpy as np
import torch
from models.slic_vit import SLICViT
from models.ss_baseline import SSBaseline
from models.resnet_high_res import ResNetHighRes
from utils.zsg_data import FlickrDataset, VGDataset
from utils.grounding_evaluator import GroundingEvaluator

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2
import random
%matplotlib inline

In [3]:
# Reference command-line invocation, kept for comparison:
# ! python eval.py --model vit14 --dataset flickr_s1_val --iou_thr 0.5 --num_samples 500

# Keyword arguments forwarded verbatim to the SLICViT constructor.
args = dict(
    model='vit14',
    alpha=0.75,
    aggregation='mean',
    n_segments=list(range(100, 601, 50)),  # 100, 150, ..., 600
    temperature=0.02,
    upsample=2,
    start_block=0,
    compactness=50,
    sigma=0,
)

# IoU threshold handed to the grounding evaluator in the evaluation cell.
iou_thr = 0.5

# Full validation split; the evaluation cell draws its random subset from it.
dataset_full = FlickrDataset(data_type='flickr30k_c1/val')

# Instantiate the model directly and move it to the GPU.
model = SLICViT(**args).cuda()

100%|████████████████████████████████████████| 890M/890M [00:08<00:00, 111MiB/s]


In [32]:
######### Evaluate the model #########
# Number of images to evaluate and the seed used to pick them.
# A fixed seed makes the sampled subset (and therefore the reported accuracy)
# reproducible across re-runs; set SAMPLE_SEED = None for a fresh draw each run.
num_samples = 16
SAMPLE_SEED = 0

# Never request more images than the dataset holds: random.sample would raise
# an unhelpful "Sample larger than population" error otherwise.
num_samples = min(num_samples, len(dataset_full))
if num_samples <= 0:
    raise ValueError('num_samples must be positive and dataset_full must not be empty')

# Randomly select images (without replacement)
idxs = random.Random(SAMPLE_SEED).sample(range(len(dataset_full)), num_samples)

# Create a random subset of the dataset: build a second dataset object of the
# same type and overwrite its parallel lists with the sampled entries only.
dataset = FlickrDataset(data_type=dataset_full.data_type)
dataset.image_paths = [dataset_full.image_paths[idx] for idx in idxs]
dataset.bboxes = [dataset_full.bboxes[idx] for idx in idxs]
dataset.phrases = [dataset_full.phrases[idx] for idx in idxs]

# Lists to hold loaded data (reused by the visualization cell)
imgs = []
texts = []
bbox_gts = []
bbox_preds = []

# Predict the bounding boxes
for idx in tqdm(range(len(dataset))):

    # Data loading - do not call __getitem__ repeatedly
    data = dataset[idx]
    im = data['image']
    # Only the first phrase of each sample is evaluated.
    text = data['phrases'][0]
    bbox_gts.append(data['bbox'])

    # Predict; the second return value is not needed here.
    bbox_pred, _ = model(im, text)

    # Hold loaded data
    imgs.append(im)
    texts.append(text)
    # Keep the first entry of the prediction
    # (assumes the model returns one box per row - TODO confirm).
    bbox_preds.append(bbox_pred[0])

# Evaluate the model against the ground truth stored in the subset dataset
evaluator = GroundingEvaluator(gt_dataset=dataset, iou_thresh=iou_thr)
acc = evaluator(torch.from_numpy(np.stack(bbox_preds, axis=0)))
print('\nAcc: {}'.format(acc))

100%|██████████| 16/16 [01:13<00:00,  4.58s/it]


Acc: 0.3125





In [33]:
######## Visualize multiple images in Grid ########

# Set the number of rows and columns in the grid.
# The image count comes from the collected results themselves so the grid can
# never index past the end of `imgs`.
col_num = 4
num_images = len(imgs)
row_num = max(1, (num_images + col_num - 1) // col_num)  # ceil division, at least one row

# Red: predicted bounding box
# Blue: ground truth bounding box
# squeeze=False keeps `axs` two-dimensional even when there is a single row;
# the figure height grows with the number of rows (4 rows -> 20 x 20).
fig, axs = plt.subplots(row_num, col_num, figsize=(20, 5 * row_num), squeeze=False)

# axs.flat walks the grid row by row, i.e. idx == row * col_num + col.
for idx, ax in enumerate(axs.flat):

    # cells past the last image stay empty
    if idx < num_images:
        # Draw on a copy: cv2.rectangle paints in place, and the arrays kept in
        # `imgs` must stay untouched so this cell can be re-run safely.
        im = np.array(imgs[idx])
        bbox_pred = bbox_preds[idx]
        bbox_gt = bbox_gts[idx]

        # Boxes are used as (x_min, y_min, x_max, y_max) corner coordinates.
        pred_tl = (int(bbox_pred[0]), int(bbox_pred[1]))
        pred_br = (int(bbox_pred[2]), int(bbox_pred[3]))
        gt_tl = (int(bbox_gt[0]), int(bbox_gt[1]))
        gt_br = (int(bbox_gt[2]), int(bbox_gt[3]))

        im = cv2.rectangle(im, pred_tl, pred_br, (255, 50, 50), 2)
        im = cv2.rectangle(im, gt_tl, gt_br, (50, 50, 255), 2)

        ax.imshow(im)
        ax.set_title(texts[idx])

    # remove the axis
    ax.axis('off')

plt.show()

# Show the text
# print(texts)


Output hidden; open in https://colab.research.google.com to view.