# Text-Region Retrieval (Object-Attribute Queries) - CLIP
## Example: Retrieve bounding boxes for the query "red bird"

## Setup: GPUs, Key Variables, Paths, Metadata

### GPUs

In [1]:
# GPU selection.
# The notebook only needs a single GPU (DEVICE below is plain "cuda"), while
# CUDA_VISIBLE_DEVICES currently lists eight device ids. On a shared server, check which
# GPUs are free and narrow that list before running.
# NOTE: do not append comments to the %env lines themselves.

%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Device string handed to model loading and to every .to(DEVICE) call in later cells
DEVICE = "cuda"

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7


### Key Variables

In [2]:
# ---- Model selection ----
# MODE_FOR_MODEL chooses the loading branch in the "Load Model of Interest" cell:
#   "custom_trained" -> checkpoint at PATH_TO_MODEL (e.g. adjective negs)
#   "clip_lib"       -> default model from the clip library
#   "default_openai" -> default model obtained through get_model
MODE_FOR_MODEL = 'custom_trained'
BACKBONE = 'ViT-B-32'  # the notebook has only been tested with ViT-B-32

# ---- Query-file creation (only used when MAKE_QUERY_FILE is True) ----
MAKE_QUERY_FILE = False
# Attribute types to include, joined by underscores (checked with substring tests)
MAKE_MODE = 'color_pattern_material'
# An attribute-object combo is kept only if it has MORE than this many ground-truth
# regions, so that evaluation @k (k up to 10) is meaningful
NUM_OF_GT_RETRIEVALS_THRESHOLD = 10

# ---- CLIP prompting / evaluation ----
# Filled with .format(attribute_name=..., category_name=...)
PROMPT_TEMPLATE = "A photo of a {attribute_name} {category_name}."
# Attribute types that get their own precision/recall breakdown
ATTRIBUTE_TYPES_OF_INTEREST = ['color', 'material', 'pattern']

# ---- Caching ----
# True: read previously pickled crops from WORK_AREA_PATH; False: recompute and re-pickle them
LOAD_IMG_INFO = False

### Paths

In [3]:
# NOTE(review): every path in this cell is absolute and machine-specific; edit them before
# running on another machine.

# Checkpoint to analyze; read with torch.load when MODE_FOR_MODEL == "custom_trained"
PATH_TO_MODEL = '/afs/cs.pitt.edu/usr0/krb115/private/clip_attributes/prep_for_git/attributes_and_vlms/data/random_neg_bs64_lr1en6_epoch_5.pt'
# Alternative checkpoints, kept for reference (uncomment one to switch):
#PATH_TO_MODEL = '/archive2/kyle/clip_training_exp/pt_6_13_23_order_only_attr_noun/checkpoints/epoch_5.pt'
#PATH_TO_MODEL = '/archive2/kyle/clip_training_exp/fix_pt_6_12_baseline_w_filtered_negs/checkpoints/epoch_5.pt'

# Object-attribute combo file: one "attr id,COCO class id,attr name,COCO name" line per query.
# If it is unavailable, set MAKE_QUERY_FILE = True to have the notebook write it to this path.
# Note: the generated file still needs external cleaning (see the cell that creates it).
PATH_TO_ATTR_OBJ_COMBOS = "/afs/cs.pitt.edu/usr0/krb115/private/clip_attributes/prep_for_git/attributes_and_vlms/data/color_pattern_material_cleaned.txt"

# Datasets: COCO image directory (image file names are built as the zero-padded
# 12-digit image id + ".jpg") and the OVAD annotation JSON
PATH_TO_COCO = '/archive2/kyle/datasets/coco/val2017/'
PATH_TO_OVAD = "/archive2/kyle/datasets/ovad/ovad2000.json"

# Scratch directory for the pickled crops / attribute vectors / category ids.
# File names are appended with plain string concatenation, so keep the trailing slash.
WORK_AREA_PATH = "/afs/cs.pitt.edu/usr0/krb115/private/clip_attributes/work_area/"

### Metadata

In [4]:
# COCO category id -> category name (the standard COCO mapping).
# The ids are not contiguous (e.g. 12, 26, 29 and 30 are absent), so always look names up
# by id rather than by position.
COCO_NUMS_TO_NAMES = {1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus', 7: 'train', 8: 'truck', 
            9: 'boat', 10: 'traffic light', 11: 'fire hydrant', 13: 'stop sign', 14: 'parking meter', 
            15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 
            23: 'bear', 24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase',
            34: 'frisbee', 35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove', 41: 'skateboard', 
            42: 'surfboard', 43: 'tennis racket', 44: 'bottle', 46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon', 51: 'bowl', 
            52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 
            61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed', 67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 
            74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 
            84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush'}

# OVAD attribute ids per attribute type. Each type occupies one contiguous id range;
# the ids are used as indices into an annotation's 'att_vec'.
COLOR_IDS = list(range(24, 36))      # 24 .. 35
MATERIAL_IDS = list(range(70, 80))   # 70 .. 79
PATTERN_IDS = list(range(87, 94))    # 87 .. 93

### Key Imports 

In [5]:
# Key imports and setup 

import sys
sys.path.append("../vision_language_models_are_bows")         # Add subrepo to Python path 
import clip
import json
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
from matplotlib.patches import Rectangle
from model_zoo import get_model
from model_zoo.clip_models import CLIPWrapper
import numpy as np
from open_clip import create_model_and_transforms             # Need OpenCLIP installed 
import pickle
from PIL import Image
import torch

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


## Preparation for Evaluation

### Load Model of Interest

In [6]:
# Load the CLIP variant selected by MODE_FOR_MODEL.
# Every valid branch defines `model` and `preprocess`; an unknown mode now fails loudly
# here instead of surfacing later as a NameError on `model`.

# clip.load / get_model take names such as "ViT-B/32" while BACKBONE uses the
# "ViT-B-32" spelling: swap only the LAST '-' for '/'. Other backbone names are passed
# through unchanged (previously `modified_backbone` was left undefined for them).
if "ViT" in BACKBONE:
    modified_backbone = "/".join(BACKBONE.rsplit("-", 1))
else:
    modified_backbone = BACKBONE

if MODE_FOR_MODEL == "custom_trained":
    # Custom-trained checkpoint: build the architecture, then load the trained weights
    checkpoint = torch.load(PATH_TO_MODEL, map_location=torch.device(DEVICE))
    # The third returned value is kept as the image transform; the second is discarded
    model, _, preprocess = create_model_and_transforms(
            BACKBONE, "openai", precision='amp', device=DEVICE, jit=False, force_quick_gelu=False, pretrained_image=False)
    model.load_state_dict(checkpoint['state_dict'])
    model = model.eval()
    # Wrapped so that the raw model is reached as model.model in later cells
    model = CLIPWrapper(model, DEVICE)
elif MODE_FOR_MODEL == "clip_lib":
    # Default model from the clip library
    model, preprocess = clip.load(modified_backbone, device=DEVICE)
elif MODE_FOR_MODEL == "default_openai":
    # Default model via get_model; later cells also reach this one through model.model
    model, preprocess = get_model(model_name="openai-clip:" + modified_backbone, device=DEVICE)
else:
    raise ValueError(
        "Unknown MODE_FOR_MODEL: " + repr(MODE_FOR_MODEL)
        + " (expected 'custom_trained', 'clip_lib' or 'default_openai')")

### Load OVAD: Dictionary for ID # to Attribute Name, Image List

In [7]:
# Read the OVAD annotation file and map every attribute id to its name
with open(PATH_TO_OVAD, "r") as ovad_file:
    ovad_data = json.load(ovad_file)
id_to_attribute_dict = {
    attribute_entry['id']: attribute_entry['name']
    for attribute_entry in ovad_data["attributes"]
}
print(id_to_attribute_dict)

{0: 'cleanliness:clean/neat', 1: 'cleanliness:unclean/dirt/dirty/muddy', 2: 'clothes color:black', 3: 'clothes color:blue', 4: 'clothes color:brown', 5: 'clothes color:gray', 6: 'clothes color:green', 7: 'clothes color:orange', 8: 'clothes color:pink', 9: 'clothes color:red', 10: 'clothes color:tan', 11: 'clothes color:violet', 12: 'clothes color:white', 13: 'clothes color:yellow', 14: 'clothes pattern:dotted/speckled/spotted', 15: 'clothes pattern:floral', 16: 'clothes pattern:lettered', 17: 'clothes pattern:plaid/tartan/checkered', 18: 'clothes pattern:plain', 19: 'clothes pattern:striped/lined/pinstriped', 20: 'clothes pattern:tiled', 21: 'color quantity:multicolored/colorful', 22: 'color quantity:single-colored/unicolored', 23: 'color quantity:two-colored', 24: 'color:black', 25: 'color:blue', 26: 'color:brown', 27: 'color:gray', 28: 'color:green', 29: 'color:orange', 30: 'color:pink', 31: 'color:red', 32: 'color:tan', 33: 'color:violet', 34: 'color:white', 35: 'color:yellow', 36: 

In [8]:
# Collect the id of every image listed in OVAD and report how many there are
imgs_in_ovad = [image_entry['id'] for image_entry in ovad_data['images']]
print('# of Images in OVAD')
print(len(imgs_in_ovad))

# of Images in OVAD
2000


### If we need to make a query file, do so

In [9]:
# Optionally (re)create the attribute-object combo file at PATH_TO_ATTR_OBJ_COMBOS.
if MAKE_QUERY_FILE:

    # (attribute type, OVAD attribute ids, display names). Names are listed in the same
    # order as the ids so they can be paired by position. The id lists are the shared
    # COLOR_IDS / MATERIAL_IDS / PATTERN_IDS constants rather than a second hard-coded copy.
    # The two pattern names that used to carry a stray "pattern:" prefix are now spelled
    # like their siblings, so the prefix no longer leaks into the file and the prompts.
    attr_ids_and_names_by_type = [
        ("color", COLOR_IDS,
         ["black", "blue", "brown", "gray", "green", "orange", "pink", "red", "tan", "violet", "white", "yellow"]),
        ("material", MATERIAL_IDS,
         ["asphalt/cement/clay/concrete/stucco", "ceramic/brick/porcelain", "glass", "leather", "metal/metallic/aluminum/brass/copper-zinc/iron/stainless steel/steel/silver", "paper/cardboard", "polymers/plastic/rubber/styrofoam/polymer", "stone/granite/cobblestone/gravel/marble/pebbled/rocky/sandy", "textile/cloth/fabric/denim/cotton/jean/silk/plush", "wood/wooden/bamboo/hardwood"]),
        ("pattern", PATTERN_IDS,
         ["dotted/speckled/spotted", "floral", "lettered", "plaid/tartan/checkered", "plain", "striped/lined/pinstriped", "tiled"]),
    ]

    # Keep only the attribute types named in MAKE_MODE (an underscore-joined string,
    # hence the substring test)
    names_to_make_file = []
    attribute_ids_to_make_file = []
    for attr_type, type_ids, type_names in attr_ids_and_names_by_type:
        if attr_type in MAKE_MODE:
            assert len(type_ids) == len(type_names), "ids and names must pair up for " + attr_type
            names_to_make_file += type_names
            attribute_ids_to_make_file += type_ids
    attr_id_to_name = dict(zip(attribute_ids_to_make_file, names_to_make_file))

    # Count, for each COCO class, how many OVAD annotations mark each attribute as present (== 1)
    dictionary_coco_cls_to_attr_count = {
        coco_cls: {attr_id: 0 for attr_id in attribute_ids_to_make_file}
        for coco_cls in COCO_NUMS_TO_NAMES
    }
    for annot in ovad_data["annotations"]:
        for attr_id in attribute_ids_to_make_file:
            if annot['att_vec'][attr_id] == 1:
                dictionary_coco_cls_to_attr_count[annot['category_id']][attr_id] += 1

    # Keep the combos with strictly more than NUM_OF_GT_RETRIEVALS_THRESHOLD examples
    attr_and_objs_to_retrieve = []
    for coco_cls, attr_counts in dictionary_coco_cls_to_attr_count.items():
        for attr_id, num_examples in attr_counts.items():
            if num_examples > NUM_OF_GT_RETRIEVALS_THRESHOLD:
                attr_and_objs_to_retrieve.append(
                    [attr_id_to_name[attr_id], COCO_NUMS_TO_NAMES[coco_cls], attr_id, coco_cls])

    # Write as attr index, COCO class index, attr name, COCO name
    with open(PATH_TO_ATTR_OBJ_COMBOS, 'w') as f:
        for combo_attr_name, combo_obj_name, combo_attr_id, combo_obj_id in attr_and_objs_to_retrieve:
            f.write(str(combo_attr_id) + ',' + str(combo_obj_id) + ',' + str(combo_attr_name) + ',' + str(combo_obj_name) + '\n')

    print('An extra filtering step is needed (externally) to make realistic attribute-object combos and to prune others')

### Load attributes and objects to retrieve and format as "queries" list

In [10]:
# Read the combo file: one "attr id,COCO class id,attr name,COCO name" line per query.
# Blank lines (easy to introduce while cleaning the file by hand) are skipped, and a line
# with fewer than four fields is reported with its line number instead of failing later
# with an opaque IndexError when the queries are built.
attr_and_objs_to_retrieve = []
with open(PATH_TO_ATTR_OBJ_COMBOS, 'r') as f:
    for line_number, line in enumerate(f, start=1):
        stripped_line = line.strip()
        if not stripped_line:
            continue
        fields = stripped_line.split(',')
        if len(fields) < 4:
            raise ValueError(
                'Line ' + str(line_number) + ' of ' + PATH_TO_ATTR_OBJ_COMBOS
                + ' has fewer than 4 comma-separated fields: ' + repr(stripped_line))
        attr_and_objs_to_retrieve.append(fields)
print(str(len(attr_and_objs_to_retrieve)) + ' combinations')

323 combinations


In [11]:
# Split each combo (attr id, COCO class id, attr name, COCO name) into parallel lists
# and build one prompt per combo. All values are still strings at this point.
attr_ids = [fields[0] for fields in attr_and_objs_to_retrieve]
obj_ids = [fields[1] for fields in attr_and_objs_to_retrieve]
attr_names = [fields[2] for fields in attr_and_objs_to_retrieve]
obj_names = [fields[3] for fields in attr_and_objs_to_retrieve]
queries = [
    PROMPT_TEMPLATE.format(attribute_name=adjective, category_name=noun)
    for adjective, noun in zip(attr_names, obj_names)
]
print('Queries')
print(queries)

Queries
['A photo of a black bicycle.', 'A photo of a gray bicycle.', 'A photo of a white bicycle.', 'A photo of a black car.', 'A photo of a blue car.', 'A photo of a gray car.', 'A photo of a red car.', 'A photo of a white car.', 'A photo of a yellow car.', 'A photo of a black motorcycle.', 'A photo of a blue motorcycle.', 'A photo of a gray motorcycle.', 'A photo of a red motorcycle.', 'A photo of a white motorcycle.', 'A photo of a blue airplane.', 'A photo of a gray airplane.', 'A photo of a red airplane.', 'A photo of a white airplane.', 'A photo of a black bus.', 'A photo of a blue bus.', 'A photo of a red bus.', 'A photo of a white bus.', 'A photo of a yellow bus.', 'A photo of a black train.', 'A photo of a blue train.', 'A photo of a gray train.', 'A photo of a red train.', 'A photo of a white train.', 'A photo of a yellow train.', 'A photo of a black truck.', 'A photo of a gray truck.', 'A photo of a red truck.', 'A photo of a white truck.', 'A photo of a black boat.', 'A ph

### Preprocess images into crops for each region to retrieve

In [12]:
# Build (or reload) three parallel lists with one entry per OVAD annotation:
#   test_imgs          - region crop as an RGB PIL image
#   test_attr_vectors  - the annotation's 'att_vec'
#   test_annot_ids     - the annotation's 'category_id' (object class)
if not LOAD_IMG_INFO:

    test_imgs, test_attr_vectors, test_annot_ids = [], [], []

    for i, annot in enumerate(ovad_data["annotations"]):

        # Locate and open the source image for this annotation
        img_name = annot['image_id']
        bbox_to_use = annot['bbox']
        full_img_path = PATH_TO_COCO + f"{img_name:012d}" + '.jpg'
        orig_image = Image.open(full_img_path).convert("RGB")

        # bbox is used as (x, y, width, height); crop() wants (left, upper, right, lower).
        # We use crops; could ROIpool features too (in future)
        left, upper = bbox_to_use[0], bbox_to_use[1]
        test_img = orig_image.crop((left, upper, left + bbox_to_use[2], upper + bbox_to_use[3]))

        test_imgs.append(test_img)
        test_attr_vectors.append(annot['att_vec'])
        test_annot_ids.append(annot['category_id'])

        # Progress indicator
        if i % 1000 == 0:
            print(i)

    # Cache all three lists so a later run can set LOAD_IMG_INFO = True
    for pkl_name, payload in (('test_imgs.pkl', test_imgs),
                              ('test_attr_vectors.pkl', test_attr_vectors),
                              ('test_annot_ids.pkl', test_annot_ids)):
        with open(WORK_AREA_PATH + pkl_name, 'wb') as f:
            pickle.dump(payload, f)

else:

    # Reload the cached lists. NOTE: pickle.load executes arbitrary code from the file,
    # so only point WORK_AREA_PATH at files this notebook wrote.
    with open(WORK_AREA_PATH + 'test_imgs.pkl', 'rb') as f:
        test_imgs = pickle.load(f)
    with open(WORK_AREA_PATH + 'test_attr_vectors.pkl', 'rb') as f:
        test_attr_vectors = pickle.load(f)
    with open(WORK_AREA_PATH + 'test_annot_ids.pkl', 'rb') as f:
        test_annot_ids = pickle.load(f)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


In [13]:
# Encode every query prompt once and L2-normalise the resulting text features
with torch.no_grad():
    text = clip.tokenize(queries).to(DEVICE)
    # The "custom_trained" and "default_openai" models are wrapped: the raw model is model.model
    if MODE_FOR_MODEL in ("custom_trained", "default_openai"):
        print('Using CLIP Wrapper')
        text_encoder = model.model
    else:
        text_encoder = model
    text_features = text_encoder.encode_text(text)
    text_features /= text_features.norm(dim=-1, keepdim=True)
print('Text features size: ')
print(text_features.shape) # number of queries x text feat dim

Using CLIP Wrapper
Text features size: 
torch.Size([323, 512])


## Inference with CLIP

In [None]:
# Encode every region crop with CLIP and keep its similarity to ALL queries,
# keyed by the crop's position in test_imgs (computing all queries at once keeps access simple)
dict_img_to_clip_logits = dict()
with torch.no_grad():
    for region_idx, region_crop in enumerate(test_imgs):
        region_batch = preprocess(region_crop).unsqueeze(0).to(DEVICE)
        # Wrapped models expose the raw model as model.model
        if MODE_FOR_MODEL in ("custom_trained", "default_openai"):
            region_features = model.model.encode_image(region_batch)
        else:
            region_features = model.encode_image(region_batch)
        region_features /= region_features.norm(dim=-1, keepdim=True)
        # Scaled dot product of normalised features: one row, one column per query
        dict_img_to_clip_logits[region_idx] = (100.0 * region_features @ text_features.T)
        # Progress indicator
        if region_idx % 500 == 0:
            print(region_idx)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000


## Aggregate Results From Similarity Dictionary 

In [None]:
# Compute precision@k and recall@k for every query from the stored similarities.
#   precision@k: fraction of the top-k regions whose attribute AND object class both match
#   recall@k:    1 if at least one of the top-k regions matches, else 0 (a hit indicator)

k_values = [1, 5, 10]

# Result containers: "topK" lists over all queries, plus one nested dict per attribute type
overall_precision_values = {"top" + str(k): [] for k in k_values}
overall_recall_values = {"top" + str(k): [] for k in k_values}
for attr_type in ATTRIBUTE_TYPES_OF_INTEREST:
    overall_precision_values[attr_type] = {"top" + str(k): [] for k in k_values}
    overall_recall_values[attr_type] = {"top" + str(k): [] for k in k_values}

# Which OVAD attribute ids belong to which attribute type
attr_ids_by_type = {"color": COLOR_IDS, "pattern": PATTERN_IDS, "material": MATERIAL_IDS}

# Stack the per-region similarity rows ONCE into a (num regions x num queries) CPU matrix.
# The dict keeps insertion order, so row r belongs to test_imgs[r]. This replaces one
# .item() call per region per query with a single column slice per query.
all_scores = torch.cat(
    [dict_img_to_clip_logits[img_in_dict] for img_in_dict in dict_img_to_clip_logits], dim=0
).float().cpu()

# For each text query (category)
for i, c in enumerate(queries): # Category is in form of prompt
    print('Query #' + str(i) + ': ' + c)

    # Ids are read from the combo file as strings
    attr_index = int(attr_ids[i])
    category_index = int(obj_ids[i])

    # Attribute type of this query (None when the id is in none of the id lists). The
    # per-type breakdown is only recorded for types listed in ATTRIBUTE_TYPES_OF_INTEREST.
    attr_type = next((name for name, ids in attr_ids_by_type.items() if attr_index in ids), None)
    record_by_type = attr_type in ATTRIBUTE_TYPES_OF_INTEREST

    # Score of every region for this particular prompt
    scores = all_scores[:, i]

    # For each value of k, get top scores; note if query is satisfied
    for k in k_values:

        top_values, top_indices = torch.topk(scores, k)

        # A retrieved region is correct only if BOTH the attribute is marked present
        # and the object class matches
        precision_vals_at_k = [
            1 if (test_attr_vectors[ind][attr_index] == 1 and test_annot_ids[ind] == category_index) else 0
            for ind in top_indices.tolist()
        ]
        prec_at_k = sum(precision_vals_at_k)/len(precision_vals_at_k)
        rec_at_k = 1 if sum(precision_vals_at_k) > 0 else 0            # do we have at least one correct entry?

        # Record overall and, where applicable, per attribute type
        top_key = "top" + str(k)
        overall_precision_values[top_key].append(prec_at_k)
        overall_recall_values[top_key].append(rec_at_k)
        if record_by_type:
            overall_precision_values[attr_type][top_key].append(prec_at_k)
            overall_recall_values[attr_type][top_key].append(rec_at_k)

    # Details are printed once per query, for the last (largest) k only
    print('\tCorrect @ ' + str(k) + ': ' + str(precision_vals_at_k))
    print('\tTop Image Indices: ' + str(top_indices.tolist()))
    print('\tSims: ' + str([float("{:.2f}".format(num)) for num in top_values.tolist()]))

### Print Metrics

In [None]:
def _print_metrics_at_k(values_by_k):
    """Print the mean of the "top1"/"top5"/"top10" lists in `values_by_k`, then the sample count.

    An empty list (e.g. an attribute type with no queries) is reported as "n/a" instead of
    raising ZeroDivisionError.
    """
    for k in (1, 5, 10):
        values = values_by_k["top" + str(k)]
        if values:
            print("\t@{}: {:.2%}".format(k, sum(values)/len(values)))
        else:
            print("\t@{}: n/a".format(k))
    print("\t" + str(len(values_by_k["top1"])) + ' samples')


print("Precision (i.e. what is the acc of our retrievals?)")
print('overall')
_print_metrics_at_k(overall_precision_values)
for attr_type in ATTRIBUTE_TYPES_OF_INTEREST:
    print(attr_type)
    _print_metrics_at_k(overall_precision_values[attr_type])
print()

print("Recall (i.e. have we found at least 1 sample with correct attr+obj combo?)")
print('overall')
_print_metrics_at_k(overall_recall_values)
for attr_type in ATTRIBUTE_TYPES_OF_INTEREST:
    print(attr_type)
    _print_metrics_at_k(overall_recall_values[attr_type])

## Extra: Visualization to see success/error cases 

In [None]:
# Imshow function helper
def imshow(image, bbox, ax=None, title=None, normalize=True):
    """Display `image` with a green rectangle drawn at `bbox` and return the Axes.

    Args:
        image: Image to show. With normalize=True it is combined arithmetically with numpy
            arrays, so it must be array-like in that case (presumably a float image scaled
            with mean 0.5 / std 0.5 - TODO confirm); with normalize=False anything
            accepted by Axes.imshow works (the notebook passes PIL images).
        bbox: Box as (x, y, width, height); bbox[0:2] is the rectangle's anchor corner
            and bbox[2:4] its width and height.
        ax: Axes to draw on; a new figure and Axes are created when None.
        title: Optional title for the Axes. It is applied when not None (the parameter
            used to be accepted but silently ignored).
        normalize: When True, map the image through std * image + mean with
            mean = std = 0.5 and clip the result to [0, 1] before display.

    Returns:
        The Axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots()
    if normalize:
        mean = np.array([0.5, 0.5, 0.5])
        std = np.array([0.5, 0.5, 0.5])
        image = std * image + mean
        image = np.clip(image, 0, 1)
    ax.imshow(image)
    if title is not None:
        ax.set_title(title)
    # Strip the frame, tick marks and tick labels so only the image and box remain
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis='both', length=0)
    ax.set_xticklabels('')
    ax.set_yticklabels('')
    ax.add_patch(Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3],  color='g', linewidth=4, fill=False))
    return ax

### Visualize specific image id(s) retrieved

In [None]:
# Despite the name, these values index ovad_data['annotations'] (i.e. regions), which is
# what the "Top Image Indices" printed during aggregation refer to
LIST_OF_IMAGE_IDS = [6000, 3655, 6715, 10133, 6130, 5823, 2994, 6971, 6718, 6966]

for idx in LIST_OF_IMAGE_IDS:
    # Look up the region and open the full image it belongs to
    img_name = ovad_data['annotations'][idx]['image_id']
    bbox_to_use = ovad_data['annotations'][idx]['bbox']
    full_img_path = PATH_TO_COCO + f"{img_name:012d}" + '.jpg'
    orig_image = Image.open(full_img_path).convert("RGB")
    # bbox is (x, y, width, height); the crop box is (left, upper, right, lower)
    left, upper = bbox_to_use[0], bbox_to_use[1]
    test_img = orig_image.crop((left, upper, left + bbox_to_use[2], upper + bbox_to_use[3]))
    # Show the full image with the region outlined (the crop itself is not displayed)
    imshow(orig_image, bbox_to_use, normalize=False)

### Visualize all GT examples for specific query 

In [None]:
# Ground-truth query to visualise: OVAD attribute id and COCO category id
ATTR_INDEX = 25         # "blue"
COCO_INDEX = 28         # "umbrella"

for annot in ovad_data["annotations"]:
    # Skip every region that is not a ground-truth match for (attribute, object)
    if annot['att_vec'][ATTR_INDEX] != 1 or annot['category_id'] != COCO_INDEX:
        continue
    img_name = annot['image_id']
    bbox_to_use = annot['bbox']
    full_img_path = PATH_TO_COCO + f"{img_name:012d}" + '.jpg'
    orig_image = Image.open(full_img_path).convert("RGB")
    # bbox is (x, y, width, height); the crop box is (left, upper, right, lower)
    x_min, y_min = bbox_to_use[0], bbox_to_use[1]
    test_img = orig_image.crop((x_min, y_min, x_min + bbox_to_use[2], y_min + bbox_to_use[3]))
    # Show the full image with the matching region outlined
    imshow(orig_image, bbox_to_use, normalize=False)

### Run classification to compare queries 

In [None]:
# Index into ovad_data['annotations'] of the region to classify
IDX = 12253 #1400 # red car = 11155; 12465; yellow car 6966; brown bird 12253

# Plot image example for debug
img_name = ovad_data['annotations'][IDX]['image_id']
bbox_to_use = ovad_data['annotations'][IDX]['bbox']
# Use the shared PATH_TO_COCO setting instead of a second hard-coded copy of the directory
full_img_path = PATH_TO_COCO + f"{img_name:012d}" + '.jpg'
orig_image = Image.open(full_img_path).convert("RGB")
# bbox is (x, y, width, height); the crop box is (left, upper, right, lower)
left = bbox_to_use[0]
upper = bbox_to_use[1]
right = left + bbox_to_use[2]
lower = upper + bbox_to_use[3]
test_img = orig_image.crop((left, upper, right, lower))
imshow(orig_image, bbox_to_use, normalize=False)

# Inference only: run under no_grad like the other encoding cells so no autograd graph is kept
with torch.no_grad():
    ex_img = preprocess(test_img).unsqueeze(0).to(DEVICE)
    # Wrapped models expose the raw model as model.model
    if MODE_FOR_MODEL == "custom_trained" or MODE_FOR_MODEL == "default_openai":
        image_features = model.model.encode_image(ex_img)
    else:
        image_features = model.encode_image(ex_img)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    # Softmax over ALL queries turns the similarities into a distribution for this region
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    # Never request more entries than there are queries
    values, indices = similarity[0].topk(min(5, len(queries)))

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{queries[index]:>16s}: {100 * value.item():.2f}%")
