In [1]:
import sys
sys.path.append("..")

import pandas as pd

from torch.utils.data import DataLoader
from model_zoo import get_model
from dataset_zoo import VG_Relation, VG_Attribution, COCO_Order, Flickr30k_Order

In [2]:
# Please put your data root directory below. We'll download VG-Relation and VG-Attribution images here. 
# Will be a 1GB zip file (a subset of GQA).
root_dir="~/.cache" 


In [None]:
model, preprocess = get_model(model_name="openai-clip:ViT-B/32", device="cpu", root_dir=root_dir)

In [None]:
# Get the VG-R dataset
vgr_dataset = VG_Relation(image_preprocess=preprocess, download=False, root_dir=root_dir)
vgr_dataset.dataset = vgr_dataset.dataset[:1000]
vgr_dataset.all_relations = vgr_dataset.all_relations[:1000]
vgr_loader = DataLoader(vgr_dataset, batch_size=1, shuffle=False)

print(len(vgr_dataset))

100


In [None]:
print(vgr_dataset[0]["image_options"][0].shape)
for ex in vgr_loader:
    break
    if len(ex['image_options']) > 1:
        print(f"length of options: {len(ex['image_options'])}")

torch.Size([3, 224, 224])


In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.81it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
import torchvision.transforms as T
normalize = T.Normalize(
        mean=[-m / s for m, s in zip([0.485, 0.456, 0.406],
                                    [0.229, 0.224, 0.225])],
        std=[1/s for s in [0.229, 0.224, 0.225]]
    )

to_pil = T.ToPILImage()


In [None]:
from tqdm import tqdm
from PIL import Image
from io import BytesIO
import base64


def get_retrieval_scores_batched(model, loader):
    tqdm_loader = tqdm(loader)
    tqdm_loader.set_description("Computing retrieval scores")
    image_scores = []
    no_image_scores = []
    image_correct = []
    no_image_correct = []
    probability_increase = []
    
    for batch in tqdm_loader:
        image = batch['image_options'][0][0]
        image = normalize(image)
        image = to_pil(image)

        image_path = "cur_image.png"
        image.save(image_path)
        with open(image_path, "rb") as f:
            encoded_image = base64.b64encode(f.read())
        encoded_image = encoded_image.decode("utf-8")

        with_image_perplexities = []
        no_image_perplexities= []
        with_image_log_probs = []
        no_image_log_probs = []

        for add_image in [False, True]:
            for i in range(2):
                text = batch['caption_options'][i][0]
                messages = []
                if add_image:
                    messages.append(
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image",
                                    "image": "file://" + image_path
                                },
                            ]
                        }
                    )

                messages.append(
                    {
                        "role": "assistant",
                        "content": [
                            {
                                "type": "text",
                                "text": text
                            }
                        ]
                    }
                )

                text = processor.apply_chat_template(messages, tokenize=False, add_generation=False)
                tokenized_text = processor.apply_chat_template(messages[-1:], tokenize=True, add_special_tokens=False, return_tensors="pt")
                tokenized_text = tokenized_text.to(model.device)

                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                )
                inputs = inputs.to(model.device)

                outputs = model(**inputs)
                logits = outputs.logits
                log_probs = logits.log_softmax(dim=-1)[:,-tokenized_text.shape[1]:-1]
                
                selected_log_probs = log_probs.gather(dim=-1, index=tokenized_text[:, 1:].unsqueeze(-1)).squeeze(-1)
                perplexity = (-selected_log_probs.mean()).exp().item()
                if add_image:
                    with_image_perplexities.append(perplexity)
                    with_image_log_probs.append(selected_log_probs.sum())
                else:
                    no_image_perplexities.append(perplexity)
                    no_image_log_probs.append(selected_log_probs.sum())


        # since the correct caption is always the second one
        with_image_correct = with_image_perplexities[1] < with_image_perplexities[0]
        no_image_correct = no_image_perplexities[1] < no_image_perplexities[0]
        ratio_image = with_image_log_probs[1] - with_image_log_probs[0]
        ratio_no_image = no_image_log_probs[1] - no_image_log_probs[0]
        probability_increase = (ratio_image - ratio_no_image).exp().item()

        image_scores.append([1/ ppl for ppl in with_image_perplexities])
        no_image_scores.append([1/ ppl for ppl in no_image_perplexities])

        """
        print(f"with image perplexities: {with_image_perplexities}")
        print(f"no image perplexities: {no_image_perplexities}")

        print(f"ratio with image: {ratio_image.exp().item()}")
        print(f"ratio without image: {ratio_no_image.exp().item()}")
        print(f"with image correct: {with_image_correct}")
        print(f"no image correct: {no_image_correct}")
        print(f"probability increase factor: {probability_increase}")
        print(batch["caption_options"])
        print()
        """

    return {
        "image_scores": image_scores,
        "no_image_scores": no_image_scores,
        "image_correct": image_correct,
        "no_image_correct": no_image_correct,
        "probability_increase": probability_increase
    }








In [None]:
import numpy as np
# Compute the scores for each test case
vgr_output = get_retrieval_scores_batched(model, vgr_loader)



Computing retrieval scores: 100%|██████████| 100/100 [00:15<00:00,  6.30it/s]


In [None]:
vgr_image_scores = np.array(vgr_output["image_scores"])[:, None, :]
vgr_no_image_scores = np.array(vgr_output["no_image_scores"])[:, None, :]
vgr_image_correct = vgr_output["image_correct"]
vgr_no_image_correct = vgr_output["no_image_correct"]
vgr_probability_increase = vgr_output["probability_increase"]

In [None]:
import PIL
import matplotlib.pyplot as plt
#print(vgr_dataset[0])
for idx in range(0, 20):
    continue
    vgr_dataset[0]["image_options"][0].shape
    print(vgr_dataset[idx]["caption_options"])
    image = normalize(vgr_dataset[idx]["image_options"][0])
    image = to_pil(image)
    plt.imshow(image)
    plt.show()

In [None]:
# Evaluate the macro accuracy
vgr_records = vgr_dataset.evaluate_scores(vgr_image_scores)
symmetric = ['adjusting', 'attached to', 'between', 'bigger than', 'biting', 'boarding', 'brushing', 'chewing', 'cleaning', 'climbing', 'close to', 'coming from', 'coming out of', 'contain', 'crossing', 'dragging', 'draped over', 'drinking', 'drinking from', 'driving', 'driving down', 'driving on', 'eating from', 'eating in', 'enclosing', 'exiting', 'facing', 'filled with', 'floating in', 'floating on', 'flying', 'flying above', 'flying in', 'flying over', 'flying through', 'full of', 'going down', 'going into', 'going through', 'grazing in', 'growing in', 'growing on', 'guiding', 'hanging from', 'hanging in', 'hanging off', 'hanging over', 'higher than', 'holding onto', 'hugging', 'in between', 'jumping off', 'jumping on', 'jumping over', 'kept in', 'larger than', 'leading', 'leaning over', 'leaving', 'licking', 'longer than', 'looking in', 'looking into', 'looking out', 'looking over', 'looking through', 'lying next to', 'lying on top of', 'making', 'mixed with', 'mounted on', 'moving', 'on the back of', 'on the edge of', 'on the front of', 'on the other side of', 'opening', 'painted on', 'parked at', 'parked beside', 'parked by', 'parked in', 'parked in front of', 'parked near', 'parked next to', 'perched on', 'petting', 'piled on', 'playing', 'playing in', 'playing on', 'playing with', 'pouring', 'reaching for', 'reading', 'reflected on', 'riding on', 'running in', 'running on', 'running through', 'seen through', 'sitting behind', 'sitting beside', 'sitting by', 'sitting in front of', 'sitting near', 'sitting next to', 'sitting under', 'skiing down', 'skiing on', 'sleeping in', 'sleeping on', 'smiling at', 'sniffing', 'splashing', 'sprinkled on', 'stacked on', 'standing against', 'standing around', 'standing behind', 'standing beside', 'standing in front of', 'standing near', 'standing next to', 'staring at', 'stuck in', 'surrounding', 'swimming in', 'swinging', 'talking to', 'topped with', 'touching', 'traveling down', 'traveling on', 'tying', 'typing on', 'underneath', 'wading in', 'waiting for', 'walking across', 'walking by', 'walking down', 'walking next to', 'walking through', 'working in', 'working on', 'worn on', 'wrapped around', 'wrapped in', 'by', 'of', 'near', 'next to', 'with', 'beside', 'on the side of', 'around']
df = pd.DataFrame(vgr_records)
df = df[~df.Relation.isin(symmetric)]
print(f"VG-Relation Macro Accuracy with image: {df.Accuracy.mean()}")


# Evaluate the macro accuracy
vgr_records = vgr_dataset.evaluate_scores(vgr_no_image_scores)
df = pd.DataFrame(vgr_records)
df = df[~df.Relation.isin(symmetric)]
print(f"VG-Relation Macro Accuracy with image: {df.Accuracy.mean()}")

VG-Relation Macro Accuracy with image: 0.839716610549944
VG-Relation Macro Accuracy with image: 0.6268799102132435


In [None]:
# Get the VG-A dataset
vga_dataset = VG_Attribution(image_preprocess=preprocess, download=True, root_dir=root_dir)
vga_loader = DataLoader(vga_dataset, batch_size=16, shuffle=False)
# Compute the scores for each test case
vga_scores = model.get_retrieval_scores_batched(vga_loader)


Downloading...
From: https://drive.google.com/uc?id=13tWvOrNOLHxl3Rm9cR3geAdHx2qR3-Tw
To: /home/loic/projects/vision-language-models-are-bows/notebooks/~/.cache/visual_genome_attribution.json
100%|██████████| 8.71M/8.71M [00:00<00:00, 18.8MB/s]


AttributeError: 'Qwen2_5_VLForConditionalGeneration' object has no attribute 'get_retrieval_scores_batched'

In [None]:
# Evaluate the macro accuracy
vga_records = vga_dataset.evaluate_scores(vga_scores)
df = pd.DataFrame(vga_records)
print(f"VG-Attribution Macro Accuracy: {df.Accuracy.mean()}")

In [None]:
for idx in range(400, 1000, 50):
    print(vga_dataset[idx]["caption_options"])
    normalize = T.Normalize(
        mean=[-m / s for m, s in zip([0.485, 0.456, 0.406],
                                    [0.229, 0.224, 0.225])],
        std=[1/s for s in [0.229, 0.224, 0.225]]
    )

    to_pil = T.ToPILImage()
    image = normalize(vga_dataset[idx]["image_options"][0])
    image = to_pil(image)
    plt.imshow(image)
    plt.show()

In [None]:
from dataset_zoo import get_dataset
coco_dataset = get_dataset("COCO_Order", image_preprocess=preprocess, download=True, root_dir="./coco_data")
coco_loader = DataLoader(coco_dataset, batch_size=16, shuffle=False)