In [1]:
import numpy as np
import faiss
import eccv_caption
from omegaconf import OmegaConf
from lavis.models import load_preprocess
from lavis.models.albef_models.albef_retrieval import AlbefRetrieval
import open_clip
import torch
from PIL import Image
import json
import os
import time
from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm
  return torch.cuda.amp.custom_fwd(orig_func)  # type: ignore
  return torch.cuda.amp.custom_bwd(orig_func)  # type: ignore


In [2]:
# Root directory of the COCO 2014 data; also passed to load_karpathy_test as
# the prefix for every image path.
DATASET_PATH = "dataset/coco2014"
# Karpathy-split annotation JSON read by load_karpathy_test.
ANN_PATH = f"{DATASET_PATH}/annotations/karpathy_test.json"
# NOTE(review): not referenced by any visible cell - presumably the
# eccv_caption data directory; confirm it is still needed.
DATA_DIR = f"{DATASET_PATH}/eccv/data"
# open_clip architecture name and pretrained-weights tag for the first-stage
# retriever.
MODEL_NAME = "ViT-L-14"
PRETRAINED = "openai"
# Batch sizes used when encoding images / captions with the open_clip model.
IMAGE_BATCH = 64
TEXT_BATCH = 256
# Number of nearest neighbours requested from the FAISS search per query.
RETRIEVE_K = 50
# Batch size used when extracting ALBEF visual features.
ALBEF_IMAGE_BATCH = 32
# YAML config loaded with OmegaConf; its `model` and `preprocess` sections
# are used to build the ALBEF model and its image transform.
ALBEF_CFG_PATH = "configs/albef_retrieval_base.yaml"

In [3]:
def load_karpathy_test(ann_path, coco_root):
    """Collect image paths and captions of the ``test`` split.

    Args:
        ann_path: Path to a JSON file with a top-level ``"images"`` list.
            Each entry must provide ``split``, ``filepath``, ``filename``,
            ``cocoid`` and a ``sentences`` list whose items carry ``raw``
            and ``sentid``.
        coco_root: Directory that ``filepath``/``filename`` are joined onto.

    Returns:
        ``(images, image_ids, captions, caption_ids)`` where ``images`` and
        ``image_ids`` are aligned per image and ``captions`` and
        ``caption_ids`` are aligned per sentence, in file order. Captions
        are whitespace-stripped.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(ann_path, "r", encoding="utf-8") as ann_file:
        data = json.load(ann_file)

    images = []
    image_ids = []
    captions = []
    caption_ids = []

    for img in data["images"]:
        # Entries belonging to any other split are ignored.
        if img["split"] != "test":
            continue
        images.append(os.path.join(coco_root, img["filepath"], img["filename"]))
        image_ids.append(img["cocoid"])
        for sent in img["sentences"]:
            captions.append(sent["raw"].strip())
            caption_ids.append(sent["sentid"])

    return images, image_ids, captions, caption_ids

In [4]:
def encode_images(model, preprocess, image_paths, batch_size, device):
    """Encode images in batches and return row-normalised features.

    Args:
        model: Object exposing ``encode_image(tensor)``.
        preprocess: Callable turning an RGB PIL image into a tensor.
        image_paths: Sequence of image file paths. Must be non-empty,
            because the per-batch outputs are concatenated with ``torch.cat``.
        batch_size: Number of images stacked into one forward pass.
        device: Device the stacked batch is moved to before encoding.

    Returns:
        Tensor with one row per image, each row divided by its L2 norm
        (computed along ``dim=1``). Rows follow the order of ``image_paths``.
    """
    feats = []
    total = len(image_paths)
    for start in range(0, total, batch_size):
        batch = []
        for path in image_paths[start:start + batch_size]:
            # Close each file as soon as it has been decoded and transformed,
            # rather than relying on garbage collection to release it.
            with Image.open(path) as img:
                batch.append(preprocess(img.convert("RGB")))
        image_input = torch.stack(batch).to(device)
        with torch.no_grad():
            feats.append(model.encode_image(image_input))
    feats = torch.cat(feats, dim=0)
    return feats / feats.norm(dim=1, keepdim=True)

In [5]:
def encode_texts(model, tokenizer, texts, batch_size, device):
    """Embed captions batch by batch and scale every row to unit L2 norm.

    ``tokenizer`` is called on each slice of ``texts`` and its result is
    moved to ``device`` before being passed to ``model.encode_text`` under
    ``torch.no_grad()``. The per-batch outputs are concatenated in input
    order and each row is divided by its norm along ``dim=1``.
    """
    chunks = []
    for start in range(0, len(texts), batch_size):
        token_ids = tokenizer(texts[start:start + batch_size]).to(device)
        with torch.no_grad():
            chunks.append(model.encode_text(token_ids))
    stacked = torch.cat(chunks, dim=0)
    row_norms = stacked.norm(dim=1, keepdim=True)
    return stacked / row_norms

In [6]:
def topk_faiss(text_feats, img_feats, k):
    """Return the top-``k`` inner-product neighbours of every query row.

    Despite the parameter names, ``text_feats`` is simply the query set and
    ``img_feats`` the set that gets indexed; the notebook calls this in both
    directions. Both tensors are detached, copied to CPU and converted to
    contiguous float32 arrays before being handed to a ``faiss.IndexFlatIP``.
    ``k`` is capped at the number of indexed rows.

    Returns:
        The index array produced by ``index.search`` (one row per query).
    """
    def _as_float32(tensor):
        array = tensor.detach().cpu().numpy().astype("float32", copy=False)
        return np.ascontiguousarray(array)

    queries = _as_float32(text_feats)
    database = _as_float32(img_feats)

    index = faiss.IndexFlatIP(database.shape[1])
    index.add(database)
    _, neighbours = index.search(queries, min(k, database.shape[0]))
    return neighbours

In [7]:
def albef_rerank(albef_model, albef_preprocess, captions, image_paths, t2i_rank,
                 batch_size, device, max_txt_len=35):
    """Re-order each caption's candidate images by an ALBEF matching score.

    The function works in two stages:

    1. Every image in ``image_paths`` is pushed once through
       ``albef_model.visual_encoder.forward_features`` and the result is kept
       on the CPU.
    2. For each caption ``i``, the features of its candidates ``t2i_rank[i]``
       are fed, together with the tokenised caption repeated once per
       candidate, through ``albef_model.text_encoder``. The first-token hidden
       state goes through ``albef_model.itm_head`` and column 1 of that output
       is used as the score (presumably the "match" logit - confirm against
       the ALBEF head definition). Candidates are sorted by descending score.

    Args:
        albef_model: Model exposing ``visual_encoder``, ``tokenizer``,
            ``text_encoder`` and ``itm_head``.
        albef_preprocess: Callable turning an RGB PIL image into a tensor.
        captions: Sequence of caption strings.
        image_paths: Sequence of image file paths; must be non-empty because
            the per-batch features are concatenated with ``torch.cat``.
        t2i_rank: For each caption, a sequence of indices into
            ``image_paths``.
        batch_size: Number of images per visual-encoder forward pass.
        device: Device used for all model calls.
        max_txt_len: Padding/truncation length given to the tokenizer.
            Defaults to 35, the value that was previously hard-coded.

    Returns:
        A list with one list per caption containing the same candidate
        indices as ``t2i_rank[i]``, reordered by descending score. A caption
        with no candidates yields an empty list.
    """
    total = len(image_paths)
    image_feats = []
    feats_start = time.time()
    for i in tqdm(range(0, total, batch_size), desc="ALBEF image feats"):
        images = []
        for path in image_paths[i:i + batch_size]:
            # Close each file right after it has been decoded and transformed.
            with Image.open(path) as img:
                images.append(albef_preprocess(img.convert("RGB")))
        image_input = torch.stack(images).to(device)
        with torch.no_grad():
            batch_feats = albef_model.visual_encoder.forward_features(
                image_input)
        # Features are stored on the CPU; only the candidates of the caption
        # being scored are moved back to `device` below.
        image_feats.append(batch_feats.cpu())
    image_feats = torch.cat(image_feats, dim=0)
    feats_time = time.time() - feats_start

    print(f"ALBEF image feats: {total} in {feats_time:.3f}s")

    reranked = []
    total_caps = len(captions)
    rerank_start = time.time()
    for i in tqdm(range(total_caps), desc="ALBEF rerank"):
        cand = t2i_rank[i]
        if len(cand) == 0:
            # Nothing to score; avoid calling the tokenizer/encoder on an
            # empty batch.
            reranked.append([])
            continue
        img_feat = image_feats[cand].to(device)
        # Attend to every position of the image features; created directly on
        # the target device to skip a CPU allocation plus transfer per caption.
        encoder_att = torch.ones(
            img_feat.size()[:-1], dtype=torch.long, device=device)
        # The caption is repeated so that it is paired with each candidate.
        text_input = albef_model.tokenizer(
            [captions[i]] * len(cand),
            padding="max_length",
            truncation=True,
            max_length=max_txt_len,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            output = albef_model.text_encoder(
                text_input.input_ids,
                attention_mask=text_input.attention_mask,
                encoder_hidden_states=img_feat,
                encoder_attention_mask=encoder_att,
                return_dict=True,
            )
            scores = albef_model.itm_head(
                output.last_hidden_state[:, 0, :])[:, 1]
        order = torch.argsort(scores, descending=True)
        reranked.append([cand[j] for j in order.tolist()])

    rerank_time = time.time() - rerank_start
    print(f"ALBEF rerank: {total_caps} in {rerank_time:.3f}s")
    return reranked

In [8]:
# Pick the compute device in order of preference: CUDA, then Apple MPS,
# then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

images, image_ids, captions, caption_ids = load_karpathy_test(
    ANN_PATH, DATASET_PATH)
print(f"Test images: {len(images)}")
print(f"Test captions: {len(captions)}")
# This line reports the number of caption ids; it used to carry the same
# "Test captions" label as the line above, which made the output ambiguous.
print(f"Test caption ids: {len(caption_ids)}")

# First-stage retriever: open_clip model, its eval-time image transform and
# the matching tokenizer.
model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME, pretrained=PRETRAINED, device=device)
tokenizer = open_clip.get_tokenizer(MODEL_NAME)
model.eval()

t0 = time.time()
image_feats = encode_images(model, preprocess, images, IMAGE_BATCH, device)
text_feats = encode_texts(model, tokenizer, captions, TEXT_BATCH, device)
print(f"Encoded in {time.time() - t0:.1f}s")

# i2t_rank[i]: indices into `captions` of the top matches for image i.
# t2i_rank[i]: indices into `images` of the top matches for caption i.
i2t_rank = topk_faiss(image_feats, text_feats, RETRIEVE_K).tolist()
t2i_rank = topk_faiss(text_feats, image_feats, RETRIEVE_K).tolist()

Using device: mps
Test images: 5000
Test captions: 25010
Test captions: 25010




Encoded in 299.5s


In [9]:
# Sanity check: t2i_rank should hold one row per caption, each with at most
# RETRIEVE_K candidate image indices.
np.shape(t2i_rank)

(25010, 50)

In [10]:
# Build the ALBEF retrieval model from its YAML config, place it on the
# active device and switch it to inference mode.
albef_cfg = OmegaConf.load(ALBEF_CFG_PATH)
albef_model = AlbefRetrieval.from_config(albef_cfg.model).to(device)
albef_model.eval()

# Only the visual processors are needed; the "eval" one is used below.
albef_vis, _ = load_preprocess(albef_cfg.preprocess)

# Second stage: reorder each caption's candidates by the ALBEF score.
# NOTE: this rebinds `t2i_rank`; every row keeps the same candidates, only
# their order changes.
t2i_rank = albef_rerank(
    albef_model=albef_model,
    albef_preprocess=albef_vis["eval"],
    captions=captions,
    image_paths=images,
    t2i_rank=t2i_rank,
    batch_size=ALBEF_IMAGE_BATCH,
    device=device,
)



reshape position embedding from 256 to 576
reshape position embedding from 256 to 576


ALBEF image feats: 100%|██████████| 157/157 [02:56<00:00,  1.12s/it]


ALBEF image feats: 5000 in 191.548s


ALBEF rerank: 100%|██████████| 25010/25010 [1:42:56<00:00,  4.05it/s]


ALBEF rerank: 25010 in 6176.617s


In [11]:
# Translate positional rankings into id-keyed dictionaries:
#   i2t: image id   -> caption ids of its retrieved captions, best first
#   t2i: caption id -> image ids of its retrieved images, best first
i2t = {
    image_id: [caption_ids[pos] for pos in ranked_captions]
    for image_id, ranked_captions in zip(image_ids, i2t_rank)
}
t2i = {
    caption_id: [image_ids[pos] for pos in ranked_images]
    for caption_id, ranked_images in zip(caption_ids, t2i_rank)
}

In [12]:

# Score both retrieval directions with the eccv_caption benchmark.
metric = eccv_caption.Metrics()
scores = metric.compute_all_metrics(
    i2t_retrieved_items=i2t,
    t2i_retrieved_items=t2i,
    target_metrics=["coco_5k_recalls",
                    "eccv_map_at_r", "eccv_rprecision", "eccv_r1"],
    # Presumably the recall cut-offs behind the coco_5k_r1/r5/r10 keys read
    # below - verify against the eccv_caption documentation.
    Ks=[1, 5, 10],
    verbose=True,
)
# Only the text-to-image ('t2i') side of each metric is reported here, even
# though image-to-text rankings are passed in as well.
print("COCO 5K T2I recalls:")
print(f"R@1: {scores['coco_5k_r1']['t2i']:.2f}")
print(f"R@5: {scores['coco_5k_r5']['t2i']:.2f}")
print(f"R@10: {scores['coco_5k_r10']['t2i']:.2f}")
print("ECCV T2I metrics:")
print(f"Map@R: {scores['eccv_map_at_r']['t2i']:.2f}")
print(f"R-P: {scores['eccv_rprecision']['t2i']:.2f}")
print(f"R@1: {scores['eccv_r1']['t2i']:.2f}")

100%|██████████| 5000/5000 [00:00<00:00, 1744573.66it/s]
100%|██████████| 25000/25000 [00:00<00:00, 2474515.63it/s]
100%|██████████| 5000/5000 [00:00<00:00, 1091243.63it/s]
100%|██████████| 25000/25000 [00:00<00:00, 1653748.86it/s]
100%|██████████| 5000/5000 [00:00<00:00, 1073590.66it/s]
100%|██████████| 25000/25000 [00:00<00:00, 1353699.97it/s]
100%|██████████| 1261/1261 [00:00<00:00, 60587.17it/s]
100%|██████████| 1332/1332 [00:00<00:00, 121154.84it/s]

COCO 5K T2I recalls:
R@1: 0.52
R@5: 0.76
R@10: 0.84
ECCV T2I metrics:
Map@R: 0.41
R-P: 0.50
R@1: 0.87





In [None]:

import os
import sys

# Location of the local Qwen3-VL-Embedding checkout. The original absolute
# path is kept as the default so existing behaviour is unchanged; set the
# QWEN3_VL_EMBEDDING_DIR environment variable to run on another machine.
QWEN3_VL_REPO = os.environ.get(
    "QWEN3_VL_EMBEDDING_DIR",
    "/Users/weedcuper/Desktop/Degree/HSE_MS/Module3/ML-Project/reqs/Qwen3-VL-Embedding",
)
# Keep the checkout at the front of sys.path (its `src` package must win the
# import lookup) without adding a duplicate entry on every re-run of the cell.
if not sys.path or sys.path[0] != QWEN3_VL_REPO:
    sys.path.insert(0, QWEN3_VL_REPO)
from src.models.qwen3_vl_reranker import Qwen3VLReranker

# NOTE: this rebinds `model`, which earlier cells use for the open_clip
# encoder; re-running those cells afterwards requires re-creating that model.
model = Qwen3VLReranker(model_name_or_path="Qwen/Qwen3-VL-Reranker-8B")


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]