In [None]:
# Kaggle input directories used by this notebook, keyed by a short dataset name.
# Every value is an absolute path under /kaggle/input.
paths = {
    "refcoco+": "/kaggle/input/refcoco-and-refcoco/refcoco+",
    "refcoco": "/kaggle/input/refcoco-and-refcoco/refcoco",
    "vizwiz_annotations": "/kaggle/input/vizwiz/Annotations",
    "vizwiz_val": "/kaggle/input/vizwiz/val",
    "vizwiz_test": "/kaggle/input/vizwiz/test",
    "vizwiz_train": "/kaggle/input/vizwiz/train",
    "coco2017": "/kaggle/input/coco-2017-dataset/coco2017",
    "vqa_dataset": "/kaggle/input/visual-question-answering-computer-vision-nlp/dataset"
}

In [None]:
from transformers import BertTokenizerFast

# Load the BERT fast tokenizer from the files stored under /kaggle/input/all-files.
tokenizer = BertTokenizerFast.from_pretrained("/kaggle/input/all-files")
print("Tokenizer loaded successfully.")


In [None]:
!pip install --quiet open_clip_torch


In [None]:
import os
import json
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast

# Kaggle input directories used by this cell, keyed by a short name. This rebinds `paths`
# and adds a "tokenizer" entry pointing at the directory with the tokenizer files.
paths = {
    "refcoco+": "/kaggle/input/refcoco-and-refcoco/refcoco+",
    "refcoco": "/kaggle/input/refcoco-and-refcoco/refcoco",
    "vizwiz_annotations": "/kaggle/input/vizwiz/Annotations",
    "vizwiz_val": "/kaggle/input/vizwiz/val",
    "vizwiz_test": "/kaggle/input/vizwiz/test",
    "vizwiz_train": "/kaggle/input/vizwiz/train",
    "coco2017": "/kaggle/input/coco-2017-dataset/coco2017",
    "vqa_dataset": "/kaggle/input/visual-question-answering-computer-vision-nlp/dataset",
    "tokenizer": "/kaggle/input/all-files"
}

# Image preprocessing: resize to 224x224, convert to a tensor, then normalize each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics — confirm they
# are the right ones for whichever image encoder consumes these tensors.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Tokenizer loaded from the directory registered under paths["tokenizer"].
tokenizer = BertTokenizerFast.from_pretrained(paths["tokenizer"])

class CocoCaptionDataset(Dataset):
    """Pairs every COCO caption annotation with its transformed image.

    Args:
        image_dir: Directory containing the images, named ``<image_id zero-padded to 12 digits>.jpg``.
        annotation_file: JSON file whose top-level ``"annotations"`` list holds dicts with
            ``"image_id"`` and ``"caption"`` entries.
        transform: Callable applied to the RGB PIL image.
        tokenizer: Callable invoked as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')`` whose result exposes ``input_ids`` and
            ``attention_mask``.
        max_length: Optional sequence length forwarded to the tokenizer. ``None`` (the
            default) leaves the length up to the tokenizer, as before this parameter existed.
    """

    def __init__(self, image_dir, annotation_file, transform, tokenizer, max_length=None):
        with open(annotation_file, 'r') as f:
            self.annotations = json.load(f)['annotations']
        self.image_dir = image_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of caption annotations (one sample per caption, not per image)."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return ``(image, input_ids, attention_mask)`` for annotation ``idx``."""
        ann = self.annotations[idx]
        img_path = os.path.join(self.image_dir, f"{ann['image_id']:012d}.jpg")
        # Close the underlying file as soon as the pixels have been converted.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        image = self.transform(image)
        caption = self.tokenizer(
            ann['caption'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch axis added by return_tensors='pt'; a bare
        # squeeze() would also drop any other axis that happens to have length 1.
        return image, caption.input_ids.squeeze(0), caption.attention_mask.squeeze(0)

# Build the caption dataset from the COCO train2017 image folder and its caption annotations.
coco_dataset = CocoCaptionDataset(
    image_dir=os.path.join(paths['coco2017'], 'train2017'),
    annotation_file=os.path.join(paths['coco2017'], 'annotations', 'captions_train2017.json'),
    transform=transform,
    tokenizer=tokenizer
)

# Shuffled loader with batches of 8, using the default collate function.
coco_loader = DataLoader(coco_dataset, batch_size=8, shuffle=True)

# Smoke test: print the dataset size and the tensor shapes of a single batch.
print(f"COCO dataset samples: {len(coco_dataset)}")
sample_img, sample_ids, sample_mask = next(iter(coco_loader))
print(f"Sample batch - images: {sample_img.shape}, token_ids: {sample_ids.shape}, attention_mask: {sample_mask.shape}")

In [None]:
import os
import json
import torch
from PIL import Image
from torchvision import transforms
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
from open_clip import create_model_and_transforms

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only the paths needed by this cell; this rebinds the `paths` name.
paths = {
    "coco2017": "/kaggle/input/coco-2017-dataset/coco2017",
    "tokenizer": "/kaggle/input/all-files"  # or wherever your tokenizer files are stored
}


# Image preprocessing: resize to 224x224, convert to a tensor, normalize each channel.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create the ViT-B-32 open_clip model with the "laion2b_s34b_b79k" pretrained tag.
# NOTE(review): `preprocess_train` is returned here but never used in this cell — images go
# through `transform` above instead. Confirm that this normalization matches the model.
model, _, preprocess_train = create_model_and_transforms(
    model_name="ViT-B-32",
    pretrained="laion2b_s34b_b79k",
    device=device
)
model = model.to(device)

# Freeze every parameter of `model.visual` and `model.transformer`; parameters that live
# outside these two submodules are not touched by these loops.
for param in model.visual.parameters():
    param.requires_grad = False
for param in model.transformer.parameters():
    param.requires_grad = False

# BERT fast tokenizer loaded from the directory registered under paths["tokenizer"].
tokenizer = BertTokenizerFast.from_pretrained(paths["tokenizer"])

class CocoCaptionDataset(Dataset):
    """One sample per COCO caption: the transformed image plus the tokenized caption.

    ``annotation_file`` is a JSON document with a top-level ``"annotations"`` list; each
    entry supplies ``"image_id"`` (used to build the 12-digit ``.jpg`` file name inside
    ``image_dir``) and ``"caption"``. Captions are padded/truncated to ``max_length``.
    """

    def __init__(self, image_dir, annotation_file, transform, tokenizer, max_length=77):
        with open(annotation_file, 'r') as handle:
            payload = json.load(handle)
        self.annotations = payload['annotations']
        self.image_dir = image_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Total number of caption annotations."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return ``(image, input_ids, attention_mask)`` for annotation ``idx``."""
        record = self.annotations[idx]
        file_name = f"{record['image_id']:012d}.jpg"
        picture = Image.open(os.path.join(self.image_dir, file_name)).convert('RGB')
        picture = self.transform(picture)
        tokenizer_kwargs = dict(
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoded = self.tokenizer(record['caption'], **tokenizer_kwargs)
        # Drop the leading batch axis that return_tensors='pt' adds.
        ids = encoded.input_ids.squeeze(0)
        mask = encoded.attention_mask.squeeze(0)
        return picture, ids, mask

def collate_fn(batch):
    """Stack per-sample ``(image, token_ids, attention_mask)`` tuples into batch tensors.

    Returns a 3-tuple of tensors, each with a new leading batch axis.
    """
    image_list, id_list, mask_list = zip(*batch)
    return torch.stack(image_list), torch.stack(id_list), torch.stack(mask_list)

# Caption dataset over COCO train2017; captions use the dataset's default max_length.
coco_dataset = CocoCaptionDataset(
    image_dir=os.path.join(paths['coco2017'], 'train2017'),
    annotation_file=os.path.join(paths['coco2017'], 'annotations', 'captions_train2017.json'),
    transform=transform,
    tokenizer=tokenizer
)

coco_loader = DataLoader(coco_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# Optimize only the parameters that still have requires_grad=True.
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)
criterion = nn.CosineEmbeddingLoss()

# Single optimization step as a smoke test: the loop exits after the first batch (`break`).
model.train()
for images, token_ids, attention_masks in coco_loader:
    images = images.to(device)
    token_ids = token_ids.to(device)

    image_features = model.encode_image(images)
    # NOTE(review): `token_ids` come from the BERT tokenizer, but they are fed to the CLIP
    # model's text encoder, which presumably expects ids from its own tokenizer — confirm.
    # `attention_masks` is not used in this loop.
    text_features = model.encode_text(token_ids)

    # Target of 1 for every pair: each image is treated as a match for its own caption.
    labels = torch.ones(image_features.size(0)).to(device)
    loss = criterion(image_features, text_features, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Batch loss: {loss.item()}")
    break

# Report the per-pair cosine similarity of the features computed for that batch
# (these features were produced before the optimizer step).
with torch.no_grad():
    cosine_sim = nn.functional.cosine_similarity(image_features, text_features)
    print(f"Cosine similarity sample: {cosine_sim[:5]}")

In [None]:
import torch
from torch import nn
from PIL import Image
from torchvision import transforms
from open_clip import create_model_and_transforms
from peft import get_peft_model, LoraConfig, TaskType

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the ViT-B-32 open_clip model; `preprocess` is the third value returned by
# create_model_and_transforms and is applied to the demo image later in this cell.
model, _, preprocess = create_model_and_transforms(
    model_name="ViT-B-32",
    pretrained="laion2b_s34b_b79k",
    device=device
)
model = model.to(device)

class VisualCompressor(nn.Module):
    """Project a feature sequence and resample it to a fixed number of tokens.

    Args:
        input_dim: Size of the last axis of the incoming features.
        output_dim: Size of the last axis after the linear projection.
        num_tokens: Sequence length produced by the adaptive average pooling.
    """

    def __init__(self, input_dim=512, output_dim=256, num_tokens=8):
        super().__init__()
        self.proj = nn.Linear(input_dim, output_dim)
        self.pool = nn.AdaptiveAvgPool1d(num_tokens)

    def forward(self, *args, **kwargs):
        """Compress ``x`` of shape ``(batch, seq, input_dim)`` to ``(batch, num_tokens, output_dim)``.

        The tensor is taken from the first positional argument if one is given, otherwise
        from the ``input_ids`` keyword, otherwise from ``inputs_embeds``. The keyword names
        are kept so the module can still be called with keyword arguments (presumably how
        the PEFT wrapper applied in this cell forwards its input — confirm).

        Raises:
            ValueError: If no input tensor is supplied by any of the three routes.
        """
        # Previously positional arguments were accepted by the signature but ignored, so a
        # plain ``compressor(x)`` on the unwrapped module raised ValueError.
        x = args[0] if args else None
        if x is None:
            x = kwargs.get("input_ids", None)
        if x is None:
            x = kwargs.get("inputs_embeds", None)
        if x is None:
            raise ValueError("Input tensor x is required (passed as input_ids or inputs_embeds)")
        x = self.proj(x)
        # AdaptiveAvgPool1d pools over the last axis, so move the sequence axis there and back.
        x = x.transpose(1, 2)
        x = self.pool(x)
        return x.transpose(1, 2)

compressor = VisualCompressor().to(device)

# LoRA configuration targeting the compressor's `proj` layer (rank 8, alpha 32, dropout 0.1).
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION
)

# Wrap the compressor with PEFT; from here on `compressor` refers to the wrapped model.
compressor = get_peft_model(compressor, lora_config)
compressor.print_trainable_parameters()

# Demo on a single COCO training image, preprocessed with the transform returned alongside the model.
image = Image.open("/kaggle/input/coco-2017-dataset/coco2017/train2017/000000000009.jpg").convert("RGB")
image_tensor = preprocess(image).unsqueeze(0).to(device)

with torch.no_grad():
    image_features = model.encode_image(image_tensor)
    # unsqueeze(1) inserts a length-1 sequence axis, so the compressor's adaptive pooling
    # is resampling a single feature vector.
    # NOTE(review): with one input position the pooled tokens may all be identical — confirm.
    image_tokens = compressor(image_features.unsqueeze(1))

print("Compressed visual tokens:", image_tokens.shape)

In [None]:
import os
import torch
import pandas as pd
import json
from PIL import Image
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from open_clip import create_model_and_transforms

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths needed by this cell; note that "tokenizer" now points at GPT-2 files, not BERT's.
paths = {
    "coco2017": "/kaggle/input/coco-2017-dataset/coco2017",
    "vqa_dataset": "/kaggle/input/visual-question-answering-computer-vision-nlp/dataset",
    "refcoco": "/kaggle/input/refcoco-and-refcoco/refcoco",
    "tokenizer": "/kaggle/input/gpt-2-tokens",
    "decoder": "/kaggle/input/gpt-2-model"
}

# Image preprocessing: resize to 224x224, convert to a tensor, normalize each channel.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

# ViT-B-32 open_clip model used as the image encoder.
# NOTE(review): the returned `preprocess` is not used in this cell; `transform` is used instead.
model, _, preprocess = create_model_and_transforms(
    model_name="ViT-B-32",
    pretrained="laion2b_s34b_b79k",
    device=device
)
model = model.to(device)
# Freeze every parameter of `model.visual` and `model.transformer`.
for param in model.visual.parameters():
    param.requires_grad = False
for param in model.transformer.parameters():
    param.requires_grad = False

# GPT-2 tokenizer and language-model decoder, loaded from local files only.
tokenizer = GPT2Tokenizer.from_pretrained(paths["tokenizer"], local_files_only=True)
# The EOS token doubles as the padding token from here on.
tokenizer.pad_token = tokenizer.eos_token
decoder = GPT2LMHeadModel.from_pretrained(paths["decoder"], local_files_only=True).to(device)

class VisualCompressor(nn.Module):
    """Map a feature sequence to ``num_tokens`` vectors of size ``output_dim``.

    The input is linearly projected to ``compressed_dim``, adaptively average-pooled along
    the sequence axis to ``num_tokens`` positions, then linearly expanded to ``output_dim``.
    """

    def __init__(self, input_dim=512, compressed_dim=256, output_dim=768, num_tokens=8):
        super().__init__()
        self.proj = nn.Linear(input_dim, compressed_dim)
        self.pool = nn.AdaptiveAvgPool1d(num_tokens)
        self.expand = nn.Linear(compressed_dim, output_dim)

    def forward(self, x):
        """Return a ``(batch, num_tokens, output_dim)`` tensor for ``x`` of shape ``(batch, seq, input_dim)``."""
        projected = self.proj(x)
        # The pooling layer works on the last axis, hence the transpose round trip.
        pooled = self.pool(projected.transpose(1, 2)).transpose(1, 2)
        return self.expand(pooled)

compressor = VisualCompressor().to(device)

# One pass over the named parameters: only the weight and bias of `proj` remain trainable,
# every other parameter is frozen.
for name, param in compressor.named_parameters():
    param.requires_grad = ("proj.weight" in name) or ("proj.bias" in name)

# Parameter counts, reported as trainable / total / percentage.
total = sum(p.numel() for p in compressor.parameters())
trainable = sum(p.numel() for p in compressor.parameters() if p.requires_grad)
print(f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total:.4f}")

class MultiTaskDataset(Dataset):
    """Concatenation of caption, VQA and grounding samples behind a single index space.

    Indices ``[0, len(coco))`` are caption samples, the next ``len(vqa)`` indices are VQA
    samples, and the remainder are grounding samples. Image locations are resolved through
    the module-level ``paths`` mapping.

    Args:
        coco_file: JSON file with a top-level ``"annotations"`` list (``image_id``, ``caption``).
        vqa_file: CSV file read with pandas; rows are expected to provide ``image_id`` and
            ``question`` columns.
        refcoco_file: JSON file with a top-level ``"annotations"`` list (``image_id`` plus,
            if present, ``caption`` / ``question`` / ``phrase``).
        transform: Callable applied to the RGB PIL image.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and ``attention_mask``.
        max_len: Length every text is padded/truncated to.
    """

    def __init__(self, coco_file, vqa_file, refcoco_file, transform, tokenizer, max_len=128):
        with open(coco_file) as f:
            self.coco = json.load(f)['annotations']
        self.vqa = pd.read_csv(vqa_file).to_dict(orient='records')
        with open(refcoco_file) as f:
            self.refcoco = json.load(f)['annotations']
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Total number of samples across the three sources."""
        return len(self.coco) + len(self.vqa) + len(self.refcoco)

    @staticmethod
    def _clean_text(text):
        """Return ``text`` stripped, or ``"unknown"`` when it is missing, blank or not a string."""
        # Empty CSV cells come back from pandas as float NaN, which has no .strip().
        if not isinstance(text, str):
            return "unknown"
        text = text.strip()
        return text if text else "unknown"

    def __getitem__(self, idx):
        """Return ``(image, input_ids, attention_mask, task)`` where ``task`` names the source."""
        if idx < len(self.coco):
            ann = self.coco[idx]
            task = 'caption'
            img_path = os.path.join(paths['coco2017'], 'train2017', f"{ann['image_id']:012d}.jpg")
            text = ann.get('caption', '')
        elif idx < len(self.coco)+len(self.vqa):
            ann = self.vqa[idx - len(self.coco)]
            task = 'vqa'
            img_path = os.path.join(paths['vqa_dataset'], 'images', f"{ann['image_id']}.png")
            text = ann.get('question', '')
        else:
            ann = self.refcoco[idx - len(self.coco) - len(self.vqa)]
            task = 'grounding'
            img_path = os.path.join(paths['coco2017'], 'train2017', f"{ann['image_id']:012d}.jpg")
            # NOTE(review): if these annotations carry none of the three keys, every
            # grounding sample ends up with the text "unknown" — confirm the file's schema.
            text = ann.get('caption', '') or ann.get('question', '') or ann.get('phrase', '')
        text = self._clean_text(text)
        # Close the underlying file as soon as the pixels have been converted.
        with Image.open(img_path) as raw:
            image = raw.convert("RGB")
        image = self.transform(image)
        enc = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        return image, enc.input_ids.squeeze(0), enc.attention_mask.squeeze(0), task

def collate_fn(batch):
    """Batch ``(image, token_ids, attention_mask, task)`` samples.

    The three tensor fields are stacked along a new leading axis; the task names are
    returned unchanged as a tuple, one entry per sample.
    """
    image_list, id_list, mask_list, tasks = zip(*batch)
    stacked = tuple(torch.stack(group) for group in (image_list, id_list, mask_list))
    return stacked[0], stacked[1], stacked[2], tasks

# Combined caption / VQA / grounding dataset built from the three annotation files.
# Texts use the dataset's default max_len.
dataset = MultiTaskDataset(
    coco_file=os.path.join(paths['coco2017'],'annotations','captions_train2017.json'),
    vqa_file=os.path.join(paths['vqa_dataset'],'data_train.csv'),
    refcoco_file=os.path.join(paths['refcoco'],'instances.json'),
    transform=transform,
    tokenizer=tokenizer
)
# Shuffled batches of 4; collate_fn stacks the tensors and keeps the task names as a tuple.
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# All parameters of the three modules are registered; frozen ones receive no gradient.
optimizer = optim.AdamW(list(model.parameters()) + list(decoder.parameters()) + list(compressor.parameters()), lr=5e-5)
# NOTE(review): `loss_fn` is not used below — the loss comes from the decoder's own
# `labels` handling. It is kept so the name stays defined for any later cell.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

model.train()
decoder.train()
compressor.train()

# Single optimization step as a smoke test: the loop exits after the first batch (`break`).
for images, token_ids, attention_masks, tasks in loader:
    images = images.to(device)
    token_ids = token_ids.to(device)
    attention_masks = attention_masks.to(device)

    # One feature vector per image, given a length-1 sequence axis before compression.
    image_features = model.encode_image(images)
    image_tokens = compressor(image_features.unsqueeze(1))

    # Prefix the text embeddings with the visual tokens.
    text_embeds = decoder.transformer.wte(token_ids)
    inputs_embeds = torch.cat([image_tokens, text_embeds], dim=1)

    visual_mask = torch.ones(image_tokens.size(0), image_tokens.size(1)).to(device)
    attention_mask = torch.cat([visual_mask, attention_masks], dim=1)

    # Label -100 is ignored by the decoder's loss. Visual positions carry no text target,
    # and padded text positions must be excluded too: the pad token is the EOS token, so
    # leaving them in would make the loss mostly about predicting padding.
    visual_labels = torch.full((image_tokens.size(0), image_tokens.size(1)), -100).to(device)
    text_labels = token_ids.masked_fill(attention_masks == 0, -100)
    labels = torch.cat([visual_labels, text_labels], dim=1)

    outputs = decoder(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Batch loss: {loss.item()}")
    break

# Diagnostic printout for the batch processed above.
with torch.no_grad():
    # Mean token embedding per caption (the mean runs over all positions, padding included).
    text_mean = decoder.transformer.wte(token_ids).mean(dim=1)  # [4, 768]
    # NOTE(review): `image_proj` is created here with fresh weights and is never trained,
    # so the similarity printed below does not reflect any learned alignment — confirm
    # this is only meant as a shape/plumbing check.
    image_proj = nn.Linear(512, 768).to(device)
    image_proj.eval()
    image_aligned = image_proj(image_features)  # [4, 768]
    cosine_sim = nn.functional.cosine_similarity(image_aligned, text_mean, dim=-1)
    print(f"Cosine similarity sample: {cosine_sim[:5]}")

In [None]:
# Environment setup for the OV-Seg cells below: system packages, a clone of the ov-seg
# repository installed in editable mode, PyTorch and detectron2, and a checkpoint download.
# NOTE(review): none of the packages are version-pinned, and torch is (re)installed after
# the repository requirements — confirm that the resulting versions are compatible.
!sudo apt update
!sudo apt install -y git wget unzip ffmpeg
!git clone https://github.com/facebookresearch/ov-seg.git
# Subsequent commands run from inside the cloned repository.
%cd ov-seg
!pip install -r requirements.txt
!pip install -e .
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
!pip install git+https://github.com/facebookresearch/detectron2.git@main
!mkdir -p checkpoints
# NOTE(review): verify that this checkpoint URL resolves before relying on it.
!wget -O checkpoints/refcocog_clip.pth https://dl.fbaipublicfiles.com/ovseg/refcocog_clip.pth

In [None]:
# NOTE(review): `<username>` is an unfilled placeholder, so this command cannot succeed as
# written. The previous cell already clones and installs the ov-seg repository — either
# fill in the real repository owner or remove this cell.
pip install git+https://github.com/<username>/ovseg.git


In [None]:
import torch
import torchvision.transforms as T
from PIL import Image
from ovseg.models import build_model
from ovseg.utils import load_config, load_checkpoint
from ovseg.evaluation import compute_iou
import json
import os

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Load Mask2Former + CLIP model
# NOTE(review): this rebinds `model`, which earlier cells use for the open_clip model.
# Also confirm that the installed package really exposes `load_config`, `build_model`
# and `load_checkpoint` with these signatures.
config = load_config("configs/refcocog_clip.yaml")
model = build_model(config).to(device)
model.eval()
load_checkpoint(model, "checkpoints/refcocog_clip.pth")

# Step 2: Load image and text query
image_path = "refcocog/images/image1174.jpg"
image = Image.open(image_path).convert("RGB")
# NOTE(review): this rebinds `transform` to a 512x512 resize; earlier cells use 224x224.
transform = T.Compose([
    T.Resize((512, 512)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
image_tensor = transform(image).unsqueeze(0).to(device)
text_query = "the curtain to the right of the bed"

# Step 3: Run inference
with torch.no_grad():
    outputs = model(image_tensor, text_query)

# The model output is read as a mapping with these three keys.
masks = outputs["pred_masks"]  # [N, H, W]
boxes = outputs["pred_boxes"]  # [N, 4]
scores = outputs["scores"]     # [N]

# Step 4: Load ground truth and compute IoU
def load_gt_boxes(json_path, image_id):
    """Collect the ``"bbox"`` of every annotation in ``json_path`` that belongs to ``image_id``.

    The file is a JSON document with a top-level ``"annotations"`` list whose entries have
    ``"image_id"`` and ``"bbox"`` keys. Boxes are returned in file order; the list is empty
    when no annotation matches.
    """
    with open(json_path) as handle:
        payload = json.load(handle)
    matching = []
    for entry in payload["annotations"]:
        if entry["image_id"] == image_id:
            matching.append(entry["bbox"])
    return matching

# Ground-truth boxes for the same image id as the file name used above (1174).
gt_boxes = load_gt_boxes("refcocog/annotations/instances.json", image_id=1174)
pred_boxes = boxes.cpu().numpy()
# `compute_iou` here is the function imported from `ovseg.evaluation` in this cell.
# NOTE(review): confirm that the predicted boxes and the annotation boxes use the same
# coordinate convention and that the helper accepts arrays/lists of boxes.
ious = compute_iou(pred_boxes, gt_boxes)
mean_iou = ious.mean()

print(f"Detected {len(pred_boxes)} regions")
print(f"Mean IoU on RefCOCOg: {mean_iou:.4f}")

In [None]:
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from PIL import Image, ImageEnhance, ImageFilter
import random
import numpy as np

def apply_occlusion(img, box_size=80):
    """Black out a randomly placed square region of ``img`` and return the image.

    The image is modified in place (pass a copy to keep the original). The square is
    ``box_size`` pixels on a side, clamped to the image dimensions so that images smaller
    than ``box_size`` are fully covered along the short axis instead of raising.

    Args:
        img: Image object exposing ``size`` as ``(width, height)`` and ``paste(color, box)``.
        box_size: Requested side length of the occluding square, in pixels.

    Returns:
        The same ``img`` object, after pasting.
    """
    w, h = img.size
    # Without clamping, random.randint(0, w - box_size) raises ValueError when the image
    # is narrower or shorter than the box.
    box_w = min(box_size, w)
    box_h = min(box_size, h)
    x = random.randint(0, w - box_w)
    y = random.randint(0, h - box_h)
    img.paste((0, 0, 0), (x, y, x + box_w, y + box_h))
    return img

def apply_blur(img, radius=3):
    """Return the result of filtering ``img`` with a Gaussian blur of the given ``radius``."""
    gaussian = ImageFilter.GaussianBlur(radius)
    return img.filter(gaussian)

def apply_lighting(img, factor=0.5):
    """Return ``img`` with its brightness enhanced by ``factor`` (default 0.5)."""
    return ImageEnhance.Brightness(img).enhance(factor)

def apply_domain_shift(img_path):
    """Open the image at ``img_path`` and return it converted to RGB.

    Used in this notebook to load a VizWiz image as the out-of-domain sample; the image
    itself is not altered beyond the RGB conversion.
    """
    loaded = Image.open(img_path)
    return loaded.convert("RGB")

def evaluate_model(model, image, text, transform, tokenizer, task="vqa"):
    """Return the decoder's language-modelling loss for one image/text pair.

    The image is encoded with ``model.encode_image``, compressed into visual tokens by the
    module-level ``compressor``, prefixed to the text embeddings and scored by the
    module-level ``decoder``. Relies on the module-level ``device``, ``compressor`` and
    ``decoder``.

    Args:
        model: Object exposing ``encode_image``.
        image: Image accepted by ``transform``.
        text: Text to score against the image.
        transform: Callable turning ``image`` into a tensor.
        tokenizer: Tokenizer producing ``input_ids`` and ``attention_mask`` tensors.
        task: Currently unused; kept for interface compatibility.

    Returns:
        The loss as a Python float.
    """
    image_tensor = transform(image).unsqueeze(0).to(device)
    enc = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    token_ids = enc.input_ids.to(device)
    attention_mask = enc.attention_mask.to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_tokens = compressor(image_features.unsqueeze(1))
        text_embeds = decoder.transformer.wte(token_ids)
        inputs_embeds = torch.cat([image_tokens, text_embeds], dim=1)
        visual_mask = torch.ones(image_tokens.size(0), image_tokens.size(1)).to(device)
        full_mask = torch.cat([visual_mask, attention_mask], dim=1)
        # Label -100 is ignored by the decoder's loss. Besides the visual prefix, padded
        # text positions are excluded as well; otherwise the text is padded to 128 tokens
        # and the reported loss would mostly measure how well padding is predicted.
        visual_labels = torch.full((image_tokens.size(0), image_tokens.size(1)), -100).to(device)
        text_labels = token_ids.masked_fill(attention_mask == 0, -100)
        labels = torch.cat([visual_labels, text_labels], dim=1)
        outputs = decoder(inputs_embeds=inputs_embeds, attention_mask=full_mask, labels=labels)
        return outputs.loss.item()

# Reference image and query shared by all perturbation variants.
image_path = "refcocog/images/image1174.jpg"
text_query = "the curtain to the right of the bed"
original = Image.open(image_path).convert("RGB")

# Each perturbation works on its own copy so `original` stays untouched.
# NOTE(review): the "vizwiz" entry loads a different image rather than perturbing
# `original`, so its loss is not directly comparable for the same text query — confirm.
perturbations = {
    "original": original,
    "occlusion": apply_occlusion(original.copy()),
    "blur": apply_blur(original.copy()),
    "lighting": apply_lighting(original.copy()),
    "vizwiz": apply_domain_shift("vizwiz/image_0001.jpg")
}

# Score every variant with the same text query and keep the losses by variant name.
# NOTE(review): `model` and `transform` are whatever earlier cells last bound to those
# names — confirm they are the image encoder and preprocessing evaluate_model expects.
losses = {}
for name, img in perturbations.items():
    loss = evaluate_model(model, img, text_query, transform, tokenizer, task="vqa")
    losses[name] = loss
    print(f"{name} loss: {loss:.4f}")

# Report each variant's loss change relative to the unperturbed image (positive = worse).
base = losses["original"]
for k, v in losses.items():
    if k != "original":
        drop = v - base
        # \u0394 is the Greek capital delta; the label was previously mis-encoded.
        print(f"\u0394 Loss ({k}): {drop:.4f}")

In [None]:
import torch
import time
import numpy as np
from sklearn.metrics import average_precision_score

def evaluate_vqa(model, dataset, tokenizer):
    """Return exact-match accuracy (in percent) over ``(image, question, answer)`` samples.

    Relies on the module-level ``device``, ``compressor`` and ``decoder``.

    Args:
        model: Object exposing ``encode_image``.
        dataset: Iterable of ``(image_tensor, question, answer)`` triples.
        tokenizer: Tokenizer used both to encode the question and to decode the prediction.

    Returns:
        ``100 * correct / total`` as a float, or ``0.0`` when ``dataset`` yields no samples.
    """
    correct = 0
    total = 0
    for image, question, answer in dataset:
        image = image.to(device)
        enc = tokenizer(question, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        token_ids = enc.input_ids.to(device)
        attention_mask = enc.attention_mask.to(device)

        with torch.no_grad():
            image_features = model.encode_image(image.unsqueeze(0))
            image_tokens = compressor(image_features.unsqueeze(1))
            text_embeds = decoder.transformer.wte(token_ids)
            inputs_embeds = torch.cat([image_tokens, text_embeds], dim=1)
            visual_mask = torch.ones(image_tokens.size(0), image_tokens.size(1)).to(device)
            full_mask = torch.cat([visual_mask, attention_mask], dim=1)
            # No labels are passed: only the logits are needed here, not a loss.
            outputs = decoder(inputs_embeds=inputs_embeds, attention_mask=full_mask)
            # NOTE(review): this decodes the arg-max token at every input position (visual
            # prefix and padded question included) rather than generating an answer after
            # the question, so exact matches are presumably rare — confirm the intent.
            pred = tokenizer.decode(outputs.logits.argmax(-1)[0], skip_special_tokens=True)
        if pred.strip().lower() == answer.strip().lower():
            correct += 1
        total += 1
    # An empty dataset used to end in ZeroDivisionError.
    if total == 0:
        return 0.0
    return 100 * correct / total

def compute_iou(boxA, boxB):
    """Intersection-over-union of two boxes given as ``(x1, y1, x2, y2)`` corner coordinates.

    Args:
        boxA: First box; indexable with at least four numeric entries.
        boxB: Second box, same layout.

    Returns:
        The IoU as a float in ``[0, 1]`` for well-formed boxes, and ``0.0`` when the union
        area is not positive (e.g. both boxes are degenerate), instead of dividing by zero.
    """
    # Corners of the overlap rectangle.
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # Clamp at zero so disjoint boxes contribute no intersection area.
    interArea = max(0, xB - xA) * max(0, yB - yA)
    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    unionArea = boxAArea + boxBArea - interArea
    if unionArea <= 0:
        return 0.0
    return interArea / float(unionArea)

def evaluate_grounding(model, grounding_dataset):
    """Mean IoU between the model's first predicted box and the ground-truth box.

    ``grounding_dataset`` yields ``(image_tensor, phrase, gt_box)`` triples. For each one
    the model is called on the batched image and the phrase, the first entry of
    ``outputs["pred_boxes"]`` is compared with ``gt_box`` via ``compute_iou``, and the
    scores are averaged with ``np.mean``. Relies on the module-level ``device``.
    """
    def _score(image, phrase, gt_box):
        batched = image.to(device).unsqueeze(0)
        with torch.no_grad():
            prediction = model(batched, phrase)
            first_box = prediction["pred_boxes"][0].cpu().numpy()
            return compute_iou(first_box, gt_box)

    scores = [_score(image, phrase, gt_box) for image, phrase, gt_box in grounding_dataset]
    return np.mean(scores)

def evaluate_retrieval(model, retrieval_dataset, k=10):
    """Text-to-image retrieval metrics over paired ``(image_tensor, text)`` samples.

    Both modalities are embedded with the same CLIP model (``model.encode_image`` /
    ``model.encode_text``) so the two feature sets share one embedding space. Previously
    the text side used the mean of the GPT-2 decoder's token embeddings, whose width
    differs from the image features, so the similarity matrix could not be computed.
    Features are L2-normalized, making the similarity a cosine similarity.
    Relies on the module-level ``device``.

    Args:
        model: open_clip model exposing ``encode_image`` and ``encode_text``.
        retrieval_dataset: Iterable of ``(image_tensor, text)`` pairs; pair ``i`` is the
            only correct match for text ``i``.
        k: Cut-off for recall@k.

    Returns:
        ``(mAP, recall_at_k)``.

    Raises:
        ValueError: If ``retrieval_dataset`` yields no samples.
    """
    # Local import: the file imports open_clip, but only `create_model_and_transforms`.
    from open_clip import get_tokenizer

    # Tokenizer matching the "ViT-B-32" architecture this notebook instantiates.
    clip_tokenize = get_tokenizer("ViT-B-32")

    image_features = []
    text_features = []
    for image, text in retrieval_dataset:
        image = image.to(device)
        with torch.no_grad():
            image_feat = model.encode_image(image.unsqueeze(0)).float()
            text_feat = model.encode_text(clip_tokenize([text]).to(device)).float()
            image_feat = torch.nn.functional.normalize(image_feat, dim=-1).cpu().numpy()
            text_feat = torch.nn.functional.normalize(text_feat, dim=-1).cpu().numpy()
        image_features.append(image_feat)
        text_features.append(text_feat)

    if not image_features:
        raise ValueError("retrieval_dataset is empty; nothing to evaluate")

    image_features = np.vstack(image_features)
    text_features = np.vstack(text_features)
    # Row i holds the similarity of text i to every image.
    sims = np.dot(text_features, image_features.T)

    # The identity matrix marks pair i as the single relevant image for text i.
    mAP = average_precision_score(np.eye(len(sims)), sims)
    recall_at_k = np.mean([int(i in sims[i].argsort()[-k:]) for i in range(len(sims))])
    return mAP, recall_at_k

def evaluate_efficiency(model, image, text):
    """Measure wall-clock latency and peak GPU memory of one forward pass.

    The timed section covers image encoding, visual-token compression, embedding lookup
    and the decoder forward pass. Relies on the module-level ``device``, ``tokenizer``,
    ``compressor`` and ``decoder``.

    Args:
        model: Object exposing ``encode_image``.
        image: Image tensor without a batch axis.
        text: Text prompt to tokenize.

    Returns:
        ``(latency_seconds, peak_memory_mb)``. Memory is ``0.0`` when not running on CUDA,
        since the CUDA allocator statistics are unavailable there.
    """
    image = image.to(device)
    enc = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    token_ids = enc.input_ids.to(device)
    attention_mask = enc.attention_mask.to(device)

    # The notebook falls back to the CPU when no GPU is present; the torch.cuda calls
    # below would fail in that case, so they are guarded.
    on_cuda = image.is_cuda
    if on_cuda:
        torch.cuda.reset_peak_memory_stats()
        # Finish pending kernels so they are not billed to the timed section.
        torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        image_features = model.encode_image(image.unsqueeze(0))
        image_tokens = compressor(image_features.unsqueeze(1))
        text_embeds = decoder.transformer.wte(token_ids)
        inputs_embeds = torch.cat([image_tokens, text_embeds], dim=1)
        visual_mask = torch.ones(image_tokens.size(0), image_tokens.size(1)).to(device)
        full_mask = torch.cat([visual_mask, attention_mask], dim=1)
        decoder(inputs_embeds=inputs_embeds, attention_mask=full_mask)
    if on_cuda:
        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize()
    latency = time.perf_counter() - start
    memory = torch.cuda.max_memory_allocated() / 1e6 if on_cuda else 0.0
    return latency, memory

# Run the four evaluations and print a summary.
# NOTE(review): `vqa_dataset`, `refcocog_dataset`, `retrieval_dataset`, `sample_image` and
# `sample_text` are not defined in the visible cells of this notebook — they must be
# created before this cell can run. Also confirm which object `model` refers to here:
# evaluate_grounding calls it directly, the other three call `model.encode_image`.
vqa_acc = evaluate_vqa(model, vqa_dataset, tokenizer)
iou_score = evaluate_grounding(model, refcocog_dataset)
mAP, recall_k = evaluate_retrieval(model, retrieval_dataset)
latency, memory = evaluate_efficiency(model, sample_image, sample_text)

print(f"VQA Accuracy: {vqa_acc:.2f}%")
print(f"RefCOCOg IoU: {iou_score:.4f}")
print(f"Retrieval mAP: {mAP:.4f}, Recall@10: {recall_k:.4f}")
print(f"Inference latency: {latency:.2f}s, Memory usage: {memory:.2f}MB")