# A Comparative Analysis of Specialist vs. Generalist Vision-Language Models on the Flickr30k Entities Grounding Benchmark

This notebook implements a research project comparing specialist (GLIP) and generalist (LLaVA) vision-language models on the Flickr30k Entities grounding benchmark.

**Inspiration**: This project is directly inspired by the foundational paper "Flickr30k Entities: Collecting region-to-phrase correspondences for richer image-to-sentence models" (Plummer et al., 2015).

**Core Goal**: Test whether modern SOTA "generalist" models (like LLaVA) have incidentally learned fine-grained grounding skills comparable to those of "specialist" models (like GLIP) that were explicitly trained for this task.



In [None]:
%pip install -q -r requirements.txt



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Phase 1: Environment and Data Preparation

### 1.1 Environment Setup


In [None]:
import torch
import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import xml.etree.ElementTree as ET
import os
import re
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import warnings
warnings.filterwarnings('ignore')

# Report the PyTorch build and whether a CUDA device is visible to it.
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Device index 0 is the one queried for its name.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")


### 1.2 Data Acquisition

Download the Flickr30k Entities dataset. You can either:
1. Download manually from https://github.com/BryanPlummer/flickr30k_entities
2. Use the code below to download programmatically


In [None]:
import urllib.request
import zipfile
import tarfile
import requests

os.makedirs('data', exist_ok=True)

annotations_urls = [
    "https://raw.githubusercontent.com/BryanPlummer/flickr30k_entities/master/annotations.zip",
    "https://github.com/BryanPlummer/flickr30k_entities/raw/master/annotations.zip"
]
images_url = "http://shannon.cs.illinois.edu/DenotationGraph/data/flickr30k-images.tar.gz"

annotations_path = "data/annotations.zip"
images_path = "data/flickr30k-images.tar.gz"


def _download_file(url, dest_path, timeout=30, show_progress=False):
    """Stream ``url`` to ``dest_path``, raising on any HTTP or I/O error.

    The payload is written to ``dest_path + '.part'`` and only renamed into
    place once the transfer has finished. An interrupted download therefore
    never leaves a truncated archive at ``dest_path`` that a later run would
    mistake for a complete one.
    """
    part_path = dest_path + ".part"
    report_step = 10 * 1024 * 1024
    try:
        response = requests.get(url, stream=True, timeout=timeout)
        try:
            response.raise_for_status()
            written = 0
            next_report = report_step
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    written += len(chunk)
                    # Chunks are not guaranteed to be exactly 8192 bytes, so
                    # report on crossing a threshold rather than on an exact
                    # multiple of the step.
                    if show_progress and written >= next_report:
                        print(f"Downloaded {written / (1024*1024):.1f} MB...")
                        next_report += report_step
        finally:
            response.close()
        os.replace(part_path, dest_path)
    finally:
        # After a successful rename the .part file is gone; on any failure
        # remove the leftover so it cannot be picked up later.
        if os.path.exists(part_path):
            os.remove(part_path)


def _extract_tar_gz(archive_path, dest_dir):
    """Extract a gzip-compressed tar archive into ``dest_dir``."""
    with tarfile.open(archive_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters reject members that would escape dest_dir;
            # only available on Python versions that ship tarfile.data_filter.
            tar.extractall(dest_dir, filter="data")
        else:
            tar.extractall(dest_dir)


# NOTE(review): this check assumes annotations.zip unpacks into
# data/Flickr30kEntities -- confirm against the actual archive layout.
if not os.path.exists("data/Flickr30kEntities"):
    print("Downloading annotations...")
    for url in annotations_urls:
        try:
            _download_file(url, annotations_path)
        except Exception as e:
            # Best effort: fall through to the next mirror.
            print(f"Failed with {url}: {e}")
        else:
            print("Annotations downloaded!")
            break
    else:
        raise RuntimeError("Failed to download annotations from all URLs")

    print("Extracting annotations...")
    with zipfile.ZipFile(annotations_path, 'r') as zip_ref:
        zip_ref.extractall("data/")
    print("Annotations extracted!")
else:
    print("Annotations already exist.")

if not os.path.exists("data/flickr30k-images"):
    if os.path.exists(images_path):
        print("Found existing images archive, extracting...")
        _extract_tar_gz(images_path, "data/")
        print("Images extracted!")
    else:
        print("Downloading images...")
        try:
            _download_file(images_url, images_path, show_progress=True)
            print("Images downloaded!")
            print("Extracting images...")
            _extract_tar_gz(images_path, "data/")
            print("Images extracted!")
        except Exception as e:
            # The image archive may not be publicly downloadable; explain the
            # manual route instead of aborting the notebook.
            print(f"Failed to download images automatically: {e}")
            print("\n" + "="*60)
            print("MANUAL DOWNLOAD REQUIRED:")
            print("="*60)
            print("The Flickr30k images require manual download.")
            print("Please visit: http://shannon.cs.illinois.edu/DenotationGraph/")
            print("Fill out the form to request access, then download flickr30k-images.tar.gz")
            print(f"Place it at: {images_path}")
            print("Then re-run this cell to extract the images.")
            print("="*60)
            print(f"\nNote: The code will automatically extract the file if you place it at: {images_path}")
else:
    print("Images already exist.")


### 1.3 Annotation Parsing

Parse the XML annotation files to extract image-phrase-bounding-box triplets.


In [None]:
def parse_flickr30k_xml(xml_path):
    """Read one annotation XML file and return its boxed objects.

    Args:
        xml_path: Path to the XML file to parse.

    Returns:
        A list of dicts with keys ``image_filename`` (text of the root
        ``<filename>`` element), ``phrase`` (text of the object's first
        ``<name>`` element) and ``bbox`` (``[xmin, ymin, xmax, ymax]`` as
        ints). Objects lacking a ``<name>`` or a ``<bndbox>`` are skipped.
    """
    # NOTE(review): the <name> text is stored as the phrase; confirm it holds
    # phrase text rather than an entity identifier in this dataset release.
    root = ET.parse(xml_path).getroot()
    image_name = root.find('filename').text

    records = []
    for entity in root.findall('object'):
        label_node = entity.find('name')
        box_node = entity.find('bndbox')
        if label_node is None or box_node is None:
            continue

        corners = [
            int(box_node.find(tag).text)
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        ]
        records.append({
            "image_filename": image_name,
            "phrase": label_node.text,
            "bbox": corners
        })

    return records


In [None]:
annotations_dir = "data/Flickr30kEntities/Annotations"
all_groundings = []

# os.listdir() returns entries in arbitrary order. Sorting fixes the row order
# of the resulting table, which a seeded random sample of that table needs in
# order to pick the same rows on every machine.
xml_files = sorted(f for f in os.listdir(annotations_dir) if f.endswith('.xml'))
print(f"Found {len(xml_files)} XML annotation files")

for xml_file in tqdm(xml_files, desc="Parsing XML files"):
    xml_path = os.path.join(annotations_dir, xml_file)
    groundings = parse_flickr30k_xml(xml_path)
    all_groundings.extend(groundings)

print(f"\nTotal groundings extracted: {len(all_groundings)}")
# Guard the preview so an empty annotations directory does not raise IndexError.
if all_groundings:
    print(f"Sample grounding: {all_groundings[0]}")


### 1.4 Test Set Creation

Create a reproducible 500-sample test set.


In [None]:
df_all = pd.DataFrame(all_groundings)
print(f"Full dataset size: {len(df_all)} rows")

# Sampling without replacement raises ValueError when n exceeds the number of
# rows, so cap the request at what is actually available.
sample_size = min(500, len(df_all))
# A fixed random_state makes the draw repeatable for a given row order.
test_set = df_all.sample(n=sample_size, random_state=42)
test_set = test_set.reset_index(drop=True)

test_set.to_csv('test_set.csv', index=False)
print(f"Test set created: {len(test_set)} rows")
print(f"Saved to test_set.csv")
print(f"\nTest set preview:")
print(test_set.head())


## Phase 2: Helper Functions and Model Setup

### 2.1 Intersection over Union (IoU) Function


In [None]:
def calculate_iou(boxA, boxB):
    """Return the intersection-over-union of two boxes.

    Args:
        boxA: Box indexed as ``[xmin, ymin, xmax, ymax]``.
        boxB: Box indexed as ``[xmin, ymin, xmax, ymax]``.

    Returns:
        Intersection area divided by union area, or ``0.0`` when the union
        area is zero.
    """
    # Overlap extent along each axis, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    intersection = overlap_w * overlap_h

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = area_a + area_b - intersection

    # Two zero-area boxes have an empty union; avoid dividing by zero.
    return 0.0 if union == 0 else intersection / union


### 2.2 Visualization Function


In [None]:
def visualize_results(image, phrase, truth_box, pred_box, iou_score):
    """Show an image with the ground-truth and predicted boxes drawn on it.

    Args:
        image: Image passed to ``ax.imshow``.
        phrase: Phrase shown in the figure title.
        truth_box: Ground-truth box indexed as ``[xmin, ymin, xmax, ymax]``;
            drawn in green.
        pred_box: Predicted box in the same layout; drawn in red.
        iou_score: IoU value shown in the title with two decimals.
    """
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    ax.imshow(image)

    # Ground truth is added first, then the prediction.
    overlays = (
        (truth_box, 'green', 'Ground Truth'),
        (pred_box, 'red', 'Model Prediction'),
    )
    for box, colour, legend_name in overlays:
        outline = plt.Rectangle(
            (box[0], box[1]),
            box[2] - box[0],
            box[3] - box[1],
            linewidth=3, edgecolor=colour, facecolor='none', label=legend_name
        )
        ax.add_patch(outline)

    ax.set_title(f"Phrase: '{phrase}' (IoU: {iou_score:.2f})", fontsize=12)
    ax.legend(loc='upper right')
    ax.axis('off')
    plt.tight_layout()
    plt.show()


## Phase 3: Experiment 1 (Specialist Model: GLIP)

### 3.1 Load Model


In [None]:
# NOTE(review): confirm this checkpoint id resolves on the Hugging Face Hub.
model_id = "VincentL/glip-base-patch16-224"

# Use the GPU when PyTorch can see one; the LLaVA cells reuse `device`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

glip_processor = AutoProcessor.from_pretrained(model_id)
glip_model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)

# Move the weights to the chosen device and switch to inference mode.
glip_model.to(device)
glip_model.eval()
print(f"GLIP model loaded on {device}")


### 3.2 Run Analysis


In [None]:
import ast

test_set = pd.read_csv('test_set.csv')
glip_results = []

images_dir = "data/flickr30k-images"

for idx, row in tqdm(test_set.iterrows(), total=len(test_set), desc="Running GLIP"):
    image_filename = row['image_filename']
    phrase = row['phrase']
    # The bbox column comes back from the CSV as text such as "[1, 2, 3, 4]";
    # literal_eval parses it without executing arbitrary code.
    truth_box = ast.literal_eval(row['bbox'])

    image_path = os.path.join(images_dir, image_filename)
    if not os.path.exists(image_path):
        # Rows whose image is missing on disk are left out of the results.
        continue

    # Close the file handle as soon as the pixels are decoded.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')
    width, height = image.size

    inputs = glip_processor(text=phrase, images=image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = glip_model(**inputs)

    results = glip_processor.post_process_object_detection(outputs, threshold=0.0)

    detections = results[0] if len(results) > 0 else None
    if detections is not None and len(detections['boxes']) > 0:
        # Detections are not guaranteed to be ordered by confidence, so pick
        # the highest-scoring box instead of whichever happens to come first.
        best_idx = int(detections['scores'].argmax())
        pred_box_normalized = detections['boxes'][best_idx].cpu().numpy()
        # NOTE(review): the scaling below assumes boxes are normalised to
        # [0, 1] when no target_sizes are passed -- confirm for this processor.
        pred_box = [
            int(pred_box_normalized[0] * width),
            int(pred_box_normalized[1] * height),
            int(pred_box_normalized[2] * width),
            int(pred_box_normalized[3] * height)
        ]
    else:
        # No detection: a zero-area box yields an IoU of 0.
        pred_box = [0, 0, 0, 0]

    iou_score = calculate_iou(truth_box, pred_box)

    glip_results.append({
        'image_filename': image_filename,
        'phrase': phrase,
        'truth_box': truth_box,
        'pred_box': pred_box,
        'iou': iou_score
    })


### 3.3 Report GLIP Results


In [None]:
# Summarise the per-sample GLIP scores collected in `glip_results`.
glip_df = pd.DataFrame(glip_results)
glip_iou = glip_df['iou']
mean_iou = glip_iou.mean()
# Share of samples whose IoU is strictly above 0.5, as a percentage.
accuracy = (glip_iou > 0.5).sum() / len(glip_df) * 100

banner = "=" * 50
print(banner)
print("GLIP Results (Specialist Model)")
print(banner)
print(f"Mean IoU: {mean_iou:.4f}")
print(f"Accuracy (IoU > 0.5): {accuracy:.2f}%")
print(f"Total samples evaluated: {len(glip_df)}")
print(banner)


## Phase 4: Experiment 2 (Generalist Model: LLaVA)

### 4.1 Load Model


In [None]:
llava_model_id = "llava-hf/llava-1.5-7b-hf"
llava_processor = LlavaProcessor.from_pretrained(llava_model_id)

# Load the weights in half precision with the low-CPU-memory loading path.
llava_load_options = {
    "torch_dtype": torch.float16,
    "low_cpu_mem_usage": True,
}
llava_model = LlavaForConditionalGeneration.from_pretrained(
    llava_model_id, **llava_load_options
)

# Move to the device chosen when the GLIP model was loaded; inference only.
llava_model.to(device)
llava_model.eval()
print(f"LLaVA model loaded on {device}")


### 4.2 Create Text-Parsing Helper


In [None]:
def parse_bbox_from_text(text):
    """Extract the first ``[a, b, c, d]`` group of non-negative integers.

    Args:
        text: String to search.

    Returns:
        A list of four ints taken from the first match, or ``None`` when the
        text contains no such bracketed group.
    """
    # NOTE(review): only digit runs are matched, so decimal coordinates such
    # as "[0.1, 0.2, 0.3, 0.4]" count as "no box" -- confirm that is intended.
    found = re.search(r'\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]', text)
    if found is None:
        return None
    return [int(number) for number in found.groups()]


### 4.3 Run Analysis


In [None]:
import ast

test_set = pd.read_csv('test_set.csv')
llava_results = []

images_dir = "data/flickr30k-images"

for idx, row in tqdm(test_set.iterrows(), total=len(test_set), desc="Running LLaVA"):
    image_filename = row['image_filename']
    phrase = row['phrase']
    # The bbox column comes back from the CSV as text such as "[1, 2, 3, 4]";
    # literal_eval parses it without executing arbitrary code.
    truth_box = ast.literal_eval(row['bbox'])

    image_path = os.path.join(images_dir, image_filename)
    if not os.path.exists(image_path):
        # Rows whose image is missing on disk are left out of the results.
        continue

    # Close the file handle as soon as the pixels are decoded.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    prompt = f"USER: <image>\nWhat is the bounding box [xmin, ymin, xmax, ymax] for the phrase: '{phrase}'? Respond with *only* the bounding box.\nASSISTANT:"

    # Keyword arguments keep the call independent of the processor's
    # positional parameter order. Passing the model dtype casts the floating
    # point inputs (the pixel values) to match the half-precision weights.
    inputs = llava_processor(
        text=prompt, images=image, return_tensors="pt"
    ).to(device, llava_model.dtype)

    with torch.no_grad():
        generate_ids = llava_model.generate(**inputs, max_new_tokens=20)

    generated_text = llava_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    pred_box = parse_bbox_from_text(generated_text)

    if pred_box is None:
        # No parseable box in the reply: score it as a miss.
        iou_score = 0.0
        pred_box = [0, 0, 0, 0]
    else:
        iou_score = calculate_iou(truth_box, pred_box)

    llava_results.append({
        'image_filename': image_filename,
        'phrase': phrase,
        'truth_box': truth_box,
        'pred_box': pred_box,
        'iou': iou_score,
        'generated_text': generated_text
    })


### 4.4 Report LLaVA Results


In [None]:
# Summarise the per-sample LLaVA scores collected in `llava_results`.
llava_df = pd.DataFrame(llava_results)
mean_iou = llava_df['iou'].mean()
# Share of samples whose IoU is strictly above 0.5, as a percentage.
accuracy = (llava_df['iou'] > 0.5).sum() / len(llava_df) * 100
# A format failure is a reply with no parseable box. IoU == 0.0 is not a
# reliable proxy, because a well-formed box that misses the target also
# scores 0.0, so re-check the stored reply text instead.
format_failures = int(
    llava_df['generated_text']
    .map(lambda reply: parse_bbox_from_text(reply) is None)
    .sum()
)

print("=" * 50)
print("LLaVA Results (Generalist Model)")
print("=" * 50)
print(f"Mean IoU: {mean_iou:.4f}")
print(f"Accuracy (IoU > 0.5): {accuracy:.2f}%")
print(f"Format failures (no box output): {format_failures} ({format_failures/len(llava_df)*100:.2f}%)")
print(f"Total samples evaluated: {len(llava_df)}")
print("=" * 50)


## Phase 5: Final Comparison and Qualitative Analysis

### 5.1 Quantitative Head-to-Head Comparison


In [None]:
# One row per model: mean IoU and the percentage of samples with IoU > 0.5.
per_model = [
    ('GLIP (Specialist)', glip_df),
    ('LLaVA (Generalist)', llava_df),
]
comparison_df = pd.DataFrame({
    'Model': [label for label, _ in per_model],
    'Mean IoU': [frame['iou'].mean() for _, frame in per_model],
    'Accuracy (IoU > 0.5)': [
        (frame['iou'] > 0.5).sum() / len(frame) * 100
        for _, frame in per_model
    ]
})

rule = "=" * 60
print(rule)
print("QUANTITATIVE COMPARISON")
print(rule)
print(comparison_df.to_string(index=False))
print(rule)


### 5.2 Qualitative Failure Analysis


In [None]:
# Pair each GLIP result with the LLaVA result for the same sample.
# (image_filename, phrase) alone is not a unique key when one phrase has
# several ground-truth boxes; merging on it would cross-join those rows and
# pair an IoU with the wrong truth box. The ground-truth box is therefore
# added to the key, as text because list values cannot be used as merge keys.
glip_side = glip_df[['image_filename', 'phrase', 'truth_box', 'pred_box', 'iou']].assign(
    _truth_key=glip_df['truth_box'].map(str)
)
llava_side = llava_df[['image_filename', 'phrase', 'pred_box', 'iou']].assign(
    _truth_key=llava_df['truth_box'].map(str)
)
merged_df = pd.merge(
    glip_side,
    llava_side,
    on=['image_filename', 'phrase', '_truth_key'],
    suffixes=('_glip', '_llava')
).drop(columns='_truth_key')

images_dir = "data/flickr30k-images"


#### Easy Success: Both models perform well (IoU > 0.7)


In [None]:
import ast


def _as_box(value):
    """Return a box as-is, or parse it safely when it is stored as text."""
    # literal_eval only accepts Python literals, unlike eval.
    return ast.literal_eval(value) if isinstance(value, str) else value


# Samples on which both models score an IoU above 0.7.
easy_success = merged_df[(merged_df['iou_glip'] > 0.7) & (merged_df['iou_llava'] > 0.7)]
if not easy_success.empty:
    sample = easy_success.iloc[0]
    image_path = os.path.join(images_dir, sample['image_filename'])
    image = Image.open(image_path).convert('RGB')

    print(f"Example: '{sample['phrase']}'")
    print(f"GLIP IoU: {sample['iou_glip']:.3f}, LLaVA IoU: {sample['iou_llava']:.3f}")
    # The figure shows LLaVA's prediction against the ground truth.
    visualize_results(
        image,
        sample['phrase'],
        _as_box(sample['truth_box']),
        _as_box(sample['pred_box_llava']),
        sample['iou_llava']
    )
else:
    print("No easy success cases found.")


#### GLIP Win: GLIP succeeds but LLaVA fails (GLIP IoU > 0.7, LLaVA IoU < 0.2)


In [None]:
import ast


def _as_box(value):
    """Return a box as-is, or parse it safely when it is stored as text."""
    # literal_eval only accepts Python literals, unlike eval.
    return ast.literal_eval(value) if isinstance(value, str) else value


# Samples where GLIP scores above 0.7 and LLaVA scores below 0.2.
glip_win = merged_df[(merged_df['iou_glip'] > 0.7) & (merged_df['iou_llava'] < 0.2)]
if not glip_win.empty:
    sample = glip_win.iloc[0]
    image_path = os.path.join(images_dir, sample['image_filename'])
    image = Image.open(image_path).convert('RGB')

    print(f"Example: '{sample['phrase']}'")
    print(f"GLIP IoU: {sample['iou_glip']:.3f}, LLaVA IoU: {sample['iou_llava']:.3f}")
    print("Showing LLaVA's prediction (failure case):")
    visualize_results(
        image,
        sample['phrase'],
        _as_box(sample['truth_box']),
        _as_box(sample['pred_box_llava']),
        sample['iou_llava']
    )
else:
    print("No GLIP win cases found.")


#### Total Failure: Both models fail (IoU < 0.2)


In [None]:
import ast


def _as_box(value):
    """Return a box as-is, or parse it safely when it is stored as text."""
    # literal_eval only accepts Python literals, unlike eval.
    return ast.literal_eval(value) if isinstance(value, str) else value


# Samples on which both models score an IoU below 0.2.
total_failure = merged_df[(merged_df['iou_glip'] < 0.2) & (merged_df['iou_llava'] < 0.2)]
if not total_failure.empty:
    sample = total_failure.iloc[0]
    image_path = os.path.join(images_dir, sample['image_filename'])
    image = Image.open(image_path).convert('RGB')

    print(f"Example: '{sample['phrase']}'")
    print(f"GLIP IoU: {sample['iou_glip']:.3f}, LLaVA IoU: {sample['iou_llava']:.3f}")
    print("Showing LLaVA's prediction:")
    visualize_results(
        image,
        sample['phrase'],
        _as_box(sample['truth_box']),
        _as_box(sample['pred_box_llava']),
        sample['iou_llava']
    )
else:
    print("No total failure cases found.")


#### LLaVA Format Failure: LLaVA failed to output a bounding box (IoU = 0.0)


In [None]:
# Select replies in which no box could be parsed. Filtering on IoU == 0.0
# would also pick up well-formed boxes that simply miss the target, so the
# stored reply text is re-checked instead.
no_box_mask = llava_df['generated_text'].map(
    lambda reply: parse_bbox_from_text(reply) is None
).astype(bool)
format_failure = llava_df[no_box_mask]
if not format_failure.empty:
    sample = format_failure.iloc[0]
    print(f"Example phrase: '{sample['phrase']}'")
    print(f"Generated text: {sample['generated_text']}")
    print("\nThis demonstrates LLaVA's format compliance issues.")
else:
    print("No format failure cases found.")


### 5.3 Final Conclusions

Based on our quantitative and qualitative analysis, we report the following key findings:

(IN PROGRESS)

## Repository

This project is available at: https://github.com/krishnankonda/vlm-grounding-analysis