In [1]:
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import login
import torch
import os
import json
from PIL import Image
from tqdm import tqdm
import re
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


In [None]:
# Read the Hugging Face access token from the environment rather than embedding it in the
# notebook, so the secret is never saved or committed together with the code.
your_hf_token = os.environ.get("HF_TOKEN", "")
if your_hf_token:
    login(token=your_hf_token)
else:
    # No token configured: let huggingface_hub ask for one interactively.
    login()

In [None]:
# --- Image preprocessing constants ---
IMAGE_SIZE = 448  # side length, in pixels, of each square tile fed to the model
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# --- Model, dataset and output locations ---
MODEL_PATH = "OpenGVLab/InternVL3_5-8B"
COCO_DIR = "Thesis/coco2014"
IMAGE_DIR = os.path.join(COCO_DIR, "val2014")
ANNOTATION_FILE = os.path.join(COCO_DIR, "annotations", "captions_val2014.json")
OUTPUT_FILE = "internvl3_5_8b_coco_results.json"

# --- Runtime settings ---
BATCH_SIZE = 2  # kept small because of the model's high memory usage
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Generation parameters ---
# do_sample=False selects greedy decoding so repeated runs give consistent captions.
GENERATION_CONFIG = dict(max_new_tokens=50, do_sample=False)


In [4]:
# Drop cached allocator blocks first, then report how much GPU memory is free.
torch.cuda.empty_cache()
if torch.cuda.is_available():
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    print(f"Free GPU memory: {free_bytes / 1024**3:.2f} GB")

Free GPU memory: 23.30 GB


In [5]:
def build_transform(input_size):
    """Return the torchvision pipeline that turns a PIL image into a normalized tensor.

    The pipeline applies, in order: conversion to RGB when the image is in another
    mode, a bicubic resize to ``input_size`` x ``input_size``, ``ToTensor``, and
    per-channel normalization with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Side length of the square output, in pixels.

    Returns:
        A ``T.Compose`` callable.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the candidate grid whose width/height ratio is nearest to ``aspect_ratio``.

    Args:
        aspect_ratio: Width divided by height of the source image.
        target_ratios: Iterable of ``(columns, rows)`` candidates, scanned in order.
        width: Source image width, used only for the tie-break.
        height: Source image height, used only for the tie-break.
        image_size: Side length of one tile.

    Returns:
        The selected ``(columns, rows)`` candidate, or ``(1, 1)`` when no candidate
        is supplied.
    """
    source_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)

        if gap < smallest_gap:
            # Strictly closer candidate always wins.
            smallest_gap, chosen = gap, candidate
            continue

        # On an exact tie, a later candidate takes over only when the source image
        # covers more than half the area of that candidate's tile grid.
        if gap == smallest_gap and source_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize ``image`` onto a grid of square tiles and return the tiles as a list.

    The grid shape is chosen by ``find_closest_aspect_ratio`` among all
    ``(columns, rows)`` pairs whose tile count lies between ``min_num`` and
    ``max_num``. The image is resized to exactly fill that grid and then cropped
    row by row into ``image_size`` x ``image_size`` tiles.

    Args:
        image: PIL image to split.
        min_num: Smallest allowed number of tiles.
        max_num: Largest allowed number of tiles.
        image_size: Side length of each tile, in pixels.
        use_thumbnail: When true and the tile count is not exactly one, a copy of
            the whole image resized to a single tile is appended at the end.

    Returns:
        List of PIL images: the tiles, optionally followed by the thumbnail.
    """
    src_width, src_height = image.size
    src_aspect = src_width / src_height

    # Every (columns, rows) grid with an admissible tile count, ordered by tile count.
    candidate_grids = sorted(
        set(
            (cols, rows)
            for n in range(min_num, max_num + 1)
            for cols in range(1, n + 1)
            for rows in range(1, n + 1)
            if min_num <= cols * rows <= max_num
        ),
        key=lambda g: g[0] * g[1],
    )

    best_grid = find_closest_aspect_ratio(
        src_aspect, candidate_grids, src_width, src_height, image_size
    )
    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    tile_count = best_grid[0] * best_grid[1]

    resized = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    tiles = []
    for index in range(tile_count):
        # Walk the grid left-to-right, top-to-bottom.
        row, col = divmod(index, tiles_per_row)
        crop_box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(resized.crop(crop_box))

    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles

def load_image_for_internvl(image_path, max_num=12):
    """Load one image from disk and turn it into a stack of normalized tile tensors.

    The image is converted to RGB, split by ``dynamic_preprocess`` (with the
    thumbnail enabled) and each resulting tile is passed through
    ``build_transform(IMAGE_SIZE)``.

    Args:
        image_path: Path of the image file to read.
        max_num: Upper bound on the number of tiles, forwarded to
            ``dynamic_preprocess``.

    Returns:
        ``(pixel_values, num_patches)`` where ``pixel_values`` is the stacked tile
        tensor and ``num_patches`` is its size along dimension 0.

    Note:
        If the file cannot be opened or converted, the error is printed and a blank
        white ``IMAGE_SIZE`` x ``IMAGE_SIZE`` image is processed instead, so callers
        still receive a tensor for that path.
    """
    try:
        # Open inside a context manager so the file handle is released as soon as the
        # RGB copy exists, instead of whenever the lazily-loaded image is collected.
        with Image.open(image_path) as source:
            image = source.convert('RGB')
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        # Create a blank image as fallback
        image = Image.new('RGB', (IMAGE_SIZE, IMAGE_SIZE), (255, 255, 255))

    transform = build_transform(input_size=IMAGE_SIZE)
    images = dynamic_preprocess(
        image,
        image_size=IMAGE_SIZE,
        use_thumbnail=True,
        max_num=max_num
    )

    pixel_values = torch.stack([transform(img) for img in images])
    num_patches = pixel_values.size(0)

    return pixel_values, num_patches

def load_batch_images(image_paths, max_num=12):
    """
    Preprocess several images and join their tiles into a single tensor.

    Each path goes through ``load_image_for_internvl``. If that call raises, the
    error is printed and a single blank white tile stands in for the image.

    Args:
        image_paths: Iterable of image file paths.
        max_num: Upper bound on tiles per image, forwarded to the loader.

    Returns:
        pixel_values_batch: All tile tensors concatenated along dimension 0.
        num_patches_list: Number of tiles contributed by each image, in input order.
    """
    tile_tensors = []
    tile_counts = []

    for path in image_paths:
        try:
            tensor, count = load_image_for_internvl(path, max_num)
        except Exception as e:
            print(f"Error loading batch image {path}: {e}")
            # Stand-in: one white tile with a leading batch dimension.
            placeholder = Image.new('RGB', (IMAGE_SIZE, IMAGE_SIZE), (255, 255, 255))
            tensor = build_transform(input_size=IMAGE_SIZE)(placeholder).unsqueeze(0)
            count = 1
        tile_tensors.append(tensor)
        tile_counts.append(count)

    return torch.cat(tile_tensors, dim=0), tile_counts

def load_model_and_tokenizer(model_path, device):
    """Load the model (in eval mode) and its tokenizer from ``model_path``.

    Args:
        model_path: Model identifier or path passed to ``from_pretrained``.
        device: Accepted for the caller's convenience but not read here; placement
            is delegated to ``device_map="auto"``.

    Returns:
        ``(model, tokenizer)``.
    """
    print(f"Loading model: {model_path}")

    # Loading options: bfloat16 weights, reduced CPU memory use while loading,
    # flash attention requested, remote model code trusted, automatic placement.
    model_kwargs = dict(
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=True,
        trust_remote_code=True,
        device_map="auto",
    )
    model = AutoModel.from_pretrained(model_path, **model_kwargs)
    model = model.eval()

    tokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, use_fast=False
    )

    # Report where the first parameter ended up.
    first_param = next(model.parameters())
    print(f"✓ Model loaded successfully on {first_param.device}")
    return model, tokenizer

def generate_captions_batch_real(image_paths, model, tokenizer, generation_config):
    """
    Generate one caption per image, batched when possible and sequential otherwise.

    The whole batch is first sent through ``model.batch_chat``. If anything in that
    attempt raises, the error is printed and each image is retried on its own with
    ``model.chat``.

    Args:
        image_paths: List of image file paths
        model: InternVL model
        tokenizer: Tokenizer
        generation_config: Generation configuration

    Returns:
        List of generated captions, one per entry of ``image_paths`` and in the same
        order. An image that also fails in the sequential retry yields the
        placeholder string "Error generating caption".
    """
    # Nothing to caption: skip the batch attempt and its error path entirely.
    if not image_paths:
        return []

    prompt = '<image>\nGenerate a detailed caption for this image in one sentence.'

    pixel_values_batch = None
    try:
        # Load and preprocess all images in the batch
        pixel_values_batch, num_patches_list = load_batch_images(image_paths, max_num=12)

        # Move to the device the model's parameters live on
        device = next(model.parameters()).device
        pixel_values_batch = pixel_values_batch.to(device, dtype=torch.bfloat16)

        # One identical question per image
        responses = model.batch_chat(
            tokenizer,
            pixel_values_batch,
            questions=[prompt] * len(image_paths),
            generation_config=generation_config,
            num_patches_list=num_patches_list,
            history=None,
            return_history=False
        )
        return [response.strip() for response in responses]
    except Exception as e:
        print(f"Error in batch generation: {e}")

    # The recovery below deliberately runs OUTSIDE the except clause. While an
    # exception is being handled, its traceback keeps the frames of the failed call
    # (and the tensors they reference) alive, so after a CUDA out-of-memory error
    # the retry would start with that memory still held.
    pixel_values_batch = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("Falling back to sequential processing...")
    captions = []
    for img_path in image_paths:
        try:
            pixel_values, _ = load_image_for_internvl(img_path, max_num=12)
            caption = model.chat(
                tokenizer,
                pixel_values.to(next(model.parameters()).device, dtype=torch.bfloat16),
                prompt,
                generation_config=generation_config
            )
            captions.append(caption.strip())
        except Exception as e2:
            print(f"Error processing individual image {img_path}: {e2}")
            # Keep the output aligned with image_paths even when an image fails.
            captions.append("Error generating caption")
    return captions

def extract_image_id(filename):
    """
    Parse the numeric image ID out of a COCO-style filename.

    The ID is the 12-digit run immediately before a ``.jpg``, ``.jpeg`` or ``.png``
    extension (matched case-insensitively) at the end of the name.
    Example: 'COCO_test2014_000000123456.jpg' -> 123456

    Raises:
        ValueError: If the filename does not end in such a pattern.
    """
    found = re.search(r'(\d{12})\.(jpg|jpeg|png)$', filename, re.IGNORECASE)
    if not found:
        raise ValueError(f"Could not extract image ID from filename: {filename}")
    return int(found.group(1))

In [6]:
# Load the model and its tokenizer once; the captioning cells below reuse both objects.
model, tokenizer = load_model_and_tokenizer(MODEL_PATH, DEVICE)

Loading model: OpenGVLab/InternVL3_5-8B


`torch_dtype` is deprecated! Use `dtype` instead!


FlashAttention2 is not installed.


Loading checkpoint shards: 100%|██████████| 4/4 [01:56<00:00, 29.21s/it]


✓ Model loaded successfully on cuda:0


In [8]:
# Gather the image filenames and put them in a deterministic (sorted) order.
image_files = [
    name for name in os.listdir(IMAGE_DIR)
    if name.lower().endswith(('.jpg', '.jpeg', '.png'))
]
print(f"\nFound {len(image_files)} images in {IMAGE_DIR}")
image_files.sort()

# Try generation on the first batch before starting the full run.
print("\nTesting on first batch...")
test_batch_size = min(BATCH_SIZE, len(image_files))
test_files = image_files[:test_batch_size]
test_paths = [os.path.join(IMAGE_DIR, name) for name in test_files]
test_captions = generate_captions_batch_real(test_paths, model, tokenizer, GENERATION_CONFIG)

# Show at most three sample captions.
print("Sample captions from first batch:")
for position, (filename, caption) in enumerate(zip(test_files, test_captions[:3]), start=1):
    print(f"  {position}. {filename}: {caption}")


Found 40504 images in /home/kezouke/Thesis/coco2014/val2014

Testing on first batch...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Error in batch generation: CUDA out of memory. Tried to allocate 2.74 GiB. GPU 0 has a total capacity of 23.57 GiB of which 2.52 GiB is free. Including non-PyTorch memory, this process has 21.04 GiB memory in use. Of the allocated memory 20.37 GiB is allocated by PyTorch, and 364.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Falling back to sequential processing...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Sample captions from first batch:
  1. COCO_val2014_000000000042.jpg: A variety of shoes, including a red pair with a dragon design and a fluffy brown object, are stored in a metal wire basket.
  2. COCO_val2014_000000000073.jpg: A vintage motorcycle with a sidecar, featuring a classic design with a chrome engine, leather seat, and a license plate reading "SV-6260."


In [9]:
# Override the configured batch size: the two-image test batch ran out of CUDA memory
# and fell back to sequential processing, so the full run uses one image per batch.
BATCH_SIZE = 1

In [10]:
# Caption every image, one batch at a time, collecting {"image_id", "caption"} records.
results = []
batch_starts = range(0, len(image_files), BATCH_SIZE)
for batch_idx in tqdm(batch_starts, desc="Processing images"):
    image_paths_batch = []
    image_ids_batch = []

    for filename in image_files[batch_idx:batch_idx + BATCH_SIZE]:
        image_path = os.path.join(IMAGE_DIR, filename)
        try:
            # Check that the file is a readable image and that its name carries an ID.
            with Image.open(image_path) as img:
                img.verify()
            img_id = extract_image_id(filename)
        except Exception as e:
            # Skip this file but keep going with the rest of the batch.
            print(f"Error processing {filename}: {e}")
        else:
            image_paths_batch.append(image_path)
            image_ids_batch.append(img_id)

    # Every file in this batch was rejected: nothing to caption.
    if not image_paths_batch:
        continue

    captions = generate_captions_batch_real(
        image_paths_batch, model, tokenizer, GENERATION_CONFIG
    )

    results.extend(
        {"image_id": image_id, "caption": text}
        for image_id, text in zip(image_ids_batch, captions)
    )

    # Hand cached GPU blocks back between batches.
    torch.cuda.empty_cache()

Processing images:   0%|          | 0/40504 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Processing images:   0%|          | 1/40504 [00:02<29:32:18,  2.63s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Processing images:   0%|          | 2/40504 [00:03<19:01:33,  1.69s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Processing images:   0%|          | 3/40504 [00:04<17:01:11,  1.51s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Processing images:   0%|          | 4/40504 [00:07<21:56:22,  1.95s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Processing images:   0%|          | 5/40504 [00:10<24:01:51,  2.14s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Processing images:   0%|          | 6/40504 [00:11<22:02:53,  1.96s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Pro

In [11]:
# Save results
# Write the collected caption records to OUTPUT_FILE as indented JSON.
print(f"\nSaving results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, 'w') as out_file:
    json.dump(results, fp=out_file, indent=2)
print("✓ Results saved in required format.")


Saving results to internvl3_5_8b_coco_results.json
✓ Results saved in required format.
