In [1]:
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from transformers import AutoModelForCausalLM
from huggingface_hub import login
import torch
import os
import json
from PIL import Image
from tqdm import tqdm
import numpy as np
from deepseek_vl.utils.io import load_pil_images
import re

Python version is above 3.10, patching the collections module.


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the Hugging Face access token from the HF_TOKEN environment variable so
# that no credential is ever typed into (or committed with) this notebook.
your_hf_token = os.environ.get("HF_TOKEN", "")
if your_hf_token:
    login(token=your_hf_token)
else:
    # No token configured: skip the login call instead of passing an empty string.
    print("HF_TOKEN is not set; skipping Hugging Face login.")

In [None]:
# --- Run configuration: model, input directory, output file, batch size ---
MODEL_PATH = "deepseek-ai/deepseek-vl-7b-chat"
IMAGE_DIR = "Thesis/coco2014/test2014"
OUTPUT_FILE = "deepseek_vl_7b_coco_results_test.json"
BATCH_SIZE = 8

# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Text generation settings ---
# do_sample=False means greedy decoding, chosen for consistent output.
GENERATION_PARAMS = dict(max_new_tokens=50, do_sample=False)

In [4]:
# Clear PyTorch's CUDA cache, then report how much device memory is free (in GiB).
torch.cuda.empty_cache()
if torch.cuda.is_available():
    free_bytes = torch.cuda.mem_get_info()[0]
    free_gib = free_bytes / 1024**3
    print(f"GPU Memory: {free_gib:.2f} GB available")

GPU Memory: 23.30 GB available


In [5]:
def load_model_and_processor(model_path, device):
    """Load the DeepSeek-VL chat processor, its tokenizer and the model.

    Args:
        model_path: Model identifier or path handed to ``from_pretrained``.
        device: Accepted for interface compatibility; not used by this
            function, which loads the model with ``device_map="auto"``.

    Returns:
        Tuple ``(model, vl_chat_processor, tokenizer)``; ``.eval()`` has been
        called on the model and the tokenizer is the processor's own.
    """
    print(f"Loading model: {model_path}")

    # Processor first; the tokenizer is read off it rather than loaded separately.
    vl_chat_processor = VLChatProcessor.from_pretrained(model_path)

    # bfloat16 weights, with placement left to device_map="auto".
    load_kwargs = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs).eval()

    # Report the device of the first parameter as a quick placement check.
    first_param_device = next(model.parameters()).device
    print(f"✓ Model loaded successfully on {first_param_device}")
    return model, vl_chat_processor, vl_chat_processor.tokenizer

def generate_captions_batch(image_paths, model, processor, tokenizer, generation_params=None):
    """
    Generate one caption per image using DeepSeek-VL.

    Images are processed one at a time (the "batch" is simply the list of
    paths). The returned list always has the same length and order as
    ``image_paths``, so callers can zip it with their own per-image metadata.

    Args:
        image_paths: List of image file paths
        model: DeepSeek-VL model
        processor: DeepSeek-VL processor
        tokenizer: Tokenizer from processor
        generation_params: Optional dict providing "max_new_tokens" and
            "do_sample". When None, the notebook-level GENERATION_PARAMS
            is used.

    Returns:
        List of generated captions. An image whose processing raises yields
        the placeholder string "Error generating caption" (the error itself
        is printed), so one bad image does not abort the whole run.
    """
    # Resolve the settings once; the notebook-level dict remains the default.
    params = GENERATION_PARAMS if generation_params is None else generation_params
    captions = []

    for img_path in image_paths:
        try:
            # Create conversation format for single image
            conversation = [
                {
                    "role": "User",
                    "content": "<image_placeholder>Generate a detailed caption for this image in one sentence.",
                    "images": [img_path]
                },
                {
                    "role": "Assistant",
                    "content": ""
                }
            ]

            # Load image
            pil_images = load_pil_images(conversation)

            # Prepare inputs
            prepare_inputs = processor(
                conversations=conversation,
                images=pil_images,
                force_batchify=True
            ).to(model.device)

            # Inference only: disable gradient tracking so the embedding step
            # does not record autograd state and hold on to extra memory.
            with torch.no_grad():
                # Get image embeddings
                inputs_embeds = model.prepare_inputs_embeds(**prepare_inputs)

                # Generate response
                outputs = model.language_model.generate(
                    inputs_embeds=inputs_embeds,
                    attention_mask=prepare_inputs.attention_mask,
                    pad_token_id=tokenizer.eos_token_id,
                    bos_token_id=tokenizer.bos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    max_new_tokens=params["max_new_tokens"],
                    do_sample=params["do_sample"],
                    use_cache=True
                )

            # Decode the response and strip surrounding whitespace
            answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
            captions.append(answer.strip())

        except Exception as e:
            # Deliberate best effort: log, then keep the output aligned with the input.
            print(f"Error processing {img_path}: {e}")
            captions.append("Error generating caption")

    return captions

def extract_image_id(filename):
    """
    Parse the numeric image ID out of a COCO test filename.

    The ID is the 12-digit run directly before a .jpg/.jpeg/.png extension
    (matched case-insensitively) at the end of the name.
    Example: 'COCO_test2014_000000123456.jpg' -> 123456

    Raises:
        ValueError: If the filename does not end in that pattern.
    """
    id_match = re.search(r'(\d{12})\.(jpg|jpeg|png)$', filename, re.IGNORECASE)
    if id_match is None:
        raise ValueError(f"Could not extract image ID from filename: {filename}")
    # int() drops the zero padding.
    return int(id_match.group(1))

In [6]:
# Bring up the model together with its processor and tokenizer
model, processor, tokenizer = load_model_and_processor(MODEL_PATH, DEVICE)

# Keep only image files (matched by extension, case-insensitively) from the
# test directory, sorted by name for reproducibility
image_files = sorted(
    filter(
        lambda name: name.lower().endswith(('.jpg', '.jpeg', '.png')),
        os.listdir(IMAGE_DIR),
    )
)
print(f"\nFound {len(image_files)} images in {IMAGE_DIR}")

# Accumulator for the {"image_id", "caption"} records
results = []

Loading model: deepseek-ai/deepseek-vl-7b-chat


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 3/3 [03:27<00:00, 69.27s/it]

✓ Model loaded successfully on cuda:0

Found 40775 images in /home/kezouke/Thesis/coco2014/test2014





In [7]:
# Smoke test: caption only the first batch before committing to the full run
print("\nTesting on first batch...")
test_batch_size = min(BATCH_SIZE, len(image_files))
test_paths = list(
    map(lambda idx: os.path.join(IMAGE_DIR, image_files[idx]), range(test_batch_size))
)
test_captions = generate_captions_batch(test_paths, model, processor, tokenizer)


Testing on first batch...


In [8]:
# Show up to three (filename, caption) pairs from the smoke test;
# zip stops at the shorter of the two slices.
print("Sample captions from first batch:")
sample_pairs = zip(image_files[:test_batch_size], test_captions[:3])
for rank, (filename, caption) in enumerate(sample_pairs, start=1):
    print(f"  {rank}. {filename}: {caption}")

Sample captions from first batch:
  1. COCO_test2014_000000000001.jpg: A cab that says Metropolitan Police on it is parked by a street light.
  2. COCO_test2014_000000000014.jpg: A man on a skateboard is about to cross a street under a sign that says Nice.
  3. COCO_test2014_000000000016.jpg: A baseball player, wearing the number 12, is swinging a bat at a baseball.


In [9]:
# Process all images in batches.
# Start from an empty list on every run of this cell: `results` lives in the
# kernel's global state, so without this reset a re-run would append every
# caption a second time and the saved file would contain duplicates.
results = []

for batch_idx in tqdm(range(0, len(image_files), BATCH_SIZE), desc="Processing batches"):
    batch_files = image_files[batch_idx:batch_idx + BATCH_SIZE]
    image_paths_batch = []
    image_ids_batch = []

    for filename in batch_files:
        image_path = os.path.join(IMAGE_DIR, filename)
        try:
            # Verify image can be opened
            with Image.open(image_path) as img:
                img.verify()
            img_id = extract_image_id(filename)
            # Append path and ID together so both lists stay index-aligned.
            image_paths_batch.append(image_path)
            image_ids_batch.append(img_id)
        except Exception as e:
            # Unreadable file or unparsable name: report it and leave it out of the batch.
            print(f"Error processing {filename}: {e}")

    if not image_paths_batch:
        continue

    # Generate captions (one per path, in the same order)
    captions = generate_captions_batch(image_paths_batch, model, processor, tokenizer)

    # Append to results
    for img_id, caption in zip(image_ids_batch, captions):
        results.append({
            "image_id": img_id,
            "caption": caption
        })

Processing batches: 100%|██████████| 5097/5097 [7:33:50<00:00,  5.34s/it]  


In [10]:
# Write the collected caption records to OUTPUT_FILE as indented JSON
print(f"\nSaving results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, mode='w') as out_file:
    json.dump(results, out_file, indent=2)
print("✓ Results saved in required format.")


Saving results to deepseek_vl_7b_coco_results_test.json
✓ Results saved in required format.
