In [1]:
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from transformers import AutoModelForCausalLM
from huggingface_hub import login
import torch
import os
import json
from PIL import Image
from tqdm import tqdm
import numpy as np
from deepseek_vl2.utils.io import load_pil_images
import re

Python version is above 3.10, patching the collections module.


In [2]:
# Report the library versions this notebook was executed with
print(
    f"Numpy version: {np.__version__}",
    f"Torch version: {torch.__version__}",
    sep="\n",
)

Numpy version: 1.26.4
Torch version: 2.0.1+cu118


In [None]:
# Never paste an access token into the notebook: it would be saved with the
# file and leak through version control. Read it from the HF_TOKEN
# environment variable instead; when that is unset or blank, pass None so
# that `login` asks for the token interactively.
your_hf_token = os.environ.get("HF_TOKEN", "").strip()
login(token=your_hf_token or None)

In [None]:
# Configuration
MODEL_PATH = "deepseek-ai/deepseek-vl2-tiny"  # identifier passed to from_pretrained
IMAGE_DIR = "Thesis/coco2014/test2014"  # directory scanned for image files
OUTPUT_FILE = "deepseek_vl2_tiny_coco_results_test.json"  # JSON file the results are written to
BATCH_SIZE = 1  # number of images handed to generate_captions_batch per loop step
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Generation parameters (single definition; read by generate_captions_batch)
GENERATION_PARAMS = {
    "max_new_tokens": 50,
    "do_sample": False,  # Use greedy decoding for consistency
}

# GPU housekeeping only makes sense when a CUDA device is present.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"Free GPU memory: {torch.cuda.mem_get_info()[0] / 1024**3:.2f} GB")

Free GPU memory: 23.30 GB


In [5]:
def load_model_and_processor(model_path, device):
    """
    Load the DeepSeek-VL2 processor, its tokenizer and the causal-LM model.

    Args:
        model_path: Identifier or path handed to ``from_pretrained`` for both
            the processor and the model.
        device: Accepted for interface compatibility but not read here;
            placement is delegated to ``device_map="auto"``.

    Returns:
        Tuple ``(model, processor, tokenizer)`` with the model in eval mode.
    """
    print(f"Loading model: {model_path}")

    # The tokenizer is taken from the processor rather than loaded separately.
    chat_processor = DeepseekVLV2Processor.from_pretrained(model_path)
    text_tokenizer = chat_processor.tokenizer

    # Weights are loaded in bfloat16.
    load_kwargs = dict(
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
    model = model.eval()

    print(f"✓ Model loaded successfully on {next(model.parameters()).device}")
    return model, chat_processor, text_tokenizer

def generate_captions_batch(image_paths, model, processor, tokenizer):
    """
    Generate one caption per image using DeepSeek-VL2.

    Images are processed sequentially (one forward pass per image). Decoding
    settings are read from the module-level ``GENERATION_PARAMS`` dict.

    Args:
        image_paths: List of image file paths
        model: DeepSeek-VL2 model
        processor: DeepSeek-VL2 processor
        tokenizer: Tokenizer from processor

    Returns:
        List of generated captions, in the same order and of the same length
        as ``image_paths``. When any step fails for an image, the error is
        printed and the placeholder ``"Error generating caption"`` is stored
        for that image so the output stays aligned with the input.
    """
    # Single source of truth for the instruction text: it is used both to
    # build the prompt and to strip a possible echo from the decoded answer.
    prompt = "Generate a detailed caption for this image in one sentence."
    captions = []

    for img_path in image_paths:
        try:
            # Single-image conversation; the empty assistant turn is the slot
            # the model is asked to complete.
            conversation = [
                {
                    "role": "<|User|>",
                    "content": f"<image>\n{prompt}",
                    "images": [img_path]
                },
                {
                    "role": "<|Assistant|>",
                    "content": ""
                }
            ]

            # Load image
            pil_images = load_pil_images(conversation)

            # Prepare inputs with system prompt (can be empty)
            prepare_inputs = processor(
                conversations=conversation,
                images=pil_images,
                force_batchify=True,
                system_prompt=""  # Empty system prompt for captioning task
            ).to(model.device)

            # Pure inference: disable autograd so the embedding preparation
            # does not record a graph for every image.
            with torch.no_grad():
                # Get image embeddings
                inputs_embeds = model.prepare_inputs_embeds(**prepare_inputs)

                # Generate response - call generate() directly on model
                outputs = model.generate(
                    inputs_embeds=inputs_embeds,
                    attention_mask=prepare_inputs.attention_mask,
                    pad_token_id=tokenizer.eos_token_id,
                    bos_token_id=tokenizer.bos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    max_new_tokens=GENERATION_PARAMS["max_new_tokens"],
                    do_sample=GENERATION_PARAMS["do_sample"],
                    use_cache=True
                )

            # Decode the response
            answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)

            # Clean up the caption: drop everything up to and including an
            # echoed prompt, if the decoded text contains one.
            caption = answer.strip()
            if prompt in caption:
                caption = caption.split(prompt)[-1].strip()

            captions.append(caption)

        except Exception as e:
            # Deliberate best-effort: one bad image must not abort a long run.
            print(f"Error processing {img_path}: {e}")
            captions.append("Error generating caption")

    return captions

def extract_image_id(filename):
    """
    Parse the numeric COCO image ID out of a test-set filename.

    The ID is the 12-digit run immediately before a ``.jpg``/``.jpeg``/``.png``
    extension (case-insensitive); leading zeros are dropped by the int cast.
    Example: 'COCO_test2014_000000123456.jpg' -> 123456

    Raises:
        ValueError: if the filename does not end in such a pattern.
    """
    id_match = re.search(r'(\d{12})\.(jpg|jpeg|png)$', filename, re.IGNORECASE)
    if id_match is None:
        raise ValueError(f"Could not extract image ID from filename: {filename}")
    return int(id_match.group(1))

In [6]:
# Build the model, its chat processor and the tokenizer from the configured checkpoint
model, processor, tokenizer = load_model_and_processor(
    model_path=MODEL_PATH,
    device=DEVICE,
)

Loading model: deepseek-ai/deepseek-vl2-tiny


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Add pad token = ['<｜▁pad▁｜>'] to the tokenizer
<｜▁pad▁｜>:2
Add image token = ['<image>'] to the tokenizer
<image>:128815
Add grounding-related tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>'] to the tokenizer with input_ids
<|ref|>:128816
<|/ref|>:128817
<|det|>:128818
<|/det|>:128819
<|grounding|>:128820
Add chat tokens = ['<|User|>', '<|Assistant|>'] to the tokenizer with input_ids
<|User|>:128821
<|Assistant|>:128822

✓ Model loaded successfully on cuda:0


In [7]:
# Collect every image file in the test directory, sorted by name so the
# processing order is reproducible between runs.
image_extensions = ('.jpg', '.jpeg', '.png')
image_files = sorted(
    entry for entry in os.listdir(IMAGE_DIR)
    if entry.lower().endswith(image_extensions)
)
print(f"\nFound {len(image_files)} images in {IMAGE_DIR}")


Found 40775 images in /home/kezouke/Thesis/coco2014/test2014


In [8]:
# Accumulator for the {"image_id", "caption"} records of the full run.
results = []

# Smoke test: caption the first batch before committing to the multi-hour run.
print("\nTesting on first batch...")
test_batch_size = min(BATCH_SIZE, len(image_files))
test_files = image_files[:test_batch_size]
test_paths = [os.path.join(IMAGE_DIR, name) for name in test_files]
test_captions = generate_captions_batch(test_paths, model, processor, tokenizer)

# Print at most `preview_limit` samples; the same limit is applied to both
# sequences so filenames and captions are always paired consistently.
preview_limit = 3
print("Sample captions from first batch:")
preview = zip(test_files[:preview_limit], test_captions[:preview_limit])
for i, (filename, caption) in enumerate(preview, start=1):
    print(f"  {i}. {filename}: {caption}")

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



Testing on first batch...


  return F.conv2d(input, weight, bias, self.stride,


Sample captions from first batch:
  1. COCO_test2014_000000000001.jpg: A yellow taxi cab is parked next to a parking meter in front of a white truck.


In [9]:
# Process all images in batches.
# Start from an empty list so that re-running this cell never appends a
# second copy of every record to a list left over from a previous run.
results = []

for batch_idx in tqdm(range(0, len(image_files), BATCH_SIZE), desc="Processing batches"):
    batch_files = image_files[batch_idx:batch_idx + BATCH_SIZE]
    image_paths_batch = []
    image_ids_batch = []

    for filename in batch_files:
        image_path = os.path.join(IMAGE_DIR, filename)
        try:
            # Verify image can be opened
            with Image.open(image_path) as img:
                img.verify()
            img_id = extract_image_id(filename)
            # Both lists are appended only after both checks succeeded, so
            # paths and ids stay index-aligned.
            image_paths_batch.append(image_path)
            image_ids_batch.append(img_id)
        except Exception as e:
            # Unreadable files and unparsable names are reported and skipped;
            # they produce no record in `results`.
            print(f"Error processing {filename}: {e}")
            continue

    if not image_paths_batch:
        continue

    # Generate captions (one per path, in the same order)
    captions = generate_captions_batch(image_paths_batch, model, processor, tokenizer)

    # Append to results
    results.extend(
        {"image_id": img_id, "caption": caption}
        for img_id, caption in zip(image_ids_batch, captions)
    )

Processing batches: 100%|██████████| 40775/40775 [3:39:54<00:00,  3.09it/s]  


In [10]:
# Persist the collected records as an indented JSON array
print(f"\nSaving results to {OUTPUT_FILE}")
serialized_results = json.dumps(results, indent=2)
with open(OUTPUT_FILE, 'w') as out_handle:
    out_handle.write(serialized_results)
print("✓ Results saved in required format.")


Saving results to deepseek_vl2_tiny_coco_results_test.json
✓ Results saved in required format.
