In [1]:
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from transformers import AutoModelForCausalLM
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from huggingface_hub import login
import torch
import os
import json
from PIL import Image
from tqdm import tqdm
import numpy as np
from deepseek_vl.utils.io import load_pil_images

Python version is above 3.10, patching the collections module.


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Take the Hugging Face token from the HF_TOKEN environment variable so that no
# credential is ever written into (or committed with) the notebook.
# An unset or empty variable becomes None, in which case login() asks for the
# token interactively instead of being handed an empty string.
your_hf_token = os.environ.get("HF_TOKEN") or None
login(token=your_hf_token)

In [None]:
# --- Model and dataset locations ---
MODEL_PATH = "deepseek-ai/deepseek-vl-7b-chat"
COCO_DIR = "Thesis/coco2014"
ANNOTATION_FILE = os.path.join(COCO_DIR, "annotations", "captions_val2014.json")
IMAGE_DIR = os.path.join(COCO_DIR, "val2014")
OUTPUT_FILE = "deepseek_vl_7b_coco_results.json"

# --- Runtime settings ---
BATCH_SIZE = 16
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Decoding settings ---
# Sampling is switched off (greedy decoding) for consistency between runs.
GENERATION_PARAMS = dict(max_new_tokens=50, do_sample=False)

In [4]:
# Drop cached allocator blocks first, then report the free GPU memory (if any GPU).
torch.cuda.empty_cache()
if torch.cuda.is_available():
    free_bytes = torch.cuda.mem_get_info()[0]
    free_gib = free_bytes / 1024**3
    print(f"Free GPU memory: {free_gib:.2f} GB")

Free GPU memory: 23.30 GB


In [5]:
def load_model_and_processor(model_path, device):
    """Load the DeepSeek-VL model together with its chat processor and tokenizer.

    Args:
        model_path: Model identifier or path handed to ``from_pretrained``.
        device: Accepted for interface compatibility but not used in this
            function; the model is loaded with ``device_map="auto"``.

    Returns:
        Tuple ``(model, vl_chat_processor, tokenizer)``. The model has had
        ``.eval()`` called on it; the tokenizer is the processor's own.
    """
    print(f"Loading model: {model_path}")

    # The tokenizer is an attribute of the processor, so only one load is needed.
    chat_processor = VLChatProcessor.from_pretrained(model_path)

    # Weights are requested in bfloat16.
    load_kwargs = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    vl_model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
    vl_model = vl_model.eval()

    print(f"✓ Model loaded successfully")
    return vl_model, chat_processor, chat_processor.tokenizer

def generate_captions_batch(image_paths, model, processor, tokenizer, batch_size=2):
    """
    Generate one caption per image using DeepSeek-VL.

    Images are sent through the model one at a time. If an image fails, the
    error is printed and the placeholder caption "Error generating caption"
    is stored in its slot, so the returned list always lines up with
    ``image_paths``.

    Args:
        image_paths: List of image file paths
        model: DeepSeek-VL model
        processor: DeepSeek-VL processor
        tokenizer: Tokenizer from processor
        batch_size: Kept for backward compatibility; currently unused because
            every image is processed individually.

    Returns:
        List of caption strings, same length and order as ``image_paths``.
    """
    captions = []

    for img_path in image_paths:
        try:
            # Single-turn conversation: one image plus the captioning prompt,
            # followed by an empty assistant turn for the model to fill in.
            conversation = [
                {
                    "role": "User",
                    "content": "<image_placeholder>Generate a detailed caption for this image in one sentence.",
                    "images": [img_path]
                },
                {
                    "role": "Assistant",
                    "content": ""
                }
            ]

            # Load image
            pil_images = load_pil_images(conversation)

            # Prepare inputs
            prepare_inputs = processor(
                conversations=conversation,
                images=pil_images,
                force_batchify=True
            ).to(model.device)

            # Pure inference: disable gradient tracking so the embedding pass
            # does not record autograd state for every image.
            with torch.no_grad():
                # Get image embeddings
                inputs_embeds = model.prepare_inputs_embeds(**prepare_inputs)

                # Generate response. Decoding options come straight from
                # GENERATION_PARAMS so that any option added to the config
                # takes effect here without editing this function.
                outputs = model.language_model.generate(
                    inputs_embeds=inputs_embeds,
                    attention_mask=prepare_inputs.attention_mask,
                    pad_token_id=tokenizer.eos_token_id,
                    bos_token_id=tokenizer.bos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    use_cache=True,
                    **GENERATION_PARAMS
                )

            # Decode the response
            answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)

            # Clean up the caption
            captions.append(answer.strip())

        except Exception as e:
            # Deliberate best-effort: one bad image must not abort a long run.
            # NOTE(review): the placeholder is later scored like a real caption;
            # consider filtering it out before evaluation.
            print(f"Error processing {img_path}: {e}")
            captions.append("Error generating caption")

    return captions

def evaluate_coco(annotation_file, results_file):
    """Score a results file against COCO annotations with pycocoevalcap.

    Args:
        annotation_file: Path to the COCO caption annotation JSON.
        results_file: Path to the JSON file holding the generated captions.

    Returns:
        The evaluator's ``eval`` mapping of metric name to score.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("COCO Evaluation Results")
    print(rule)

    # References and predictions, both wrapped by the COCO API.
    ground_truth = COCO(annotation_file)
    predictions = ground_truth.loadRes(results_file)

    # Build the evaluator and compute every metric.
    evaluator = COCOEvalCap(ground_truth, predictions)
    evaluator.evaluate()

    # Report one line per metric.
    scores = evaluator.eval
    print("\nMetrics:")
    for name in scores:
        print(f"  {name:10s}: {scores[name]:.4f}")

    return scores

def to_serializable(obj):
    """Fallback for ``json.dump(default=...)``: unwrap numpy scalars.

    Returns the plain Python value of a numpy scalar (``np.generic``) and
    raises ``TypeError`` for any other object.
    """
    if not isinstance(obj, np.generic):
        raise TypeError(f"Type {type(obj)} not serializable")
    return obj.item()

In [6]:
# Build the model, its chat processor and the tokenizer once for the whole run
model, processor, tokenizer = load_model_and_processor(
    model_path=MODEL_PATH,
    device=DEVICE,
)

Loading model: deepseek-ai/deepseek-vl-7b-chat


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.19s/it]

✓ Model loaded successfully





In [7]:
# Index the caption annotations and collect every image id they contain
print(f"\nLoading COCO annotations from {ANNOTATION_FILE}")
coco = COCO(annotation_file=ANNOTATION_FILE)
img_ids = coco.getImgIds()
print(f"Total images: {len(img_ids)}")


Loading COCO annotations from /home/kezouke/Thesis/coco2014/annotations/captions_val2014.json
loading annotations into memory...
Done (t=0.11s)
creating index...
index created!
Total images: 40504


In [8]:
print(f"Generating captions for {len(img_ids)} images...")
print(f"Batch size: {BATCH_SIZE}\n")

results = []

# Pair every image id with the path of its image file
image_data_list = [
    (img_id, os.path.join(IMAGE_DIR, coco.loadImgs(img_id)[0]['file_name']))
    for img_id in img_ids
]

Generating captions for 40504 images...
Batch size: 16



In [9]:
print("Testing on first batch...")
# Caption the first batch (or fewer, if the dataset is smaller) as a quick check
test_batch_size = min(BATCH_SIZE, len(image_data_list))
test_paths = [path for _, path in image_data_list[:test_batch_size]]
test_captions = generate_captions_batch(test_paths, model, processor, tokenizer, BATCH_SIZE)
print(f"Sample captions from first batch:")
# Show at most three captions, numbered from 1
for rank, sample in enumerate(test_captions[:3], start=1):
    print(f"  {rank}. {sample}")

Testing on first batch...
Sample captions from first batch:
  1. A man rides a motorcycle on a dirt road with a bridge and mountains in the background.
  2. A person in a hairnet is using a large knife to cut a cake.
  3. A young boy stands in a field with several water buffalo, holding a colorful umbrella.


In [10]:
# Process all images in batches.
# Start from an empty list every time this cell runs: appending to a list left
# over from an earlier (partial or repeated) run would store more than one
# caption per image in the results.
results = []
for batch_idx in tqdm(range(0, len(image_data_list), BATCH_SIZE), desc="Processing batches"):
    batch_data = image_data_list[batch_idx:batch_idx+BATCH_SIZE]
    img_ids_batch = [x[0] for x in batch_data]
    paths_batch = [x[1] for x in batch_data]

    # Generate captions for batch (one caption per path, in the same order)
    captions = generate_captions_batch(
        paths_batch,
        model,
        processor,
        tokenizer,
        batch_size=len(paths_batch)
    )

    # Add to results in the COCO results format: one record per image
    results.extend(
        {"image_id": img_id, "caption": caption}
        for img_id, caption in zip(img_ids_batch, captions)
    )

Processing batches: 100%|██████████| 2532/2532 [7:31:23<00:00, 10.70s/it]  


In [11]:
# Write the generated captions to disk as a JSON array
print(f"\nSaving results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, mode='w') as results_fp:
    json.dump(results, fp=results_fp)
print("✓ Results saved")


Saving results to deepseek_vl_7b_coco_results.json
✓ Results saved


In [12]:
# Score the saved captions against the COCO reference captions
eval_scores = evaluate_coco(
    annotation_file=ANNOTATION_FILE,
    results_file=OUTPUT_FILE,
)


COCO Evaluation Results
loading annotations into memory...
Done (t=0.13s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.03s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 2492309 tokens at 2566483.18 tokens per second.
Nov 05, 2025 12:31:33 AM edu.stanford.nlp.process.PTBLexer next
PTBTokenizer tokenized 738522 tokens at 2048067.82 tokens per second.


setting up scorers...
clipscore is using cuda
computing Bleu score...
{'testlen': 640493, 'reflen': 498809, 'guess': [640493, 599989, 559485, 518981], 'correct': [371478, 175850, 74824, 30509]}
ratio: 1.2840445942234378
Bleu_1: 0.580
Bleu_2: 0.412
Bleu_3: 0.283
Bleu_4: 0.191
computing METEOR score...
METEOR: 0.272
computing Rouge score...
ROUGE_L: 0.477
computing CIDEr score...
CIDEr: 0.617
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.5 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.2

SPICE evaluation took: 4.674 min
SPICE: 0.222
computing CLIPScore score...


  0%|          | 0/159 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


  0%|          | 0/792 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


40504it [00:00, 122933.31it/s]


CLIPScore: 0.816
RefCLIPScore: 0.824

Metrics:
  Bleu_1    : 0.5800
  Bleu_2    : 0.4123
  Bleu_3    : 0.2833
  Bleu_4    : 0.1912
  METEOR    : 0.2721
  ROUGE_L   : 0.4772
  CIDEr     : 0.6170
  SPICE     : 0.2217
  CLIPScore : 0.8164
  RefCLIPScore: 0.8242


In [13]:

# Save scores
# Derive the scores path by swapping only the final extension of OUTPUT_FILE.
# str.replace('.json', ...) would rewrite every ".json" in the path and, for a
# name without ".json", would return OUTPUT_FILE unchanged, so the scores
# would overwrite the captions file.
scores_root, _ = os.path.splitext(OUTPUT_FILE)
scores_file = scores_root + '_scores.json'
with open(scores_file, 'w') as f:
    # to_serializable converts numpy scalars that json cannot encode natively
    json.dump(eval_scores, f, indent=2, default=to_serializable)

print(f"✓ Scores saved to {scores_file}")

✓ Scores saved to deepseek_vl_7b_coco_results_scores.json
