In [1]:
import copy
import os
from typing import ClassVar, List, Optional, Tuple, Union

import torch
from PIL import Image
from transformers import BatchFeature
from transformers.models.qwen2_vl import Qwen2VLProcessor
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

from colpali_engine.utils.processing_utils import BaseVisualRetrieverProcessor
from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor

from config import settings




In [2]:
# class MultimodalColQwen_2_5_Processor(ColQwen2_5_Processor):
#     def process_images(self, images: List[Image.Image], context_prompts: Optional[List[str]] = None) -> BatchFeature:
#         """
#         Process images for ColQwen2.5.
#         """

#         if context_prompts:
#             texts_doc = context_prompts
#         else:
#             texts_doc = [self.visual_prompt_prefix] * len(images)
#         images = [image.convert("RGB") for image in images]

#         batch_doc = self(
#             text=texts_doc,
#             images=images,
#             padding="longest",
#             return_tensors="pt",
#         )

#         # NOTE: The following adjustment ensures correct behavior with DDP on multiple GPUs.
#         offsets = batch_doc["image_grid_thw"][:, 1] * batch_doc["image_grid_thw"][:, 2]  # (batch_size,)

#         # Split the pixel_values tensor into a list of tensors, one per image
#         pixel_values = list(
#             torch.split(batch_doc["pixel_values"], offsets.tolist())
#         )  # [(num_patches_image_0, pixel_values), ..., (num_patches_image_n, pixel_values)]

#         # Pad the list of pixel_value tensors to the same length along the sequence dimension
#         batch_doc["pixel_values"] = torch.nn.utils.rnn.pad_sequence(
#             pixel_values, batch_first=True
#         )  # (batch_size, max_num_patches, pixel_values)

#         return batch_doc


In [20]:
# Locations of the base Qwen2.5-VL checkpoint and of the ColNomic adapter,
# both resolved relative to the configured weight directory.
model_dir = os.path.join(
    settings.model_weight_dir, "Qwen2.5-VL-3B-Instruct"
)
adapter_dir = os.path.join(
    settings.model_weight_dir, "embedding/colnomic-embed-multimodal-3b"
)

# https://github.com/QwenLM/Qwen2.5-VL/issues/760#issuecomment-2657856186
model = ColQwen2_5.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    device_map="mps",
    attn_implementation="eager",
)

# NOTE(review): the recorded load output warns that `custom_text_proj.*` is
# newly initialized from the base checkpoint — confirm that the adapter loaded
# below actually provides those projection weights.
model.load_adapter(adapter_dir)

# Switch to eval mode only AFTER the adapter is attached, so that any modules
# created by `load_adapter` are put in inference mode as well.
# Assigning the result keeps the cell from echoing the model repr.
model = model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of ColQwen2_5 were not initialized from the model checkpoint at /Users/id4thomas/models/Qwen2.5-VL-3B-Instruct and are newly initialized: ['custom_text_proj.bias', 'custom_text_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
class NomicColQwen_2_5_Processor(ColQwen2_5_Processor):
    """ColQwen2.5 processor with a visual-token budget and optional context prompts.

    On top of the parent processor this class:

    * forces left padding on the tokenizer,
    * sets ``min_pixels`` / ``max_pixels`` on the image processor from
      ``max_num_visual_tokens``,
    * lets ``process_images`` take one custom text prompt per image instead of
      the class-level ``visual_prompt_prefix``.

    Args:
        *args: Forwarded unchanged to the parent constructor.
        max_num_visual_tokens: Upper bound used to derive ``max_pixels``
            (``max_num_visual_tokens * factor * factor``).
        **kwargs: Forwarded unchanged to the parent constructor.
    """

    def __init__(
        self,
        *args,
        max_num_visual_tokens: int = 768,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.tokenizer.padding_side = "left"

        self.max_num_visual_tokens = max_num_visual_tokens
        # presumably the pixel side length covered by one visual token — TODO confirm
        self.factor = 28
        self.min_pixels = 4 * self.factor * self.factor
        self.max_pixels = self.max_num_visual_tokens * self.factor * self.factor

        # Propagate the pixel budget to the image processor that does the resizing.
        self.image_processor.min_pixels = self.min_pixels
        self.image_processor.max_pixels = self.max_pixels

    def process_images(
        self,
        images: List[Image.Image],
        context_prompts: Optional[List[str]] = None,
    ) -> BatchFeature:
        """Tokenize prompts and preprocess images into a single batch.

        Args:
            images: Images to embed; each one is converted to RGB first.
            context_prompts: Optional full prompt string per image. When omitted
                (or empty), ``self.visual_prompt_prefix`` is used for every image.

        Returns:
            The ``BatchFeature`` produced by calling the processor, with
            ``pixel_values`` re-packed as a zero-padded
            ``(batch_size, max_num_patches, ...)`` tensor.

        Raises:
            ValueError: If ``context_prompts`` is given but its length differs
                from the number of images.
        """
        if context_prompts:
            if len(context_prompts) != len(images):
                raise ValueError(
                    "context_prompts must contain exactly one prompt per image: "
                    f"got {len(context_prompts)} prompts for {len(images)} images."
                )
            texts_doc = list(context_prompts)
        else:
            texts_doc = [self.visual_prompt_prefix] * len(images)

        rgb_images = [image.convert("RGB") for image in images]

        batch_doc = self(
            text=texts_doc,
            images=rgb_images,
            padding="longest",
            return_tensors="pt",
        )

        # Number of patch rows belonging to each image: product of columns 1 and 2
        # of image_grid_thw (column 0 is ignored here — presumably it is 1 for
        # still images; TODO confirm).
        offsets = batch_doc["image_grid_thw"][:, 1] * batch_doc["image_grid_thw"][:, 2]

        # Split the flat pixel_values tensor into one chunk per image ...
        per_image_pixels = list(torch.split(batch_doc["pixel_values"], offsets.tolist()))

        # ... and zero-pad the chunks to a common length so the batch has a
        # leading batch dimension. NOTE(review): the model is expected to strip
        # this padding again using image_grid_thw — confirm against the model code.
        batch_doc["pixel_values"] = torch.nn.utils.rnn.pad_sequence(
            per_image_pixels, batch_first=True
        )

        return batch_doc

In [22]:
# Build the processor from the files stored alongside the adapter weights.
# NOTE(review): the recorded output warns that `max_num_visual_tokens` from the
# processor config is unused, so __init__ presumably runs with its default (768)
# rather than the configured value — confirm this is intended.
processor = NomicColQwen_2_5_Processor.from_pretrained(adapter_dir)

Some kwargs in processor config are unused and will not have any effect: max_num_visual_tokens. 


In [23]:
# Prompt variant without any extra context, kept for reference:
# context_template = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"
# Prompt variant used below: `{}` is replaced with text related to the image.
context_template = '''<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe the image. The following is text related to the image: {}<|im_end|><|endoftext|>'''

# Two solid-colour dummy images of different sizes, so the batch contains
# images that differ in size.
images = [
    Image.new("RGB", (128, 128), color="white"),
    Image.new("RGB", (64, 32), color="black"),
]
# One context string per image (Korean for "random image 1" / "random image 2").
context_contents = [
    "랜덤한 이미지 1",
    "랜덤한 이미지 2",
]
# Fill the template once per image; displayed as the cell output.
context_prompts = [
    context_template.format(x) for x in context_contents
]
context_prompts

['<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image. The following is text related to the image: 랜덤한 이미지 1<|im_end|><|endoftext|>',
 '<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image. The following is text related to the image: 랜덤한 이미지 2<|im_end|><|endoftext|>']

In [24]:
# Preprocess the images together with their per-image prompts.
# Requires a processor whose `process_images` accepts `context_prompts`.
batch = processor.process_images(images=images, context_prompts=context_prompts)

In [25]:
# Inspect the processed batch. In the recorded output the shorter second
# sequence is padded on the left (tokenizer padding_side="left").
batch

{'input_ids': tensor([[151644,    872,    198, 151652, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151653,  74785,    279,   2168,     13,    576,   2701,
            374,   1467,   5435,    311,    279,   2168,     25,   5140,    252,
            250, 144452,  23573,  90667,  21329,    220,     16, 151645, 151643],
        [151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151644,    872,    198, 151652, 151655, 151655, 151655, 151655,
         151655, 151655, 151653,  74785,    279,   2168,     13,    576,   2701,
            374,   1467,   5435,    311,    279,   2168,     25,   5140,    252,
            250, 144452,  23573,  90667,  21329,    220,     17, 151645, 151643]]), 'attention

In [26]:
# Forward pass without gradient tracking. The batch is moved to whatever device
# the model lives on, so the device is configured in exactly one place.
with torch.no_grad():
    image_embeddings = model(**batch.to(model.device))

In [29]:
# Recorded shape is (2, 54, 128): 2 images, 54 positions (the padded
# input_ids length shown above), and a last dimension of 128.
image_embeddings.shape

torch.Size([2, 54, 128])