In [None]:
!pip install colpali_engine==0.3.1

# I) Embed one picture

In [2]:
import torch
from transformers import AutoProcessor
from colpali_engine.models import ColPali, ColPaliProcessor
from PIL import Image

# Hub identifier used for both the adapter weights and the processor files.
model_name = "vidore/colpali"

# Load the base checkpoint in bfloat16 on the GPU and switch it to eval mode,
# then attach the adapter published under `model_name`.
model = ColPali.from_pretrained(
    "vidore/colpaligemma-3b-mix-448-base",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
model = model.eval()
model.load_adapter(model_name)

# Build the ColPali processor from the tokenizer and image processor that
# AutoProcessor resolves for the same repository.
autoprocessor = AutoProcessor.from_pretrained(model_name)
processor = ColPaliProcessor(
    image_processor=autoprocessor.image_processor,
    tokenizer=autoprocessor.tokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.33s/it]


In [25]:
# What does ColPali image embedding look like?

# Load the sample image. Forward slashes resolve on Windows as well as on
# Linux/macOS, so the notebook is not tied to one operating system.
image1 = Image.open("../data/bubble.png")

# Preprocess through the processor instance (same call style as
# `processor.process_queries` in the query cell) and move every tensor of the
# batch onto the device the model lives on.
processed_image = processor.process_images([image1])
processed_image = {k: v.to(model.device) for k, v in processed_image.items()}

# Inference only: no gradients are needed for embedding extraction.
with torch.no_grad():
    ps = model(**processed_image)

# Report the embedding's shape and dtype so the cell reproduces its recorded output.
print(f"Embedding shape: {ps.shape}")
print(f"Embedding type: {ps.dtype}")

Embedding shape: torch.Size([1, 1030, 128])
Embedding type: torch.bfloat16


# Embed a query

In [3]:
# Text queries to embed (a single one here).
queries = ["When did Apple release their 8-K"]
device = torch.device("cuda")

# Tokenize the queries and place each resulting tensor on the model's device.
batch_query = {
    name: tensor.to(model.device)
    for name, tensor in processor.process_queries(queries).items()
}

# Forward pass without gradient tracking; `qs` holds the query embeddings.
with torch.no_grad():
    qs = model(**batch_query)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [5]:
# Persist the query embeddings to "query.pt" in the current working directory.
torch.save(obj=qs, f="query.pt")

In [23]:
# Score the query embeddings (qs) against the image embeddings (ps).
# NOTE(review): colpali_engine describes `score` as a late-interaction
# similarity (higher = better match) rather than a distance — confirm against
# the installed version before interpreting the numbers.
processor.score(qs, ps)

tensor([[14.4375],
        [13.9375]])

In [6]:
# Inspect the dimensions of the query embedding tensor.
qs.shape

torch.Size([1, 32, 128])

# II) How do I embed an entire PDF?

In [2]:
from pdf2image import convert_from_path

def convert_pdf_to_images(
    pdf_path,
    poppler_path=r"C:\Program Files\poppler-24.07.0\Library\bin",
    batch_size=None,
):
    """Render every page of a PDF to an image and embed the pages with ColPali.

    Despite its name, this function returns the page *embeddings*, not the
    rendered images. It relies on the notebook-level `processor` and `model`
    objects created in the model-loading cell.

    Args:
        pdf_path: Path of the PDF file to render.
        poppler_path: Directory holding the Poppler binaries, forwarded to
            `pdf2image.convert_from_path`. Defaults to the Windows install
            location this notebook was written against; pass `None` to let
            pdf2image look Poppler up on PATH.
        batch_size: Maximum number of pages sent through the model per forward
            pass. `None` (default) embeds all pages in a single batch, which
            is the original behavior; a small value bounds GPU memory for
            long documents.

    Returns:
        The model output for all pages, concatenated along dimension 0 in
        page order (one entry per page), left on the model's device.

    Raises:
        ValueError: If `batch_size` is smaller than 1, or if no page could be
            rendered from the PDF.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be >= 1 or None, got {batch_size!r}")

    images = convert_from_path(pdf_path, poppler_path=poppler_path)
    if not images:
        raise ValueError(f"No pages could be rendered from {pdf_path!r}")

    # With batch_size=None a single chunk covers the whole document.
    step = len(images) if batch_size is None else batch_size

    page_embeddings = []
    with torch.no_grad():
        for start in range(0, len(images), step):
            # Instance call, consistent with `processor.process_queries` elsewhere.
            batch = processor.process_images(images[start:start + step])
            batch = {k: v.to(model.device) for k, v in batch.items()}
            page_embeddings.append(model(**batch))

    if len(page_embeddings) == 1:
        # Single batch: hand back the model output untouched.
        return page_embeddings[0]
    return torch.cat(page_embeddings, dim=0)
        
# Path is relative to the notebook's working directory, following the same
# "../data" layout used for the sample image, so the cell does not depend on
# one user's absolute home-directory path.
pdf_path = "../data/pdf/0000320193-23-000005.pdf"

# Despite the function's name, the result is the per-page embedding tensor.
embedding = convert_pdf_to_images(pdf_path)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [3]:
# Inspect the dimensions of the PDF page embeddings.
embedding.shape

torch.Size([4, 1030, 128])