In [1]:
import json
from pathlib import Path
import os
import time

import pandas as pd
from tqdm import tqdm

from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    data_dir: str
    model_dir: str
    docling_model_dir: str
    
settings = Settings()
os.environ["HF_HOME"] = settings.docling_model_dir

In [2]:
import sys
sys.path.append("src")

from core.base.schema import Document
from core.reader import PDF2ImageReader
from core.embedder import LocalColpaliEngineEmbedder
from core.storage.docstore import InMemoryDocumentStore
from core.storage.vectorstore.qdrant.late_interaction import QdrantLateInteractionVectorStore

# 1. Read Documents

In [3]:
# testing on macOS
poppler_path = "/opt/homebrew/Cellar/poppler/25.01.0/bin"
reader = PDF2ImageReader(poppler_path=poppler_path)

In [4]:
pdf_dir = os.path.join(settings.data_dir, "allganize-RAG-Evaluation-Dataset-KO/finance")
pdf_fnames =[x for x in os.listdir(pdf_dir) if x.endswith(".pdf")]
print("num files:", len(pdf_fnames))
pdf_fnames[:10]

num files: 10


['★2019 제1회 증시콘서트 자료집_최종★.pdf',
 '240409(보도자료) 금융위 핀테크 투자 생태계 활성화 나선다.pdf',
 '2024년 3월_3. 향후 통화신용정책 방향.pdf',
 '133178946057443204_WP22-05.pdf',
 '240130(보도자료) 지방은행의 시중은행 전환시 인가방식 및 절차.pdf',
 '130292099630937500_KIFVIP2013-10.pdf',
 '2024년 3월_2. 통화신용정책 운영.pdf',
 '[별첨] 지방은행의 시중은행 전환시 인가방식 및 절차.pdf',
 '240320(보도자료) 금융권의 상생금융 추진현황.pdf',
 '한-호주 퇴직연금 포럼_책자(최종).pdf']

In [5]:
documents = []

for fname in tqdm(pdf_fnames[:2]):
    file_path = os.path.join(pdf_dir, pdf_fnames[0])
    document = reader.run(file_path)
    document.nodes = document.nodes[:3] ## test with only first 10 pages
    documents.append(document)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:17<00:00,  8.63s/it]


In [38]:
for node in document.nodes[:2]:
    print(type(node))

<class 'core.base.schema.ImageNode'>
<class 'core.base.schema.ImageNode'>


# 2. Store Documents

In [6]:
docstore = InMemoryDocumentStore()

In [7]:
docstore.add(documents)

In [8]:
docstore.count()

2

# 3. Split Documents
* each ImageNode into separate documents

In [9]:
chunks = []
for document in documents:
    source_id = document.id_
    for node in document.nodes:
        chunk = Document(
            nodes=[node],
            metadata={"source_id": source_id}
        )
        chunks.append(chunk)
print(len(chunks))

6


# 4. Embed Using Colpali

In [10]:
import itertools

In [11]:
import torch
from transformers import AutoTokenizer
from colpali_engine.models import ColIdefics3, ColIdefics3Processor

# Load Colpali engine
model_dir = os.path.join(
    settings.model_dir, "multimodal_retriever/colSmol-500M"
)

In [12]:
model = ColIdefics3.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    # device="mps",
    # device_map="mps",
    # attn_implementation="flash_attention_2" # or eager
).eval()
model.to("mps")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
processor = ColIdefics3Processor.from_pretrained(model_dir)

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


In [13]:
embedder = LocalColpaliEngineEmbedder(
    model=model,
    processor=processor,
    tokenizer=tokenizer
)

In [14]:
embeddings = []

embeddings = embedder.run(
    nodes=itertools.chain(*[x.nodes for x in documents]),
    batch_size=3
)

In [15]:
# (page_num, seq_len, embedding_dim)
print(len(embeddings))
print(len(embeddings[0]))
print(len(embeddings[0][0]))

6
871
128


# 5. Insert into VectorStore
* initialize qdrant in-memory
* insert documents

In [16]:
from qdrant_client import QdrantClient

# initialize client
client = QdrantClient(":memory:")
collection_name = "allganize-finance"

In [17]:
vector_store = QdrantLateInteractionVectorStore(
    collection_name=collection_name,
    client=client
)

In [18]:
from qdrant_client.http import models

embedding_dim = 128

# vector_store._client.create_collection(
vector_store.create(
    # collection_name=vector_store.collection_name,
    on_disk_payload=True,  # store the payload on disk
    vectors_config=models.VectorParams(
        size=embedding_dim,
        distance=models.Distance.COSINE,
        on_disk=True, # move original vectors to disk
        multivector_config=models.MultiVectorConfig(
            comparator=models.MultiVectorComparator.MAX_SIM
        ),
        quantization_config=models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(
                always_ram=True  # keep only quantized vectors in RAM
            ),
        ),
    ),
)

In [19]:
print(len(chunks), len(embeddings))

6 6


In [20]:
type(embeddings[0]), type(embeddings[0][0])

(list, list)

In [21]:
vector_store.add(
    documents=chunks,
    embeddings=embeddings
)

In [22]:
chunks[0].id_

'd01e8eb9-5d4f-4cd2-bf04-188ae7c61a17'

In [29]:
points = vector_store._client.retrieve(
    collection_name=vector_store.collection_name,
    ids=[chunks[0].id_],
    with_vectors=True
)

In [35]:
print(points[0].id)
print(points[0].payload)
print(len(points[0].vector))
print(len(points[0].vector[0]))

d01e8eb9-5d4f-4cd2-bf04-188ae7c61a17
{'source_id': '994bb7d8-63a4-4c51-a3cd-a1d497a136f8'}
871
128
