# allganize-RAG-Evaluation data + multimodal hybrid ingestion
## Methodology
```
1. Load Document Readers
    1-1. Load DoclingPDFReader
        1-1-1. Initialize Docling Converter
        1-1-2. Initialize PSIKing Reader
    1-2. Load PDF2ImageReader
2. Load PDF File Data
3. Ingest Data
    3-1. (Reader) PDF File -> PSIKing Document
    3-2. (Splitter) Chunk Documents
4. Embed
5. Insert into DocumentStore, VectorStore
    5-1. Insert to DocStore
    5-2. Insert to VectorStore
6. Test Query
```

## Settings
[Dataset]
* Real-life Korean finance PDF files from `allganize-RAG-Evaluation-Dataset-KO`
    * https://huggingface.co/datasets/allganize/RAG-Evaluation-Dataset-KO
    * use 10 'finance' domain files

[Embedder]
* Dense: `jina-embeddings-v4-vllm-retrieval` [[hf link]](https://huggingface.co/jinaai/jina-embeddings-v4-vllm-retrieval)
    * served using vLLM `v0.9.1` docker image
* Sparse: Qdrant/BM42 (`fastembed/sparse/all_miniLM_L6_v2_with_attentions` [[hf link]](https://huggingface.co/Qdrant/all_miniLM_L6_v2_with_attentions))

In [5]:
import json
import os

from pathlib import Path
import time
from typing import Any, Dict, List, Optional

import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm

from config import settings
# Artifacts should contain model weights downloaded using `docling-tools models download`
# Typically set to `~/.cache/docling/models`
# os.environ["DOCLING_ARTIFACTS_PATH"] = settings.docling_artifacts_path

In [6]:
## Import Core Schemas
from psiking.core.base.schema import Document, TextNode, ImageNode, TableNode

# 1. Load Chunks from DocumentStore

## 1-1. Load DocumentStore

In [7]:
from psiking.core.storage.docstore.in_memory import InMemoryDocumentStore

# Start from an empty in-memory document store; its contents are loaded from disk in the next cell
doc_store = InMemoryDocumentStore()

In [8]:
# Populate the store from the JSON file at this relative path
doc_store.load('storage/docstore_v2507.json')

## 1-2. Load Chunks

In [9]:
# Collect every stored document into a list.
# NOTE(review): this reads the store's private `_store` mapping directly.
chunks = [*doc_store._store.values()]
chunk_count = len(chunks)
print(chunk_count)

1032


In [10]:
chunks[0].metadata

{'reader': 'DoclingPDFReader',
 'source_id': '7373884a-8255-482d-9e7c-00b919083526',
 'domain': 'finance',
 'method': 'docling-pdf',
 'prov': '[{"page_no": 1, "bbox": {"l": 71.444, "t": 702.6370374023437, "r": 511.598, "b": 645.7080374023437, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 37]}]'}

# 4. Embed

## 4-1. Dense Embedding

### 4-1-1. Load Embedder

In [11]:
import asyncio
from tqdm.asyncio import tqdm

from psiking.core.embedder.vllm.online_jina_emb_v4 import VLLMOnlineJinaEmbV4Embedder

# Dense embedder client; the endpoint URL and the model name both come from `settings`
dense_embedder = VLLMOnlineJinaEmbV4Embedder(
    base_url=settings.multimodal_embedding_base_url,
    model=settings.multimodal_embedding_model
)

### 4-1-2. Embed Documents

In [12]:
import base64
from io import BytesIO
from PIL import Image

from bs4 import BeautifulSoup

def img_to_base64(img: Image.Image, format="PNG"):
    """Serialize an image and return the bytes as a Base64 string.

    Args:
        img: Image to serialize; it is written out through ``img.save``.
        format: Image format name forwarded to ``img.save``
            (for example ``"PNG"`` or ``"JPEG"``).

    Returns:
        The serialized image, Base64-encoded and decoded to ``str``.
        No ``data:`` URI prefix is added.
    """
    stream = BytesIO()
    img.save(stream, format=format)

    # getvalue() returns the whole buffer regardless of the stream position
    encoded = base64.b64encode(stream.getvalue())
    return encoded.decode("utf-8")

def prepare_text(doc: Document):
    """Build the plain-text representation of a chunk used for embedding.

    Only the first node of ``doc`` is inspected:

    * ``TextNode``: the node text is returned unchanged.
    * ``ImageNode``: a ``Caption: ...`` line followed by a
      ``Description: ...`` line, taken from the node's caption and text.
    * ``TableNode``: a ``Caption: ...`` line followed by a ``Table: ...``
      line holding the node's HTML text with the markup removed.

    Args:
        doc: Document whose first node is rendered as text.

    Returns:
        The rendered text, or an empty string for any other node type.
    """
    node = doc.nodes[0]

    if isinstance(node, TextNode):
        return node.text

    if isinstance(node, ImageNode):
        # A missing caption/description becomes '' so that the literal
        # word "None" never ends up in the embedded text.
        caption = node.caption or ''
        description = node.text or ''
        return f"Caption: {caption}\nDescription: {description}"

    if isinstance(node, TableNode):
        soup = BeautifulSoup(node.text or '', 'html.parser')
        # Without a separator, get_text() glues the text of adjacent cells
        # together ("AB" instead of "A B"), which merges tokens across cells.
        # strip=True trims each fragment and drops whitespace-only ones.
        table_text = soup.get_text(separator=' ', strip=True)
        caption = node.caption or ''
        return f"Caption: {caption}\nTable: {table_text}"

    return ''

async def dense_embed(semaphore, doc: Document):
    """Embed one document chunk with the module-level ``dense_embedder``.

    The request is a single user message. It always contains the chunk text
    from ``prepare_text`` prefixed with ``'Passage: '``. When the first node
    is an ``ImageNode`` or ``TableNode`` that carries image data, the node
    image is appended as a Base64 PNG data URL.

    Args:
        semaphore: Object usable with ``async with`` that limits how many
            chunks are processed at once (the notebook passes an
            ``asyncio.Semaphore``).
        doc: Document to embed; only its first node is used.

    Returns:
        Whatever ``dense_embedder.arun`` returns for the message.

    Raises:
        Exception: Any error from the embedder is printed together with the
            document id and then re-raised unchanged.
    """
    async with semaphore:
        # Build the payload only after a slot is acquired. Otherwise every
        # pending coroutine encodes its image up front and all Base64
        # payloads sit in memory at the same time.
        content = [
            {'type': 'text', 'text': 'Passage: ' + prepare_text(doc)},
        ]

        node = doc.nodes[0]
        if isinstance(node, (ImageNode, TableNode)) and node.image_data is not None:
            # Encode as PNG explicitly so it matches the data-URL MIME type
            image_base64 = img_to_base64(node.image, format="PNG")
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"}
                }
            )

        messages = [{'role': 'user', 'content': content}]

        try:
            embedding = await dense_embedder.arun(
                input=messages,
                input_format='messages',
                pool=True,
                normalize=True
            )
        except Exception as e:
            print(f"ERR DOC {doc.id_} {e}")
            # Bare raise keeps the original traceback intact
            raise

    return embedding

In [13]:
semaphore = asyncio.Semaphore(12)

tasks = []
for chunk in chunks:
    task = dense_embed(semaphore, chunk)
    tasks.append(task)

dense_embeddings = await tqdm.gather(*tasks)

100%|██████████| 1032/1032 [21:28<00:00,  1.25s/it]


In [14]:
# Convert each embedding to a plain Python list via its `tolist()` method
dense_embeddings = [vector.tolist() for vector in dense_embeddings]

In [15]:
# (num_chunks, embedding_dim): one pooled vector per chunk, since `dense_embed`
# calls the embedder with `pool=True` (recorded run: 1032 x 2048)
print(len(dense_embeddings))
print(len(dense_embeddings[0]))

1032
2048


In [16]:
type(dense_embeddings[0][0])

float

## 4-2. Sparse Embedding

### 4-2-1. Load Embedder

In [17]:
from fastembed import SparseTextEmbedding

# Local directory with the sparse model files, located under `settings.model_weight_dir`
sparse_model_dir = os.path.join(
    settings.model_weight_dir, "embedding/fastembed/sparse/all_miniLM_L6_v2_with_attentions"
)
# The listing is discarded (not the last expression of the cell), so this line
# only matters when the directory is missing, in which case it raises.
# NOTE(review): presumably a leftover sanity check; confirm it is still wanted.
os.listdir(sparse_model_dir)

# Load the BM42 sparse model from the local directory, on CPU and eagerly
sparse_embedding_model = SparseTextEmbedding(
    model_name="Qdrant/bm42-all-minilm-l6-v2-attentions",
    specific_model_path=sparse_model_dir,
    cuda=False,
    lazy_load=False
)

In [18]:
from psiking.core.embedder.fastembed.local_sparse import LocalFastEmbedSparseEmbedder

# Wrap the already-loaded fastembed model in the psiking sparse embedder
sparse_embedder = LocalFastEmbedSparseEmbedder(
    model=sparse_embedding_model
)

### 4-2-2. Embed Documents

In [19]:
# Same text rendering as the dense path, one string per chunk, in chunk order
sparse_inputs = list(map(prepare_text, chunks))

In [20]:
# Embed all chunk texts; the result is unpacked into sparse values and sparse indices
sparse_embedding_values, sparse_embedding_indices = sparse_embedder.run(
    sparse_inputs,
    batch_size=256
)

# 5. Insert into DocumentStore, VectorStore

## 5-2. Insert to VectorStore

In [21]:
from qdrant_client import QdrantClient
from psiking.core.storage.vectorstore.qdrant import QdrantSingleHybridVectorStore

# Initialize the Qdrant client.
# The commented-out line below is the in-memory alternative to the local server.
# client = QdrantClient(":memory:")
client = QdrantClient(host="localhost", port=6333)
collection_name = "allganize-finance-multimodal-hybrid-v2507_2"

# Hybrid vector store bound to the collection name and client above
vector_store = QdrantSingleHybridVectorStore(
    collection_name=collection_name,
    client=client
)

In [22]:
from qdrant_client.http import models

# Take the dense vector size from the embeddings actually produced (2048 in the recorded run)
dense_embedding_dim = len(dense_embeddings[0])

# Create the collection with one dense and one sparse vector configuration
vector_store.create_collection(
    on_disk_payload=True,  # store the payload on disk
    dense_vector_config = models.VectorParams(
        size=dense_embedding_dim,
        distance=models.Distance.COSINE,
        on_disk=True,  # keep the dense vectors on disk as well
        hnsw_config = {
            "m": 16,
            "ef_construct": 100,
        }
    ),
    sparse_vector_config = models.SparseVectorParams(
        modifier=models.Modifier.IDF, ## uses indices from bm42 embedder
    )
)

In [23]:
# Check the collection: fetch its info through the store's underlying client
# (private `_client` attribute) and dump it as indented JSON
collection_info = vector_store._client.get_collection(
    collection_name=vector_store.collection_name
)
print(collection_info.model_dump_json(indent=4))

{
    "status": "green",
    "optimizer_status": "ok",
    "vectors_count": null,
    "indexed_vectors_count": 0,
    "points_count": 0,
    "segments_count": 2,
    "config": {
        "params": {
            "vectors": {
                "vector_dense": {
                    "size": 2048,
                    "distance": "Cosine",
                    "hnsw_config": {
                        "m": 16,
                        "ef_construct": 100,
                        "full_scan_threshold": null,
                        "max_indexing_threads": null,
                        "on_disk": null,
                        "payload_m": null
                    },
                    "quantization_config": null,
                    "on_disk": true,
                    "datatype": null,
                    "multivector_config": null
                }
            },
            "shard_number": 1,
            "sharding_method": null,
            "replication_factor": 1,
            "write_consistency

In [24]:
# Insert all chunks with their dense and sparse embeddings.
# Only the listed metadata keys are passed as `metadata_keys`; the payload
# printed further down contains exactly these four (`reader` is absent).
vector_store.add(
    documents=chunks,
    dense_embeddings=dense_embeddings,
    sparse_embedding_values=sparse_embedding_values,
    sparse_embedding_indices=sparse_embedding_indices,
    metadata_keys=["source_id", "domain", 'prov', 'method']
)

In [25]:
chunks[0].id_

'b9ea81f3-b5ff-4b69-9a6b-d2fce3c07575'

In [26]:
# Sanity check: read the first chunk back by id, including its stored vectors
points = vector_store._client.retrieve(
    collection_name=vector_store.collection_name,
    ids=[chunks[0].id_],
    with_vectors=True
)

In [27]:
print(points[0].id)
print(points[0].payload)
# Recorded output is 2; presumably one entry per named vector (dense + sparse). TODO confirm
print(len(points[0].vector))

b9ea81f3-b5ff-4b69-9a6b-d2fce3c07575
{'source_id': '7373884a-8255-482d-9e7c-00b919083526', 'domain': 'finance', 'method': 'docling-pdf', 'prov': '[{"page_no": 1, "bbox": {"l": 71.444, "t": 702.6370374023437, "r": 511.598, "b": 645.7080374023437, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 37]}]'}
2


# 6. Test Query

In [28]:
import numpy as np
from psiking.core.storage.vectorstore.schema import (
    MetadataFilters,
    FilterOperator,
    VectorStoreQuery,
    VectorStoreQueryMode,
    VectorStoreQueryOptions,
)   

In [29]:
# Use random query embedding: this only smoke-tests the query path,
# the results are not a meaningful search
query_embedding = np.random.randn(dense_embedding_dim)

# The sparse part of the query is left empty.
# NOTE(review): the keyword is spelled `sparse_embedding_indicies` here, while
# `vector_store.add` uses `sparse_embedding_indices`; confirm this matches the
# field name declared on VectorStoreQuery.
vsquery=VectorStoreQuery(
    dense_embedding=query_embedding,
    sparse_embedding_values=[],
    sparse_embedding_indicies=[]
    
)
# Hybrid mode with 'rrf' fusion: 30 dense and 30 sparse candidates requested, top 10 returned
vsoptions=VectorStoreQueryOptions(
    mode=VectorStoreQueryMode.HYBRID,
    top_k=10,
    hybrid_fusion_method='rrf',
    sparse_top_k=30,
    dense_top_k=30
)

In [30]:
# Run the query; `points` is rebound here and replaces the earlier retrieve result
points = vector_store.query(
    query=vsquery,
    options=vsoptions
)

In [31]:
points

[ScoredPoint(id='d2b5b4dc-1ca3-4db7-b435-559dba7bbabd', version=15, score=0.5, payload={'source_id': '980889bb-16cd-447f-b5eb-1384b84903cc', 'domain': 'finance', 'method': 'docling-pdf', 'prov': '[{"page_no": 109, "bbox": {"l": 91.65, "t": 667.3470073242187, "r": 240.37, "b": 649.7340073242187, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 41]}]'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='fad9a112-5f27-47ce-80fa-35c258a92d11', version=9, score=0.33333334, payload={'source_id': '72b54f4b-7002-48ea-ad20-2c613d8360f6', 'domain': 'finance', 'method': 'docling-pdf', 'prov': '[{"page_no": 37, "bbox": {"l": 75.50782012939453, "t": 443.55621337890625, "r": 438.07318115234375, "b": 225.366455078125, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 0]}]'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='74ff5aed-de68-414b-9b42-37e898291b3d', version=10, score=0.25, payload={'source_id': 'bbd035d6-51a2-41ba-b913-8357d89b7852', 'domain': 'finance', 'method':

In [32]:
# Get Retrieved Result from docstore
# The id of the first returned point is used as the document store key
retrieved_doc_id = points[0].id

# The `get` result is indexed with [0] to obtain the document itself
retrieved_doc = doc_store.get(retrieved_doc_id)[0]

In [33]:
# Show how many nodes the retrieved document has, then its metadata as indented JSON
nodes = retrieved_doc.nodes
print(len(nodes))
print(json.dumps(retrieved_doc.metadata, indent=4))

1
{
    "reader": "DoclingPDFReader",
    "source_id": "980889bb-16cd-447f-b5eb-1384b84903cc",
    "domain": "finance",
    "method": "docling-pdf",
    "prov": "[{\"page_no\": 109, \"bbox\": {\"l\": 91.65, \"t\": 667.3470073242187, \"r\": 240.37, \"b\": 649.7340073242187, \"coord_origin\": \"BOTTOMLEFT\"}, \"charspan\": [0, 41]}]"
}
