# allganize-RAG-Evaluation data + multimodal hybrid search
## Methodology
1. Read PDF files with `Reader`
    * Try `DoclingPDFReader` with `PDF2ImageReader` as fallback
2. Chunk `Document` into single-node `Document`
3. Embed chunk `Document` instances
    * dense: `Visualized_BGE`
    * sparse
4. Insert into `QdrantSingleHybridVectorStore` vector store
5. Test retrieval with queries

## Setting
* parser:
    * IBM [Docling](https://github.com/DS4SD/docling) v2.22.0
    * docling-v2 pdf parser backend
* dense embedding model: `baai/bge-visualized` (bge-m3 weight)
    * https://huggingface.co/BAAI/bge-visualized
* data: real-life pdf files from `allganize-RAG-Evaluation-Dataset-KO`
    * https://huggingface.co/datasets/allganize/RAG-Evaluation-Dataset-KO
    * use 10 'finance' domain PDF files

In [1]:
import json
from pathlib import Path
import time
from typing import Any, Dict, List, Optional

import jsonlines
import pandas as pd
from tqdm import tqdm

from config import settings

In [2]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
core_src_dir = os.path.join(parent_dir, "src/psiking")
sys.path.append(core_src_dir)

In [3]:
## Import Core Schemas
from core.base.schema import Document, TextNode, ImageNode, TableNode

# 1. Read Data
* 10 pdf files
* try conversion with docling -> use pdf2image as fallback

In [None]:
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions
)
from core.reader.pdf.docling_reader import DoclingPDFReader

format_options = PdfPipelineOptions()
format_options.images_scale = 1.5
format_options.generate_page_images = True
format_options.generate_picture_images = True

format_options.do_ocr = False
format_options.do_table_structure = True

# Image description
print("VLM MODEL:", settings.vlm_model)

# Use VLM for image description (ImageNode.text)
image_description_options = PictureDescriptionApiOptions(
    url=f"{settings.vlm_base_url}/v1/chat/completions",
    params=dict(
        model=settings.vlm_model,
        seed=42,
        max_completion_tokens=512,
        temperature=0.9
    ),
    prompt="이미지에 대해 3줄 정도로 자세히 설명해 주세요. 이미지에 정보가 없다면 설명 텍스트를 작성하지 않습니다",
    timeout=90,
    bitmap_area_threshold=0.05 # 5% of page area
)
format_options.do_picture_description = True
format_options.picture_description_options = image_description_options

docling_reader = DoclingPDFReader()

In [None]:
from core.reader import PDF2ImageReader

# testing on macOS, provide poppler path manually
poppler_path = "/opt/homebrew/Cellar/poppler/25.01.0/bin"
pdf2img_reader = PDF2ImageReader(poppler_path=poppler_path)

In [5]:
# PDF File directory
pdf_dir = os.path.join(settings.data_dir, "allganize-RAG-Evaluation-Dataset-KO/finance")
pdf_fnames =[x for x in os.listdir(pdf_dir) if x.endswith(".pdf")]
print("num files:", len(pdf_fnames))
pdf_fnames[:10]

num files: 10


['★2019 제1회 증시콘서트 자료집_최종★.pdf',
 '240409(보도자료) 금융위 핀테크 투자 생태계 활성화 나선다.pdf',
 '2024년 3월_3. 향후 통화신용정책 방향.pdf',
 '133178946057443204_WP22-05.pdf',
 '240130(보도자료) 지방은행의 시중은행 전환시 인가방식 및 절차.pdf',
 '130292099630937500_KIFVIP2013-10.pdf',
 '2024년 3월_2. 통화신용정책 운영.pdf',
 '[별첨] 지방은행의 시중은행 전환시 인가방식 및 절차.pdf',
 '240320(보도자료) 금융권의 상생금융 추진현황.pdf',
 '한-호주 퇴직연금 포럼_책자(최종).pdf']

In [None]:
# Convert pages to image
documents = []
docling_failed_fnames = []
pdf2img_failed_fnames = []
for doc_i, fname in tqdm(enumerate(pdf_fnames[:3])):
    file_path = os.path.join(pdf_dir, fname)
    extra_info = {
        "source_id": f"allganize-RAG-Evaluation-Dataset-KO/finance/{doc_i}", # arbitrary id
        "domain": "finance",
        "source_file": fname
    }
    try:
        document = docling_reader.run(
            file_path,
            extra_info=extra_info
        )
        documents.append(document)
        continue
    except Exception as e:
        print("[DOCLING READER] failed {} - {}".format(fname, str(e)))
        docling_failed_fnames.append(fname)
    
    try:
        document = pdf2img_reader.run(
            file_path,
            extra_info=extra_info
        )
        documents.append(document)
    except Exception as e:
        print("[PDF2IMG READER] failed {} - {}".format(fname, str(e)))
        pdf2img_failed_fnames.append(fname)
    
for node in document.nodes[:3]:
    print(type(node))

3it [00:51, 17.05s/it]

<class 'core.base.schema.TextNode'>
<class 'core.base.schema.TextNode'>
<class 'core.base.schema.TableNode'>





In [7]:
document.metadata

{'source_id': 'allganize-RAG-Evaluation-Dataset-KO/finance/2',
 'domain': 'finance',
 'source_file': '2024년 3월_3. 향후 통화신용정책 방향.pdf'}

In [8]:
# image = document.nodes[0].image

# # Crop to half
# width, height = image.size
# left_half = image.crop((0, 0, width, height//2))
# left_half

# 2. Process Document into Chunks
1. merge text nodes with `TextNodeMerger`
2. split texts into chunks with `LangchainRecursiveCharacterTextSplitter`

In [9]:
from core.processor.document.text_merger import TextNodeMerger
# Split Documents page-level
merger = TextNodeMerger()

merged_documents = []
for document in documents:
    merged_document = merger.run(document)
    merged_documents.append(merged_document)

In [10]:
# merged_documents[0]
merged_documents[0].nodes[0]

TextNode(id_='d8480416-eacf-429d-819e-cd0087504758', metadata={'page_no': 1}, text_type=<TextType.PLAIN: 'plain'>, label=<TextLabel.PLAIN: 'plain'>, resource=MediaResource(data=None, text='증권사 리서치센터장, 자산운용사 대표와 함께하는 제1회 증시 콘서트\n2019 하반기 증시 대전망\n|\xa0일\xa0시\xa0| 2019.\xa07.\xa02\xa0(화)\xa014:30\n|\xa0장\xa0소\xa0| 금융투자협회\xa03층\xa0불스홀', path=None, url=None, mimetype=None))

In [11]:
# 3. Run Splitter
from core.splitter.text.langchain_text_splitters import LangchainRecursiveCharacterTextSplitter

splitter = LangchainRecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap = 128
)

chunks = []
for document in merged_documents:
    document_chunks = []
    source_id = document.id_
    for i, node in enumerate(document.nodes):
        # Run Splitter
        if isinstance(node, TextNode):
            try:
                split_nodes = splitter.run(node)
            except Exception as e:
                print(i, node)
                print(str(e))
                raise e
        else:
            split_nodes = [node]
        
        # Create New Document
        for split_node in split_nodes:
            # Each Document contains single node
            chunk = Document(
                nodes=[split_node],
                metadata={
                    "source_id": source_id,
                    "domain": document.metadata["domain"],
                    "source_file": document.metadata['source_file'],
                }
            )
        document_chunks.append(chunk)
    chunks.extend(document_chunks)
print(len(chunks))

258


# 3. Format Text (Prepare Embedding Input)

In [None]:
from core.formatter.document.simple import SimpleTextOnlyFormatter

# use default templates
formatter = SimpleTextOnlyFormatter()
formatted_texts = formatter.run(chunks)

def select_embedding_input_idxs(texts: str, min_length: int = 20):
    return [i for i, x in enumerate(texts) if len(x.strip())>min_length]

embedding_input_idxs = select_embedding_input_idxs(
    texts=formatted_texts,
    min_length=20
)
print(len(embedding_input_idxs))

# 4. Embed Using ColPali

## 4-1. Dense Embedding VisualizedBGE

In [None]:
## Load Model
import torch
from visual_bge.modeling import Visualized_BGE

# Load Colpali engine
bge_m3_model_dir = os.path.join(
    settings.model_weight_dir, "bge-m3"
)
visualized_model_dir = os.path.join(
    settings.model_weight_dir, "baai-bge-visualized/Visualized_m3.pth"
)

dense_embedding_model = Visualized_BGE(
    model_name_bge = bge_m3_model_dir,
    model_weight= visualized_model_dir
)
dense_embedding_model.eval()
print("Loaded Dense Embedding Model")
dense_embedding_model.dtype



Loaded Model


In [None]:
from core.embedder.flagembedding import (
    VisualizedBGEInput, 
    LocalVisualizedBGEEmbedder
)
dense_embedder = LocalVisualizedBGEEmbedder(
    model=dense_embedding_model
)

In [None]:
def prepare_visualized_bge_input(chunk: Document):
    # Single 
    node = chunk.nodes[0]
    if isinstance(node, TextNode):
        return VisualizedBGEInput(
            text=node.text
        )
    elif isinstance(node, ImageNode) or isinstance(node, TableNode):
        return VisualizedBGEInput(
            text="[Caption] {} [Text] {}".format(
                node.caption, node.text
            ),
            image=node.image
        )
    else:
        raise ValueError("Unknown node type error {}".format(type(node)))
visualized_bge_inputs = [prepare_visualized_bge_input(x) for x in chunks]

In [None]:
dense_embeddings = dense_embedder.run(visualized_bge_inputs, batch_size = 4, disable_tqdm=False)

100%|██████████| 35/35 [02:48<00:00,  4.81s/it]
100%|██████████| 30/30 [00:42<00:00,  1.41s/it]


In [None]:
# (num_chunks, seq_len, embedding_dim)
print(len(dense_embeddings))
print(len(dense_embeddings[0]))

258
1024


## 4-2. Sparse Embedding
* Embed using BM42 Sparse embedder model
    * https://huggingface.co/Qdrant/all_miniLM_L6_v2_with_attentions

### Loading model from pre-downloaded directory
* Load model using 'specific model path'
    * specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
    * download_model method skips download phase (available > v0.5.1 )
        * https://github.com/qdrant/fastembed/blob/a931f143ef3543234bc9d8d0c305496c67199972/fastembed/common/model_management.py#L367
    * build from source with commit `a931f143ef3543234bc9d8d0c305496c67199972`
* cache_dir: cache_dir (str, optional): The path to the cache directory.
    Can be set using the `FASTEMBED_CACHE_PATH` env variable.
    Defaults to `fastembed_cache` in the system's temp directory.
```
cd poetry
poetry build
pip install --force-reinstall fastembed-0.5.1-py3-none-any.whl
```

In [None]:
os.environ["FASTEMBED_CACHE_PATH"] = str(os.path.join(os.getcwd(), "fastembed"))
print(os.environ["FASTEMBED_CACHE_PATH"])
sparse_model_dir = os.path.join(settings.model_weight_dir, "fastembed/sparse/all_miniLM_L6_v2_with_attentions")
os.listdir(sparse_model_dir)

In [None]:
# Load fastembed model
from fastembed import SparseTextEmbedding

# test specific_model_path function
downloaded_dir = SparseTextEmbedding.download_model(
    model={},
    cache_dir=os.environ["FASTEMBED_CACHE_PATH"],
    specific_model_path=sparse_model_dir,
)
print(downloaded_dir)

sparse_model = SparseTextEmbedding(
    model_name="Qdrant/bm42-all-minilm-l6-v2-attentions",
    specific_model_path=sparse_model_dir,
    cuda=False,
    lazy_load=False
)

test_embeddings = list(sparse_model.embed(["hi"]))
print(test_embeddings)
test_embeddings[0].values.tolist(), test_embeddings[0].indices.tolist()

In [None]:
# Load Embedder
from core.embedder.fastembed.local_sparse import LocalFastEmbedSparseEmbedder

sparse_embedder = LocalFastEmbedSparseEmbedder(
    model=sparse_model
)

sparse_embedding_values, sparse_embedding_indices = sparse_embedder.run(
    embedding_inputs, batch_size=256
)

# 4. Insert into VectorStore
* intialize qdrant in-memory

In [None]:
from qdrant_client import QdrantClient
from core.storage.vectorstore.qdrant import QdrantSingleHybridVectorStore


# initialize client
client = QdrantClient(":memory:")
collection_name = "allganize-finance"

vector_store = QdrantSingleHybridVectorStore(
    collection_name=collection_name,
    client=client
)

In [None]:
## Create Collection
from qdrant_client.http import models

# bge-m3 1024 dim
dense_embedding_dim=1024
dense_vectors_config = models.VectorParams(
    size=dense_embedding_dim,
    distance=models.Distance.COSINE,
    on_disk=True,
)

# Sparse BM42 Embedding
sparse_vectors_config = models.SparseVectorParams(
    modifier=models.Modifier.IDF, ## uses indices from bm42 embedder
)

# Create VectorStore
vector_store.create_collection(
    dense_vector_config=dense_vectors_config,
    sparse_vector_config=sparse_vectors_config,
    on_disk_payload=True,
)

# Create Index
vector_store.create_index(
    field_name="text",
    field_schema=models.TextIndexParams(
        type="text",
        tokenizer=models.TokenizerType.MULTILINGUAL,
    ),
)

In [None]:
vector_store.add(
    documents=[chunks[x] for x in embedding_input_idxs],
    texts=embedding_inputs,
    dense_embeddings=dense_embeddings,
    sparse_embedding_values=sparse_embedding_values,
    sparse_embedding_indices=sparse_embedding_indices,
    metadata_keys=["source_file", "source_id", "title"]
)

In [None]:
# check collection
collection_info = vector_store._client.get_collection(
    collection_name=vector_store.collection_name
)
print(collection_info.model_dump_json(indent=4))

In [22]:
chunks[0].id_

'6f2b2144-2458-43ef-8674-2a58cd847ffb'

In [23]:
points = vector_store._client.retrieve(
    collection_name=vector_store.collection_name,
    ids=[chunks[0].id_],
    with_vectors=True
)

In [25]:
print(points[0].id)
print(points[0].payload)
print(len(points[0].vector))

6f2b2144-2458-43ef-8674-2a58cd847ffb
{'source_id': '780f8c46-3ce0-4f03-8939-7c893b65ab1e', 'source_file': '★2019 제1회 증시콘서트 자료집_최종★.pdf'}
1024
