# allganize-RAG-Evaluation data + multimodal search
## Methodology
1. 

## Env
* model: `baai/bge-visualized` (bge-m3 weight)
* data: real-life pdf files from `allganize-RAG-Evaluation-Dataset-KO`
    * https://huggingface.co/datasets/allganize/RAG-Evaluation-Dataset-KO
    * use 10 'finance' domain files

In [1]:
import json
from pathlib import Path
import time
from typing import Any, Dict, List, Optional

import jsonlines
import pandas as pd
from tqdm import tqdm

from config import settings

In [2]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
core_src_dir = os.path.join(parent_dir, "src/psiking")
sys.path.append(core_src_dir)

In [3]:
## Import Core Schemas
from core.base.schema import Document, TextNode, ImageNode, TableNode

# 1. Read Data
* 10 pdf files, convert to image with pdf2image

## 1-1. Load DoclingPDFReader

In [4]:
from pydantic import BaseModel

from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    TableStructureOptions,
    TableFormerMode
)
from docling.document_converter import DocumentConverter, PdfFormatOption

from core.reader.pdf.docling_reader import DoclingPDFReader

from src.docling_vllm_picture_description_pipeline import (
    VLLMPictureDescriptionApiOptions,
    VLLMPictureDescriptionPdfPipeline
)

# Initialize format options
format_options = PdfPipelineOptions()

format_options.accelerator_options = AcceleratorOptions(device="mps")

format_options.images_scale = 1.5
format_options.generate_page_images = True
format_options.generate_picture_images = True
format_options.do_ocr = False

# Image description
print("VLM MODEL:", settings.vlm_model)

DESCRIPTION_INSTRUCTION = '''주어진 이미지에대해 2가지 정보를 반환합니다.
* description: 최대 2문장 정도로 이미지에 대한 간결한 설명
* text: 이미지 내에서 인식된 모든 텍스트
다음 JSON 형식으로 반환하세요 {"description": str, "text": str}'''

class ImageDescription(BaseModel):
    description: str
    text: str

image_description_options = VLLMPictureDescriptionApiOptions(
    url=f"{settings.vlm_base_url}/v1/chat/completions",
    params=dict(
        model=settings.vlm_model,
        seed=42,
        max_completion_tokens=512,
        temperature=0.9,
        extra_body={"guided_json": ImageDescription.model_json_schema()}
    ),
    # prompt="이미지에 대해 최대 2문장 정도로 설명하고 있는 텍스트를 모두 추출하세요. 이미지에 정보가 없다면 설명 텍스트를 작성하지 않습니다. 인식 텍스트와 설명만 반환하세요.",
    prompt=DESCRIPTION_INSTRUCTION,
    batch_size=6, # Not implemented inside
    scale=0.9,
    timeout=90,
    min_coverage_area_threshold=0.01,
    bitmap_area_threshold=0.1 # 10% of page area
)

format_options.do_picture_description = True
# format_options.do_picture_description = False
format_options.enable_remote_services = True
format_options.picture_description_options = image_description_options

# TableStructure
format_options.do_table_structure = True
format_options.table_structure_options = TableStructureOptions(mode=TableFormerMode.ACCURATE)

# Initialize Converter
converter = DocumentConverter(
    allowed_formats = [
        InputFormat.PDF,
    ],
    format_options = {
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VLLMPictureDescriptionPdfPipeline,
            pipeline_options = format_options,
            backend = DoclingParseV2DocumentBackend
        )
    }
)

VLM MODEL: Qwen2.5-VL-3B-Instruct


In [5]:
# initalize reader
reader = DoclingPDFReader(converter=converter)

## 1-2. Load PDF fnames, run

In [6]:
# PDF File directory
pdf_dir = os.path.join(settings.data_dir, "allganize-RAG-Evaluation-Dataset-KO/finance")
pdf_fnames =[x for x in os.listdir(pdf_dir) if x.endswith(".pdf")]
print("num files:", len(pdf_fnames))
pdf_fnames[:10]

num files: 10


['★2019 제1회 증시콘서트 자료집_최종★.pdf',
 '240409(보도자료) 금융위 핀테크 투자 생태계 활성화 나선다.pdf',
 '2024년 3월_3. 향후 통화신용정책 방향.pdf',
 '133178946057443204_WP22-05.pdf',
 '240130(보도자료) 지방은행의 시중은행 전환시 인가방식 및 절차.pdf',
 '130292099630937500_KIFVIP2013-10.pdf',
 '2024년 3월_2. 통화신용정책 운영.pdf',
 '[별첨] 지방은행의 시중은행 전환시 인가방식 및 절차.pdf',
 '240320(보도자료) 금융권의 상생금융 추진현황.pdf',
 '한-호주 퇴직연금 포럼_책자(최종).pdf']

In [7]:
# Convert pages to image
documents = []
failed_fnames = []
for doc_i, fname in tqdm(enumerate(pdf_fnames[:3])):
    file_path = os.path.join(pdf_dir, fname)
    try:
        document = reader.run(
            file_path,
            extra_info = {
                "source_id": f"allganize-RAG-Evaluation-Dataset-KO/finance/{doc_i}", # arbitrary id
                "domain": "finance",
                "source_file": fname
            }
        )
    except Exception as e:
        print("[READER] failed {} - {}".format(fname, str(e)))
        failed_fnames.append(fname)
        continue
    document.nodes = document.nodes
    documents.append(document)
    
for node in document.nodes[:3]:
    print(type(node))

0it [00:00, ?it/s]

NUM IMAGES TO ANNOTATE 4
NUM IMAGES TO ANNOTATE 11
NUM IMAGES TO ANNOTATE 10
NUM IMAGES TO ANNOTATE 13
NUM IMAGES TO ANNOTATE 16
NUM IMAGES TO ANNOTATE 16
NUM IMAGES TO ANNOTATE 16
NUM IMAGES TO ANNOTATE 16
NUM IMAGES TO ANNOTATE 7


1it [01:32, 92.93s/it]

NUM IMAGES TO ANNOTATE 3


2it [01:40, 42.95s/it]

NUM IMAGES TO ANNOTATE 14


3it [02:14, 44.77s/it]

<class 'core.base.schema.TextNode'>
<class 'core.base.schema.TextNode'>
<class 'core.base.schema.TableNode'>





In [8]:
document.metadata

{'source_id': 'allganize-RAG-Evaluation-Dataset-KO/finance/2',
 'domain': 'finance',
 'source_file': '2024년 3월_3. 향후 통화신용정책 방향.pdf'}

In [9]:
# image = document.nodes[0].image

# # Crop to half
# width, height = image.size
# left_half = image.crop((0, 0, width, height//2))
# left_half

# 2. Process Document into Chunks
1. merge text nodes with `TextNodeMerger`
2. split texts into chunks with `LangchainRecursiveCharacterTextSplitter`

In [10]:
from core.processor.document.text_merger import TextNodeMerger
# Split Documents page-level
merger = TextNodeMerger()

merged_documents = []
for document in documents:
    merged_document = merger.run(document)
    merged_documents.append(merged_document)

In [11]:
# merged_documents[0]
merged_documents[0].nodes[0]

TextNode(id_='68f777e2-a01b-4e2e-b8bb-f618c8d5c2ef', metadata={'page_no': 1}, text_type=<TextType.PLAIN: 'plain'>, label=<TextLabel.PLAIN: 'plain'>, resource=MediaResource(data=None, text='증권사 리서치센터장, 자산운용사 대표와 함께하는 제1회 증시 콘서트\n2019 하반기 증시 대전망\n|\xa0일\xa0시\xa0| 2019.\xa07.\xa02\xa0(화)\xa014:30\n|\xa0장\xa0소\xa0| 금융투자협회\xa03층\xa0불스홀', path=None, url=None, mimetype=None))

In [None]:
# 3. Run Splitter
from core.splitter.text.langchain_text_splitters import LangchainRecursiveCharacterTextSplitter

splitter = LangchainRecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap = 128
)

chunks = []
for document in merged_documents:
    document_chunks = []
    source_id = document.id_
    for i, node in enumerate(document.nodes):
        # Run Splitter
        if isinstance(node, TextNode):
            try:
                split_nodes = splitter.run(node)
            except Exception as e:
                print(i, node)
                print(str(e))
                raise e
        else:
            split_nodes = [node]
        
        # Create New Document
        for split_node in split_nodes:
            # Each Document contains single node
            chunk = Document(
                nodes=[split_node],
                metadata={
                    "source_id": source_id,
                    "domain": document.metadata["domain"],
                    "source_file": document.metadata['source_file'],
                }
            )
            document_chunks.append(chunk)
    chunks.extend(document_chunks)
print(len(chunks))

265


# 3. Embed Using ColPali

In [13]:
## Load Model
import torch
from visual_bge.modeling import Visualized_BGE

# Load Colpali engine
bge_m3_model_dir = os.path.join(
    settings.model_weight_dir, "bge-m3"
)
visualized_model_dir = os.path.join(
    settings.model_weight_dir, "baai-bge-visualized/Visualized_m3.pth"
)

model = Visualized_BGE(
    model_name_bge = bge_m3_model_dir,
    model_weight= visualized_model_dir
)
model.eval()
print("Loaded Model")



Loaded Model


In [14]:
from core.embedder.flagembedding import (
    VisualizedBGEInput, 
    LocalVisualizedBGEEmbedder
)
embedder = LocalVisualizedBGEEmbedder(
    model=model
)

In [15]:
model.dtype

torch.float32

In [16]:
def prepare_embedding_input(chunk: Document):
    # Single 
    node = chunk.nodes[0]
    if isinstance(node, TextNode):
        return VisualizedBGEInput(
            text=node.text
        )
    elif isinstance(node, ImageNode) or isinstance(node, TableNode):
        return VisualizedBGEInput(
            text="{} {}".format(
                node.caption, node.text
            ),
            image=node.image
        )
        
inputs = [prepare_embedding_input(x) for x in chunks]

In [17]:
embeddings = embedder.run(inputs, batch_size = 4, disable_tqdm=False)

100%|██████████| 36/36 [05:36<00:00,  9.36s/it]
100%|██████████| 31/31 [00:48<00:00,  1.55s/it]


In [18]:
# (num_chunks, seq_len, embedding_dim)
print(len(embeddings))
print(len(embeddings[0]))

265
1024


# 4. Insert into VectorStore
* intialize qdrant in-memory

In [19]:
from qdrant_client import QdrantClient
from core.storage.vectorstore.qdrant import QdrantSingleVectorStore

# initialize client
client = QdrantClient(":memory:")
collection_name = "allganize-finance"

vector_store = QdrantSingleVectorStore(
    collection_name=collection_name,
    client=client
)

In [20]:
from qdrant_client.http import models

embedding_dim = 1024

vector_store.create_collection(
    on_disk_payload=True,  # store the payload on disk
    vectors_config = models.VectorParams(
        size=embedding_dim,
        distance=models.Distance.COSINE,
        on_disk=True,
    )
)

In [21]:
vector_store.add(
    documents=chunks,
    embeddings=embeddings,
    metadata_keys=["source_file", "source_id", "title"]
)

In [22]:
chunks[0].id_

'6ccd0e26-ef12-43f9-a7b0-56947c9898c8'

In [23]:
points = vector_store._client.retrieve(
    collection_name=vector_store.collection_name,
    ids=[chunks[0].id_],
    with_vectors=True
)

In [24]:
print(points[0].id)
print(points[0].payload)
print(len(points[0].vector))

6ccd0e26-ef12-43f9-a7b0-56947c9898c8
{'source_id': 'c1f7a40f-1422-41a0-b443-4729ee0a31c4', 'source_file': '★2019 제1회 증시콘서트 자료집_최종★.pdf'}
1024
