# allganize-RAG-Evaluation - retrieval performance
## Methodology
```
1. Load DocStore, VectorStore
    1-1. Load DocStore
    1-2. Load VectorStore
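2. Initialize Embedder
    2-1. Dense Embedder
    2-2. Sparse Embedder
3. Load Evaluation Data
    3-1. Load Query & Ground Truth
    3-2. Calculate Query Embeddings
4. Prepare Evaluation (retrieval function, metrics)
5. Run Retrieval & Evaluate - Sparse
6. Run Retrieval & Evaluate - Dense
7. Run Retrieval & Evaluate - Hybrid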
```


In [1]:
import json
import os

from config import settings

In [2]:
# Indexing variant under evaluation; it is interpolated into the Qdrant
# collection name further down.
INDEXING_MODE = "multimodal-hybrid"

# 1. Load DocStore, VectorStore

## 1-1. Load DocStore

In [3]:
from psiking.core.storage.docstore.in_memory import InMemoryDocumentStore

In [4]:
# Restore the persisted document store; later cells look up chunk metadata
# in it by chunk id.
DOCSTORE_PATH = "storage/docstore_v2507.json"
doc_store = InMemoryDocumentStore()
doc_store.load(DOCSTORE_PATH)

## 1-2. Load VectorStore

In [5]:
from qdrant_client import QdrantClient
from psiking.core.storage.vectorstore.qdrant import QdrantSingleVectorStore

# Connect to the Qdrant server on localhost:6333.
# For an in-process instance, use QdrantClient(":memory:") instead.
collection_name = f"allganize-finance-{INDEXING_MODE}-v2507"
client = QdrantClient(host="localhost", port=6333)

# Vector store wrapper bound to the collection of the selected indexing mode.
vector_store = QdrantSingleVectorStore(client=client, collection_name=collection_name)

# 2. Initialize Embedder

## 2-1. Dense Embedder

In [6]:
import asyncio
from tqdm.asyncio import tqdm

from psiking.core.embedder.vllm.online_jina_emb_v4 import VLLMOnlineJinaEmbV4Embedder

# Dense embedder; the endpoint URL and model name are read from project settings.
dense_embedder = VLLMOnlineJinaEmbV4Embedder(
    model=settings.multimodal_embedding_model,
    base_url=settings.multimodal_embedding_base_url,
)

## 2-2. Sparse Embedder

In [7]:
from fastembed import SparseTextEmbedding
from psiking.core.embedder.fastembed.local_sparse import LocalFastEmbedSparseEmbedder

# Local directory holding the sparse embedding model weights.
sparse_model_dir = os.path.join(
    settings.model_weight_dir,
    "embedding/fastembed/sparse/all_miniLM_L6_v2_with_attentions",
)
# Fail fast with an explicit message when the weights are missing, instead of
# relying on a discarded os.listdir() call to raise.
if not os.path.isdir(sparse_model_dir):
    raise FileNotFoundError(
        "Sparse model directory not found: {}".format(sparse_model_dir)
    )

# Load the sparse model from the local path on CPU, eagerly (lazy_load=False).
sparse_embedding_model = SparseTextEmbedding(
    model_name="Qdrant/bm42-all-minilm-l6-v2-attentions",
    specific_model_path=sparse_model_dir,
    cuda=False,
    lazy_load=False,
)

sparse_embedder = LocalFastEmbedSparseEmbedder(model=sparse_embedding_model)

# 3. Load Evaluation Data

## 3-1. Load Query & Ground Truth

In [8]:
import pandas as pd
# Read the full ground-truth table, then keep only the finance-domain rows.
# The row count is printed before and after filtering.
answer_df = pd.read_csv("data/retrieval_ground_truth.tsv", sep="\t")
print(len(answer_df))
answer_df = answer_df.loc[answer_df["domain"] == "finance"]
print(len(answer_df))

300
60


In [9]:
# Preview the first five finance rows.
answer_df.head(n=5)

Unnamed: 0,domain,question,target_file_id,target_file_name,target_page_no,context_type,target_answer
0,finance,"시중은행, 지방은행, 인터넷은행의 인가 요건 및 절차에 차이가 있는데 그 차이점은 ...",c94f675e-7d81-48bd-88f8-c5ff766190cc,[별첨] 지방은행의 시중은행 전환시 인가방식 및 절차.pdf,4,paragraph,"시중은행, 지방은행, 인터넷은행 모두 은행업을 영위하기 위해서는 '은행법' 제8조에..."
1,finance,"은행업을 신청하고자 할 때, 은행법상 소유규제에 부합하는 대주주 요건을 충족하려면 ...",c94f675e-7d81-48bd-88f8-c5ff766190cc,[별첨] 지방은행의 시중은행 전환시 인가방식 및 절차.pdf,7,table,은행업을 신청하려면 대주주 요건을 충족해야 합니다. 대주주 요건으로는 부실금융기관 ...
2,finance,본인가를 받으려는 지방은행이 시중은행 전환시 예비인가를 받을 필요가 있는지 설명하시...,c94f675e-7d81-48bd-88f8-c5ff766190cc,[별첨] 지방은행의 시중은행 전환시 인가방식 및 절차.pdf,8,paragraph,"본인가를 받으려는 지방은행이 시중은행 전환을 신청하는 경우, 예비인가를 받을 필요는..."
3,finance,"은행법에 의거 예비인가를 신청할 수 있는지와, 그 경우 금융위원회가 검토했어야 하는...",c94f675e-7d81-48bd-88f8-c5ff766190cc,[별첨] 지방은행의 시중은행 전환시 인가방식 및 절차.pdf,10,paragraph,은행법에 의하면 예비인가를 신청할 수 있습니다. 제8조에 따른 인가를 받으려는 자는...
4,finance,2019년 YTD 기준으로 브라질의 주식 시장 수익률과 베트남의 주식 시장 수익률 ...,7373884a-8255-482d-9e7c-00b919083526,★2019 제1회 증시콘서트 자료집_최종★.pdf,6,image,Refinitiv에서 제공한 자료에 따르면 2019년 YTD 브라질의 주식 시장 수...


## 3-2. Calculate Query Embeddings

In [10]:
import asyncio
from typing import List
from tqdm.asyncio import tqdm as atqdm

import numpy as np
from psiking.core.storage.vectorstore.schema import (
    MetadataFilters,
    FilterOperator,
    VectorStoreQuery,
    VectorStoreQueryMode,
    VectorStoreQueryOptions,
)   

In [11]:
# Evaluation questions as a plain list, in the row order of answer_df.
queries = answer_df["question"].tolist()

### 3-2-1. Calculate Dense Embeddings

In [None]:
async def calculate_dense_embedding(semaphore, text: str, mode: str='messages'):
    """Embed a single query string with the module-level ``dense_embedder``.

    Args:
        semaphore: async context manager held for the duration of the
            embedder call, used to bound concurrent requests.
        text: query text to embed.
        mode: ``'messages'`` wraps the text in one user chat message and sends
            it with ``input_format='messages'``; any other value sends the raw
            string with ``input_format='text'`` and ``mode='query'``.

    Returns:
        Whatever ``dense_embedder.arun`` returns for the request.

    Raises:
        Exception: any error raised by the embedder is printed with an
            ``ERR`` prefix and then re-raised.
    """
    # Only the request payload differs between the two modes; pooling and
    # normalization are always requested.
    if mode == 'messages':
        request_kwargs = dict(
            input=[
                {
                    'role': 'user',
                    'content': [{'type': 'text', 'text': text}],
                }
            ],
            input_format='messages',
        )
    else:
        request_kwargs = dict(input=text, input_format='text', mode='query')

    async with semaphore:
        try:
            return await dense_embedder.arun(
                pool=True,
                normalize=True,
                **request_kwargs,
            )
        except Exception as exc:
            print("ERR {}".format( str(exc)))
            raise exc

In [None]:
QUERY_EMBEDDING_MODE='text'
QUERY_EMBEDDING_MODE='messages'

semaphore = asyncio.Semaphore(16)
tasks = []
for query in queries:
    task = calculate_dense_embedding(semaphore, query, mode=QUERY_EMBEDDING_MODE)
    tasks.append(task)

dense_query_embeddings = await tqdm.gather(*tasks)

  0%|          | 0/60 [00:00<?, ?it/s]

100%|██████████| 60/60 [00:14<00:00,  4.28it/s]


### 3-2-2. Calculate Sparse Embeddings

In [14]:
# Embed all queries with the sparse embedder in one call; the result unpacks
# into two parallel sequences (values, indices), each indexed per query.
(
    sparse_query_embedding_values,
    sparse_query_embedding_indicies,
) = sparse_embedder.run(queries, batch_size=256)

In [15]:
# Show the sparse values and indices of the first query as a sanity check.
(sparse_query_embedding_values[0], sparse_query_embedding_indicies[0])

([0.3358786137712128,
  0.3013605213375253,
  0.1513849676123903,
  0.19132686389565112,
  0.2917136214767342,
  0.25055627768987737,
  0.12610037760540843,
  0.3535769768753488],
 [1024444394,
  1285937098,
  693871510,
  376689346,
  332251539,
  1798584096,
  1061271926,
  1903036828])

# 4. Prepare Evaluation

In [16]:
from typing import Literal
## Retrieval Function
def retrieve(
    mode: Literal['dense', 'sparse', 'hybrid'],
    d: np.ndarray,
    s_v: List[float],
    s_i: List[int],
    k: int = 10,
    sparse_k: int = 30,
    dense_k: int = 30,
)->List[str]:
    """Query the module-level ``vector_store`` and return the ids of the hits.

    Args:
        mode: which retrieval mode to run: ``'dense'``, ``'sparse'`` or
            ``'hybrid'``.
        d: dense query embedding.
        s_v: sparse query embedding values.
        s_i: sparse query embedding indices (parallel to ``s_v``).
        k: ``top_k`` passed to the vector store.
        sparse_k: ``sparse_top_k`` passed in hybrid mode only.
        dense_k: ``dense_top_k`` passed in hybrid mode only.

    Returns:
        The ``id`` attribute of every result returned by the vector store,
        in the order the store returned them.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """
    # Resolve the query options first so an unsupported mode is rejected
    # before anything else is built (previously it surfaced later as an
    # unbound-variable error).
    if mode == 'hybrid':
        vsoptions = VectorStoreQueryOptions(
            mode=VectorStoreQueryMode.HYBRID,
            top_k=k,
            hybrid_fusion_method='rrf',
            sparse_top_k=sparse_k,
            dense_top_k=dense_k
        )
    elif mode == 'dense':
        vsoptions = VectorStoreQueryOptions(
            mode=VectorStoreQueryMode.DENSE,
            top_k=k,
        )
    elif mode == 'sparse':
        vsoptions = VectorStoreQueryOptions(
            mode=VectorStoreQueryMode.SPARSE,
            top_k=k,
        )
    else:
        raise ValueError(
            "Unsupported retrieval mode: {!r}; expected 'dense', 'sparse' or 'hybrid'".format(mode)
        )

    # The query always carries both dense and sparse embeddings, whatever the mode.
    vsquery = VectorStoreQuery(
        dense_embedding=d,
        sparse_embedding_values=s_v,
        sparse_embedding_indicies=s_i
    )

    chunks = vector_store.query(
        query=vsquery,
        options=vsoptions
    )
    return [chunk.id for chunk in chunks]

In [17]:
## Evaluation Function
from utils.eval_utils import (
    calculate_filelevel_ap,
    calculate_filelevel_rr,
    calculate_pagelevel_ap,
    calculate_pagelevel_rr
)

# 5. Evaluate - Sparse

In [18]:
# Run sparse retrieval for every query, collecting one list of chunk ids per
# query. The dense embedding is passed as well because retrieve() always
# builds the query object from all three inputs.
k = 30
retrieved_chunkids = []

print("K {}".format(k))
for idx in tqdm(range(len(queries))):
    chunk_ids = retrieve(
        'sparse',
        d=dense_query_embeddings[idx],
        s_v=sparse_query_embedding_values[idx],
        s_i=sparse_query_embedding_indicies[idx],
        k=k,
    )
    retrieved_chunkids.append(chunk_ids)

K 30


100%|██████████| 60/60 [00:00<00:00, 121.56it/s]


## 5-1. File-Level

In [19]:
# Ground-truth file id for each query, taken positionally from answer_df.
target_file_ids = answer_df['target_file_id']
answer_fileids = [target_file_ids.iloc[row] for row in range(len(queries))]

# For each query, the 'source_id' metadata of every retrieved chunk, looked
# up through the document store.
retrieved_fileids = [
    [doc_store.get(chunk_id)[0].metadata['source_id'] for chunk_id in chunk_ids]
    for chunk_ids in retrieved_chunkids
]
# Number of queries, and how many results came back for query 5.
(len(answer_fileids), len(retrieved_fileids[5]))

(60, 13)

In [20]:
# [len(x) for x in retrieved_fileids]

In [21]:
# mAP
# Average calculate_filelevel_ap over all queries at each cutoff and print it
# as mAP@k.
# NOTE(review): the other metric cells also report @15; this one stops at 10 —
# confirm that is intentional.
for cutoff in (5, 10):
    ap_scores = [
        calculate_filelevel_ap(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_fileids, answer_fileids, strict=True)
    ]
    mean_ap = sum(ap_scores)/len(ap_scores)
    print("mAP@{}: {:.4f}".format(cutoff, mean_ap))

mAP@5: 0.3696
mAP@10: 0.3293


In [22]:
# mRR
# Average calculate_filelevel_rr over all queries at each cutoff and print it
# as mRR@k. (The former standalone `at_k = 5` was dead: the loop variable
# overwrote it before any use.)
for at_k in [5,10,15]:
    rrs = [
        calculate_filelevel_rr(retrieved, answer, at_k=at_k)
        for retrieved, answer in zip(retrieved_fileids, answer_fileids, strict=True)
    ]

    print("mRR@{}: {:.4f}".format(at_k, sum(rrs)/len(rrs)))

mRR@5: 0.4789
mRR@10: 0.4840
mRR@15: 0.4840


## 5-2. Page-Level

In [23]:
# Ground truth per query as a (file id, page number) pair, taken positionally
# from answer_df; the page number is coerced to int.
target_file_ids = answer_df['target_file_id']
target_page_nos = answer_df['target_page_no']
answers = [
    (target_file_ids.iloc[row], int(target_page_nos.iloc[row]))
    for row in range(len(queries))
]

# For each query, the full metadata of every retrieved chunk.
retrieved_metadata = [
    [doc_store.get(chunk_id)[0].metadata for chunk_id in chunk_ids]
    for chunk_ids in retrieved_chunkids
]

In [24]:
# Average calculate_pagelevel_ap over all queries at each cutoff and print it
# as mAP@k.
for cutoff in (5, 10, 15):
    ap_scores = [
        calculate_pagelevel_ap(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_metadata, answers, strict=True)
    ]
    mean_ap = sum(ap_scores)/len(ap_scores)
    print("mAP@{}: {:.4f}".format(cutoff, mean_ap))

mAP@5: 0.0301
mAP@10: 0.0323
mAP@15: 0.0292


In [25]:
# Average calculate_pagelevel_rr over all queries at each cutoff and print it
# as mRR@k.
for cutoff in (5, 10, 15):
    rr_scores = [
        calculate_pagelevel_rr(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_metadata, answers, strict=True)
    ]
    mean_rr = sum(rr_scores)/len(rr_scores)
    print("mRR@{}: {:.4f}".format(cutoff, mean_rr))

mRR@5: 0.0628
mRR@10: 0.0628
mRR@15: 0.0642


# 6. Evaluate - Dense

In [26]:
# Run dense retrieval for every query, collecting one list of chunk ids per
# query. The sparse values and indices are passed as well because retrieve()
# always builds the query object from all three inputs.
k = 30
retrieved_chunkids = []

print("K {}".format(k))
for idx in tqdm(range(len(queries))):
    chunk_ids = retrieve(
        'dense',
        d=dense_query_embeddings[idx],
        s_v=sparse_query_embedding_values[idx],
        s_i=sparse_query_embedding_indicies[idx],
        k=k,
    )
    retrieved_chunkids.append(chunk_ids)

K 30


100%|██████████| 60/60 [00:00<00:00, 102.79it/s]


## 6-1. File-Level

In [27]:
# Ground-truth file id for each query, taken positionally from answer_df.
target_file_ids = answer_df['target_file_id']
answer_fileids = [target_file_ids.iloc[row] for row in range(len(queries))]

# For each query, the 'source_id' metadata of every chunk returned by the
# dense run, looked up through the document store.
retrieved_fileids = [
    [doc_store.get(chunk_id)[0].metadata['source_id'] for chunk_id in chunk_ids]
    for chunk_ids in retrieved_chunkids
]

In [28]:
# mAP
# Average calculate_filelevel_ap over all queries at each cutoff and print it
# as mAP@k.
for cutoff in (5, 10, 15):
    ap_scores = [
        calculate_filelevel_ap(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_fileids, answer_fileids, strict=True)
    ]
    mean_ap = sum(ap_scores)/len(ap_scores)
    print("mAP@{}: {:.4f}".format(cutoff, mean_ap))

mAP@5: 0.7662
mAP@10: 0.7230
mAP@15: 0.6810


In [29]:
# mRR
# Average calculate_filelevel_rr over all queries at each cutoff and print it
# as mRR@k. (The former standalone `at_k = 5` was dead: the loop variable
# overwrote it before any use.)
for at_k in [5,10,15]:
    rrs = [
        calculate_filelevel_rr(retrieved, answer, at_k=at_k)
        for retrieved, answer in zip(retrieved_fileids, answer_fileids, strict=True)
    ]

    print("mRR@{}: {:.4f}".format(at_k, sum(rrs)/len(rrs)))

mRR@5: 0.8594
mRR@10: 0.8594
mRR@15: 0.8594


## 6-2. Page-Level

In [30]:
# Ground truth per query as a (file id, page number) pair, taken positionally
# from answer_df; the page number is coerced to int.
target_file_ids = answer_df['target_file_id']
target_page_nos = answer_df['target_page_no']
answers = [
    (target_file_ids.iloc[row], int(target_page_nos.iloc[row]))
    for row in range(len(queries))
]

# For each query, the full metadata of every chunk returned by the dense run.
retrieved_metadata = [
    [doc_store.get(chunk_id)[0].metadata for chunk_id in chunk_ids]
    for chunk_ids in retrieved_chunkids
]

In [31]:
# mAP
# Average calculate_pagelevel_ap over all queries at each cutoff and print it
# as mAP@k.
for cutoff in (5, 10, 15):
    ap_scores = [
        calculate_pagelevel_ap(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_metadata, answers, strict=True)
    ]
    mean_ap = sum(ap_scores)/len(ap_scores)
    print("mAP@{}: {:.4f}".format(cutoff, mean_ap))

mAP@5: 0.2208
mAP@10: 0.1648
mAP@15: 0.1349


In [32]:
# mRR
# Average calculate_pagelevel_rr over all queries at each cutoff and print it
# as mRR@k.
for cutoff in (5, 10, 15):
    rr_scores = [
        calculate_pagelevel_rr(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_metadata, answers, strict=True)
    ]
    mean_rr = sum(rr_scores)/len(rr_scores)
    print("mRR@{}: {:.4f}".format(cutoff, mean_rr))

mRR@5: 0.4428
mRR@10: 0.4546
mRR@15: 0.4569


# 7. Evaluate - Hybrid

In [33]:
# Run hybrid retrieval for every query, collecting one list of chunk ids per
# query. sparse_k / dense_k are forwarded as the per-branch top-k values and
# k as the overall top_k.
k = 30
sparse_k = 100
dense_k = 100
retrieved_chunkids = []

print("K {} (sparse {} dense {})".format(k, sparse_k, dense_k))
for idx in tqdm(range(len(queries))):
    chunk_ids = retrieve(
        'hybrid',
        d=dense_query_embeddings[idx],
        s_v=sparse_query_embedding_values[idx],
        s_i=sparse_query_embedding_indicies[idx],
        k=k,
        sparse_k=sparse_k,
        dense_k=dense_k,
    )
    retrieved_chunkids.append(chunk_ids)

K 30 (sparse 100 dense 100)


100%|██████████| 60/60 [00:00<00:00, 100.54it/s]


## 7-1. File-Level

In [34]:
# Ground-truth file id for each query, taken positionally from answer_df.
target_file_ids = answer_df['target_file_id']
answer_fileids = [target_file_ids.iloc[row] for row in range(len(queries))]

# For each query, the 'source_id' metadata of every chunk returned by the
# hybrid run, looked up through the document store.
retrieved_fileids = [
    [doc_store.get(chunk_id)[0].metadata['source_id'] for chunk_id in chunk_ids]
    for chunk_ids in retrieved_chunkids
]

In [35]:
# mAP
# Average calculate_filelevel_ap over all queries at each cutoff and print it
# as mAP@k.
for cutoff in (5, 10, 15):
    ap_scores = [
        calculate_filelevel_ap(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_fileids, answer_fileids, strict=True)
    ]
    mean_ap = sum(ap_scores)/len(ap_scores)
    print("mAP@{}: {:.4f}".format(cutoff, mean_ap))

mAP@5: 0.6094
mAP@10: 0.5752
mAP@15: 0.5455


In [36]:
# mRR
# Average calculate_filelevel_rr over all queries at each cutoff and print it
# as mRR@k. (The former standalone `at_k = 5` was dead: the loop variable
# overwrote it before any use.)
for at_k in [5,10,15]:
    rrs = [
        calculate_filelevel_rr(retrieved, answer, at_k=at_k)
        for retrieved, answer in zip(retrieved_fileids, answer_fileids, strict=True)
    ]

    print("mRR@{}: {:.4f}".format(at_k, sum(rrs)/len(rrs)))

mRR@5: 0.7653
mRR@10: 0.7697
mRR@15: 0.7697


## 7-2. Page-Level

In [37]:
# Ground truth per query as a (file id, page number) pair, taken positionally
# from answer_df; the page number is coerced to int.
target_file_ids = answer_df['target_file_id']
target_page_nos = answer_df['target_page_no']
answers = [
    (target_file_ids.iloc[row], int(target_page_nos.iloc[row]))
    for row in range(len(queries))
]

# For each query, the full metadata of every chunk returned by the hybrid run.
retrieved_metadata = [
    [doc_store.get(chunk_id)[0].metadata for chunk_id in chunk_ids]
    for chunk_ids in retrieved_chunkids
]

In [38]:
# mAP
# Average calculate_pagelevel_ap over all queries at each cutoff and print it
# as mAP@k.
for cutoff in (5, 10, 15):
    ap_scores = [
        calculate_pagelevel_ap(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_metadata, answers, strict=True)
    ]
    mean_ap = sum(ap_scores)/len(ap_scores)
    print("mAP@{}: {:.4f}".format(cutoff, mean_ap))

mAP@5: 0.1604
mAP@10: 0.1279
mAP@15: 0.1080


In [39]:
# mRR
# Average calculate_pagelevel_rr over all queries at each cutoff and print it
# as mRR@k.
for cutoff in (5, 10, 15):
    rr_scores = [
        calculate_pagelevel_rr(retrieved, answer, at_k=cutoff)
        for retrieved, answer in zip(retrieved_metadata, answers, strict=True)
    ]
    mean_rr = sum(rr_scores)/len(rr_scores)
    print("mRR@{}: {:.4f}".format(cutoff, mean_rr))

mRR@5: 0.3172
mRR@10: 0.3320
mRR@15: 0.3370
