# Load Test Dataset

In [10]:
import pandas as pd
from pathlib import Path
# The evaluation CSV is resolved against the parent of the current working
# directory (presumably the project root when the notebook runs from a
# sub-folder -- TODO confirm).
dataset_name = "data/new_rag-eval-ko-dataset-public.csv"
path = Path(".").resolve().parent
dataset_path = path.joinpath(dataset_name)
dataset = pd.read_csv(filepath_or_buffer=dataset_path)

In [11]:
# Preview the first three rows of the evaluation set.
dataset.head(n=3)

Unnamed: 0,domain,question,target_answer,target_file_name,target_page_no,context_type
0,public,국세수입 담당부서와 담당자는 누구인가요?,국세수입 담당자는 세제실 주세분석과의 김태경 사무관이 입니다.,(240411보도자료) 재정동향 4월호.pdf,1,table
1,public,"2024년 1월, 2월, 3월 각각의 평균 조달금리와 응찰률이 어떻게 되나요?","2024년 1월의 평균 조달금리는 3.27%, 응찰률은 333이며, 2월의 평균 조...",(240411보도자료) 재정동향 4월호.pdf,4,table
2,public,2023년에 비해 2027년에 의무지출의 비중이 얼마나 늘어난 것인가?,2.8,2023_2027 국가재정운용계획 주요내용.pdf,10,table


# Connect to VectorDB

In [12]:

# Directory and file name under which the retrieval benchmark results are stored.
result_base_path = path / "data/results"

# parents=True also creates any missing intermediate directory (e.g. `data/`),
# so this cell no longer raises FileNotFoundError on a fresh checkout;
# exist_ok=True keeps re-running the cell harmless.
result_base_path.mkdir(parents=True, exist_ok=True)
result_file_name = "03-01_bmt_result.csv"

In [16]:
from qdrant_client import QdrantClient

# Connection settings for the Qdrant server.
HOST, PORT = "localhost", 6333

# Collection that is queried and the name of the vector used for the search.
COLLECTION_NAME = "RAG_evaluation_test"
DENSE_VECTOR = "dense"

client = QdrantClient(port=PORT, host=HOST)

# Load Embedding Model and Retrieve

In [17]:
from embedding import DenseEmbedding

# Embed a throwaway query right after constructing the model; the returned
# vector is displayed as the cell output.
test_query = "model loading"
dense_model = DenseEmbedding()
dense_model.query_embed(test_query)

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 336441.50it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


array([-0.0293819 , -0.02064265, -0.05381404, ...,  0.05163255,
       -0.00160881,  0.01638607], shape=(1024,), dtype=float32)

In [18]:
from datetime import datetime

import time  # perf_counter() is monotonic, unlike wall-clock datetime.now()

# Number of points requested per query; also the number of retrieved_* column
# groups in the result table.
top_k = 5

# Column-oriented accumulator (one list per output column). Every list must
# end up with exactly one entry per question so the table stays rectangular.
retrieve_result = {
    "question": [],
    "target_file_name": [],
    "target_page_no": [],
    "latency_secs": [],
}
for rank in range(1, top_k + 1):
    retrieve_result[f"retrieved_doc{rank}"] = []
    retrieve_result[f"retrieved_page{rank}"] = []
    retrieve_result[f"retrieved_cont{rank}"] = []

# Iterate the three columns together so question and targets cannot get
# misaligned, whatever index the DataFrame carries.
targets = zip(dataset.question, dataset.target_file_name, dataset.target_page_no)
for idx, (query, target_file_name, target_page_no) in enumerate(targets):
    start_time = time.perf_counter()

    # Vector DB search: embed the question and fetch the top_k nearest points.
    q_embedding = dense_model.query_embed(query)
    results = client.query_points(
        collection_name=COLLECTION_NAME,
        query=q_embedding,
        using=DENSE_VECTOR,
        limit=top_k,
    ).model_dump()

    retrieve_result["question"].append(query)
    retrieve_result["target_file_name"].append(target_file_name)
    retrieve_result["target_page_no"].append(target_page_no)

    # Store the search results. The search may return fewer than top_k points;
    # missing ranks are padded with None so all columns keep the same length.
    points = results["points"]
    for rank in range(1, top_k + 1):
        if rank <= len(points):
            payload = points[rank - 1]["payload"]
            doc, page, content = payload["file_name"], payload["page"], payload["content"]
        else:
            doc = page = content = None
        retrieve_result[f"retrieved_doc{rank}"].append(doc)
        retrieve_result[f"retrieved_page{rank}"].append(page)
        retrieve_result[f"retrieved_cont{rank}"].append(content)

    # Latency covers embedding, search and bookkeeping, rounded to 4 decimals.
    latency_secs = float(f"{(time.perf_counter() - start_time):.4f}")
    retrieve_result["latency_secs"].append(latency_secs)
    print(f">> Complete Question {idx + 1:02} / Latency: {latency_secs}")


>> Complete Question 01 / Latency: 0.1309
>> Complete Question 02 / Latency: 0.1832
>> Complete Question 03 / Latency: 0.1009
>> Complete Question 04 / Latency: 0.7852
>> Complete Question 05 / Latency: 0.0859
>> Complete Question 06 / Latency: 0.0881
>> Complete Question 07 / Latency: 0.0779
>> Complete Question 08 / Latency: 0.1508
>> Complete Question 09 / Latency: 0.0781
>> Complete Question 10 / Latency: 0.0761
>> Complete Question 11 / Latency: 0.0871
>> Complete Question 12 / Latency: 0.1053
>> Complete Question 13 / Latency: 0.0779
>> Complete Question 14 / Latency: 0.0749
>> Complete Question 15 / Latency: 0.0408
>> Complete Question 16 / Latency: 0.0406
>> Complete Question 17 / Latency: 0.1179
>> Complete Question 18 / Latency: 0.076
>> Complete Question 19 / Latency: 0.0785
>> Complete Question 20 / Latency: 0.0385
>> Complete Question 21 / Latency: 0.074
>> Complete Question 22 / Latency: 0.0371
>> Complete Question 23 / Latency: 0.0371
>> Complete Question 24 / Latency: 0

# Save Retrieval Results to CSV

In [19]:
import os 
# Turn the column-oriented accumulator into a table and write it to disk
# without the index column.
result_df = pd.DataFrame(data=retrieve_result)
result_df.to_csv(result_base_path / result_file_name, index=None)

# Evaluate Result

In [20]:
# Load Data
result_df = pd.read_csv(os.path.join(result_base_path, result_file_name))

# Evaluation: for every question, look for the first rank (1..limit) whose
# retrieved (document, page) pair equals the target pair.
limit = 5

recalls = []
reciprocal_ranks = []

for idx in range(result_df.shape[0]):
    target_file_name = result_df.target_file_name[idx]
    target_page_no = result_df.target_page_no[idx]

    for rank in range(1, limit + 1):
        if (
            result_df[f"retrieved_doc{rank}"][idx] == target_file_name
            and result_df[f"retrieved_page{rank}"][idx] == target_page_no
        ):
            # Keep the exact reciprocal: rounding each term (e.g. 1/3 -> 0.33)
            # before averaging would bias the resulting MRR.
            reciprocal_ranks.append(1 / rank)
            recalls.append(1)
            break
    else:
        # No rank matched the target: counts as a miss for both metrics.
        reciprocal_ranks.append(0)
        recalls.append(0)

In [21]:
# Attach the per-question evaluation metrics as new columns.
result_df = result_df.assign(recall=recalls, reciprocal_rank=reciprocal_ranks)

In [22]:
# Save again, this time including the evaluation columns.
result_df.to_csv(result_base_path / result_file_name, index=None)

In [23]:
# Final evaluation results
import math


def get_cnt_by_ratio(total_len: int, ratio: float = 0.95):
    """Return how many items make up `ratio` of `total_len`, rounded up.

    Args:
        total_len: Total number of items.
        ratio: Fraction of the items to keep (defaults to 0.95).

    Returns:
        The smallest integer that is >= ``total_len * ratio``.
    """
    scaled = ratio * total_len
    return math.ceil(scaled)


# Number of samples that make up the fastest 95% of the measured latencies.
r_95 = get_cnt_by_ratio(len(result_df["latency_secs"]), 0.95)

# Plain averages of the per-question hit flags and reciprocal ranks.
recall_5 = sum(recalls) / len(recalls)
mrr_5 = sum(reciprocal_ranks) / len(reciprocal_ranks)

# Mean of the r_95 smallest latencies, i.e. the slowest ~5% are dropped.
# NOTE(review): this is a trimmed mean, not the 95th-percentile latency.
fastest_latencies = sorted(result_df["latency_secs"])[:r_95]
latency_95 = sum(fastest_latencies) / r_95

In [24]:
# Report the three summary metrics, one per line.
print(
    f"{recall_5=}",
    f"{mrr_5=:.2f}",
    f"{latency_95=:.4f}",
    sep="\n",
)

recall_5=0.28
mrr_5=0.16
latency_95=0.0727
