# Load Test Dataset

In [None]:
import pandas as pd

# Absolute path to the evaluation CSV. The retrieval loop later in this
# notebook reads its `question`, `target_file_name` and `target_page_no` columns.
# NOTE(review): machine-specific path — adjust before running elsewhere.
dataset_path = "/Users/claion/Projects/experimental-code/rag-bench-marking-test/data/new_rag-eval-ko-dataset-public.csv"
dataset = pd.read_csv(dataset_path)

In [None]:
# Display the first three rows of the loaded dataset.
dataset.head(3)

# Connect to VectorDB

In [None]:
import os

# Directory that receives the benchmark result CSV.
# NOTE(review): machine-specific path — adjust before running elsewhere.
result_base_path = (
    "/Users/claion/Projects/experimental-code/rag-bench-marking-test/data/results"
)
# Create the directory if needed; exist_ok=True makes re-runs a no-op.
os.makedirs(result_base_path, exist_ok=True)
# File name of the result CSV written inside result_base_path.
result_file_name = "03-04_bmt_result.csv"

In [None]:
from qdrant_client import QdrantClient, models

# Connection settings for the Qdrant server and the collection under test.
HOST = "localhost"
PORT = 6333
COLLECTION_NAME = "bench-marking-test"

# Vector names passed as `using=` when building the prefetch queries.
DENSE_VECTOR = "dense"
SPARSE_VECTOR = "sparse"

# Client used by the retrieval loop to run queries against the collection.
client = QdrantClient(host=HOST, port=PORT)

# Load Embedding Model and Retrieve

In [None]:
from embedding import DenseEmbedding, SparseEmbedding

# Instantiate the dense and sparse encoders imported from the `embedding` module.
dense_model = DenseEmbedding()
sparse_model = SparseEmbedding()

test_query = "model loading"

# Embed a throwaway query with each model; the results are not stored.
# NOTE(review): presumably a warm-up / smoke test so that one-time model
# loading cost is not counted inside the timed retrieval loop — confirm.
dense_model.query_embed(test_query)
sparse_model.query_embed(test_query)

In [None]:
from datetime import datetime

import time

# Number of final hits stored per question; it also fixes how many
# retrieved_doc/page/cont column triples the result table has.
top_k = 5
# Number of candidates each prefetch branch (dense / sparse) feeds into fusion.
pref_limit = 20

# Column-oriented accumulator: one list per CSV column, one entry per question.
retrieve_result = {
    "question": [],
    "target_file_name": [],
    "target_page_no": [],
    "latency_secs": [],
}
for rank in range(1, top_k + 1):
    retrieve_result[f"retrieved_doc{rank}"] = []
    retrieve_result[f"retrieved_page{rank}"] = []
    retrieve_result[f"retrieved_cont{rank}"] = []

# Iterate the three dataset columns in lockstep instead of looking rows up by
# index label, so the loop does not depend on the DataFrame's index values.
questions = zip(dataset.question, dataset.target_file_name, dataset.target_page_no)
for q_no, (query, target_file_name, target_page_no) in enumerate(questions, start=1):
    # perf_counter is monotonic, so the measured latency cannot be distorted
    # by wall-clock adjustments the way datetime.now() differences can.
    start_time = time.perf_counter()

    # Search the vector DB: embed the query with both models ...
    q_dense_embedding = dense_model.query_embed(query)
    q_sparse_embedding = sparse_model.query_embed(query)

    # ... fetch candidates from the dense and sparse vectors separately ...
    prefetch = [
        models.Prefetch(
            query=q_dense_embedding,
            using=DENSE_VECTOR,
            limit=pref_limit,
        ),
        models.Prefetch(
            query=models.SparseVector(**q_sparse_embedding),
            using=SPARSE_VECTOR,
            limit=pref_limit,
        ),
    ]

    # ... and fuse both candidate lists into the final top_k with DBSF.
    results = client.query_points(
        collection_name=COLLECTION_NAME,
        prefetch=prefetch,
        query=models.FusionQuery(fusion=models.Fusion.DBSF),
        limit=top_k,
    ).model_dump()

    retrieve_result["question"].append(query)
    retrieve_result["target_file_name"].append(target_file_name)
    retrieve_result["target_page_no"].append(target_page_no)

    # Store the search results. Every rank column gets exactly one entry per
    # question: if fewer than top_k points come back, the missing ranks are
    # padded with None. Otherwise the column lists would end up with
    # different lengths and building the DataFrame from them would fail.
    points = results["points"]
    for rank in range(1, top_k + 1):
        if rank <= len(points):
            payload = points[rank - 1]["payload"]
            doc = payload["file_name"]
            page = payload["page"]
            cont = payload["content"]
        else:
            doc = page = cont = None
        retrieve_result[f"retrieved_doc{rank}"].append(doc)
        retrieve_result[f"retrieved_page{rank}"].append(page)
        retrieve_result[f"retrieved_cont{rank}"].append(cont)

    # Latency covers embedding, the query and result bookkeeping (4 decimals).
    latency_secs = round(time.perf_counter() - start_time, 4)
    retrieve_result["latency_secs"].append(latency_secs)
    print(f">> Complete Question {q_no:02} / Latency: {latency_secs}")


# Save Retrieval Results to CSV

In [None]:
# Turn the column-oriented accumulator into a table and write it to the
# result CSV without the row index.
result_df = pd.DataFrame(data=retrieve_result)
result_df.to_csv(
    path_or_buf=os.path.join(result_base_path, result_file_name),
    index=False,
)

# Evaluate Result

In [None]:
# Reload the saved retrieval results from disk.
result_df = pd.read_csv(os.path.join(result_base_path, result_file_name))

# Number of top-ranked results considered per question.
limit = 5

# Per-question scores, in the same row order as result_df.
recalls = []
reciprocal_ranks = []

for row in result_df.itertuples(index=False):
    # 1-based rank of the first retrieved result whose file name AND page both
    # match the target, or None when no result within `limit` matches.
    hit_rank = next(
        (
            rank
            for rank in range(1, limit + 1)
            if getattr(row, f"retrieved_doc{rank}") == row.target_file_name
            and getattr(row, f"retrieved_page{rank}") == row.target_page_no
        ),
        None,
    )
    if hit_rank is None:
        reciprocal_ranks.append(0)
        recalls.append(0)
    else:
        # Keep the exact reciprocal rank: rounding here (e.g. 1/3 -> 0.33)
        # would bias the mean computed from these values later. Round only
        # when displaying.
        reciprocal_ranks.append(1 / hit_rank)
        recalls.append(1)

In [None]:
# Attach the per-question evaluation results as new columns.
result_df["recall"] = recalls
result_df["reciprocal_rank"] = reciprocal_ranks

In [None]:
# Save again, now including the evaluation columns; this overwrites the
# same result CSV and omits the row index.
result_df.to_csv(
    path_or_buf=os.path.join(result_base_path, result_file_name),
    index=False,
)

In [None]:
# Final evaluation results
import math


def get_cnt_by_ratio(total_len: int, ratio: float = 0.95) -> int:
    """Return ``total_len * ratio`` rounded up to the next whole count.

    Args:
        total_len: Total number of items.
        ratio: Fraction of the items to keep; defaults to 0.95.

    Returns:
        The smallest integer that is not less than ``total_len * ratio``.
    """
    scaled = total_len * ratio
    return math.ceil(scaled)


# How many samples make up 95% of the measured latencies (rounded up).
latencies = result_df.latency_secs
r_95 = get_cnt_by_ratio(len(latencies), 0.95)

# Recall and MRR are plain means over the per-question scores.
recall_5 = sum(recalls) / len(recalls)
mrr_5 = sum(reciprocal_ranks) / len(reciprocal_ranks)

# Mean latency over the r_95 fastest queries, i.e. the slowest ~5% are left
# out. This is a trimmed mean, not the 95th-percentile latency.
fastest = sorted(latencies)[:r_95]
latency_95 = sum(fastest) / r_95

In [None]:
# RRF_10: label for the run whose metrics this cell's output records.
# NOTE(review): presumably RRF fusion with a prefetch limit of 10 — confirm.
# The cell only prints whatever recall_5 / mrr_5 / latency_95 currently hold,
# so the retrieval and evaluation cells must be re-run with that setup first.
print(f"{recall_5=}")
print(f"{mrr_5=:.2f}")
print(f"{latency_95=:.4f}")

In [None]:
# RRF_20: label for the run whose metrics this cell's output records.
# NOTE(review): presumably RRF fusion with a prefetch limit of 20 — confirm.
# The cell only prints whatever recall_5 / mrr_5 / latency_95 currently hold,
# so the retrieval and evaluation cells must be re-run with that setup first.
print(f"{recall_5=}")
print(f"{mrr_5=:.2f}")
print(f"{latency_95=:.4f}")

In [None]:
# DBSF_10: label for the run whose metrics this cell's output records.
# NOTE(review): presumably DBSF fusion with a prefetch limit of 10 — confirm.
# The cell only prints whatever recall_5 / mrr_5 / latency_95 currently hold,
# so the retrieval and evaluation cells must be re-run with that setup first.
print(f"{recall_5=}")
print(f"{mrr_5=:.2f}")
print(f"{latency_95=:.4f}")

In [None]:
# DBSF_20: label for the run whose metrics this cell's output records.
# NOTE(review): presumably DBSF fusion with a prefetch limit of 20, which is
# what the retrieval cell uses (Fusion.DBSF, pref_limit = 20) — confirm.
# The cell only prints whatever recall_5 / mrr_5 / latency_95 currently hold.
print(f"{recall_5=}")
print(f"{mrr_5=:.2f}")
print(f"{latency_95=:.4f}")