# Generate test dataset


In [11]:
import json
import os
import sys

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
from langchain_core.documents import Document
from datetime import datetime
from DataProcessing.utils import load_yaml
from DataProcessing.extract_graphstate import (
    extract_documents_for_singlestore,
)

# Read the embedding configuration once and pull out the settings used below.
config = load_yaml("../config/embedding.yaml")
_settings = config["settings"]
category_id, filetype = _settings["category_id"], _settings["filetype"]
edit_path = _settings["edit_path"]
# path = os.path.join(edit_path, category_id, "json")
output_path = _settings["output_path"]
# Make sure the output directory exists; an already existing one is accepted.
os.makedirs(output_path, exist_ok=True)


def load_json_files(path):
    """Load every ``*.json`` file located directly inside ``path``.

    Files are read in sorted filename order so that repeated runs produce the
    same list order regardless of how the filesystem enumerates the directory.
    Entries whose name ends in ``.json`` but that are not regular files (for
    example a directory) are skipped instead of crashing ``open``.

    Args:
        path: Directory to scan. Sub-directories are not traversed.

    Returns:
        A list holding the decoded JSON content of each file, one item per
        file. Empty when the directory contains no JSON files.
    """
    data_list = []
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not filename.endswith(".json") or not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            data_list.append(json.load(file))
    return data_list


# Gather the raw JSON payloads of every configured category; each category
# keeps its files under "<edit_path>/<category>/json".
data_list = []
for category in config["settings"]["category_id"]:
    data_list += load_json_files(os.path.join(edit_path, category, "json"))


# Flatten the documents extracted from each payload into a single list.
all_documents = []
for data in data_list:
    all_documents += extract_documents_for_singlestore(data)

In [12]:
# Tag each document with a "filename" entry: the part of its doc_id that
# precedes the first underscore.
for doc in all_documents:
    doc.metadata["filename"] = doc.metadata["doc_id"].split("_")[0]

# Drop metadata entries whose value is an empty list or an empty string.
for doc in all_documents:
    kept = {}
    for key, value in doc.metadata.items():
        if value != [] and value != "":
            kept[key] = value
    doc.metadata = kept

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import random
import pandas as pd
from autorag.data.utils.util import corpus_df_to_langchain_documents

# Models handed to the ragas TestsetGenerator: one LLM for generation, one
# (temperature 0) as critic, plus the embedding model.
generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.56)
critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

corpus_df = pd.read_parquet("./data/filtered_corpus.parquet")
langchain_docs = corpus_df_to_langchain_documents(corpus_df)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample at the corpus size.
sample_size = min(100, len(langchain_docs))
random_samples = random.sample(langchain_docs, sample_size)

# Optional Korean prompt adaptation, currently disabled:
# language = "korean"
#
# generator.adapt(
#     language, evolutions=[simple, reasoning, multi_context], cache_dir="./data/cache"
# )
# generator.save(evolutions=[simple, reasoning, multi_context], cache_dir="./data/cache")

testset = generator.generate_with_langchain_docs(
    random_samples,
    test_size=10,
    distributions={simple: 0.25, reasoning: 0.25, multi_context: 0.5},
)
testset.to_pandas().to_csv("./data/ragas2_testset_10_2.csv")

print("Done")

In [None]:
from autorag.utils.preprocess import cast_qa_dataset
import uuid

# The generator returns a ragas test-set object rather than a DataFrame, so
# convert it before using len() and column indexing. If `testset` is already
# a DataFrame (no `to_pandas` attribute), use it unchanged.
testset_df = testset.to_pandas() if hasattr(testset, "to_pandas") else testset

# One row per generated question, each with a fresh random UUID as its id.
result_df = pd.DataFrame(
    {
        "qid": [str(uuid.uuid4()) for _ in range(len(testset_df))],
        "query": testset_df["question"].tolist(),
        "generation_gt": testset_df["ground_truth"].tolist(),
    }
)

# Each "metadata" cell is iterated as a sequence of dicts; the retrieval
# ground truth is the list of their "filename" values.
result_df["retrieval_gt"] = testset_df["metadata"].apply(
    lambda docs_meta: [meta["filename"] for meta in docs_meta]
)
result_df = cast_qa_dataset(result_df)
result_df.to_parquet("./data/use_splited_qa_ragas2_10_0.parquet")

# Evaluation


In [None]:
import pandas as pd
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)
from langchain_upstage.embeddings import UpstageEmbeddings
from langchain_openai import ChatOpenAI
import os
import json
from datasets import Dataset
from dotenv import load_dotenv


def run_evaluate(json_path: str, testcount: int = 0):
    """Evaluate one test-set JSON file with ragas and save the scores as CSV.

    The JSON content is turned into a ``datasets.Dataset`` and scored with the
    context_precision, context_recall, faithfulness and answer_relevancy
    metrics. The result table is written to
    ``./data/results/result_<filename>_<testcount>.csv``, where ``<filename>``
    is the base name of ``json_path`` up to its first dot.

    Args:
        json_path: Path to the JSON file holding the evaluation data.
        testcount: Run counter appended to the output file name.

    Returns:
        The result table as a pandas DataFrame (the same data that is written
        to the CSV file).
    """
    # Load variables from a .env file (presumably the API keys for the LLM
    # and embedding services -- confirm against the deployment setup).
    load_dotenv()
    with open(json_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    filename = os.path.basename(json_path).split(".")[0]

    # Create the output directory before the evaluation runs, so the final
    # write cannot fail on a missing directory after all the work is done.
    results_dir = "./data/results"
    os.makedirs(results_dir, exist_ok=True)

    dataset = Dataset.from_dict(json_data)

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
    embeddings = UpstageEmbeddings(model="solar-embedding-1-large-passage")

    result = evaluate(
        dataset=dataset,
        metrics=[
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
        llm=llm,
        embeddings=embeddings,
    )

    df = result.to_pandas()
    df.to_csv(f"{results_dir}/result_{filename}_{testcount}.csv")
    return df


# Evaluate every custom test set found directly under ./data.
base_path = "./data"
# Require the .json suffix as well as the prefix: run_evaluate parses each
# entry with json.load, so a directory or a non-JSON file sharing the prefix
# would otherwise abort the whole run.
json_list = [
    x
    for x in os.listdir(base_path)
    if x.startswith("customtestset_") and x.endswith(".json")
]
print(json_list)
for path in json_list:
    file_path = os.path.join(base_path, path)
    run_evaluate(file_path, 0)
    print("Done")

# Analysis


In [35]:
import pandas as pd
import os

# Directory holding the per-run evaluation CSV files, and the names of all
# entries currently inside it.
base_path = "./data/results"
with os.scandir(base_path) as _entries:
    result_list = [entry.name for entry in _entries]


def print_describe(base_path, filename):
    """Summarise the metric columns of one evaluation result CSV.

    Despite its name this function prints nothing; it returns the summary.

    Args:
        base_path: Directory containing the CSV file.
        filename: Name of the CSV file inside ``base_path``; its first column
            is used as the index.

    Returns:
        A DataFrame with rows ``mean`` and ``std`` and one column for each of
        context_precision, context_recall, faithfulness and answer_relevancy.
    """
    metric_columns = [
        "context_precision",
        "context_recall",
        "faithfulness",
        "answer_relevancy",
    ]
    scores = pd.read_csv(os.path.join(base_path, filename), index_col=0)
    return scores[metric_columns].agg(["mean", "std"])


# Collect one single-row frame per result file and concatenate once at the
# end. Growing a DataFrame with pd.concat inside the loop re-copies all
# earlier rows on each iteration, and concatenating onto an empty DataFrame
# triggers a FutureWarning in recent pandas versions.
mean_rows = []
std_rows = []

for file in result_list:
    if not file.startswith("result_"):
        continue
    result = print_describe(base_path, file)
    # Reduce "result_customtestset_<name>_0.csv" to "<name>" for labelling.
    file_name = file.replace("result_customtestset_", "").replace("_0.csv", "")

    for stat, rows in (("mean", mean_rows), ("std", std_rows)):
        # Copy before adding the label so the summary frame is not mutated.
        row = result.loc[stat].copy()
        row["파일명"] = file_name
        rows.append(row.to_frame().T)

# Fall back to an empty frame when no result file was found, matching the
# previous behaviour (pd.concat rejects an empty list).
result_df_mean = pd.concat(mean_rows) if mean_rows else pd.DataFrame()
result_df_std = pd.concat(std_rows) if std_rows else pd.DataFrame()

In [39]:
result_df_mean

Unnamed: 0,context_precision,context_recall,faithfulness,answer_relevancy,파일명
mean,1.0,0.12,0.547473,0.040684,sonnet_multivector
mean,1.0,0.307143,0.453698,0.151005,sonnet_parentdocument
mean,0.966393,0.4,0.801328,0.046288,sonnet_bm25
mean,1.0,0.238889,0.627508,0.09526,mini_multivector
mean,1.0,0.357143,0.469314,0.107169,mini_parentdocument
mean,0.966393,0.511667,0.736923,0.055993,mini_bm25


In [38]:
result_df_std

Unnamed: 0,context_precision,context_recall,faithfulness,answer_relevancy,파일명
std,0.0,0.315524,0.417318,0.075617,sonnet_multivector
std,0.0,0.364107,0.435119,0.12878,sonnet_parentdocument
std,0.074134,0.442217,0.325952,0.090583,sonnet_bm25
std,0.0,0.38873,0.392216,0.109286,mini_multivector
std,0.0,0.417801,0.43134,0.12229,mini_parentdocument
std,0.074134,0.464549,0.313787,0.09633,mini_bm25
