# 1. Make corpus


## use split data


In [6]:
import json
import os
import sys

# Put the parent of the working directory on the import path so the
# `DataProcessing` imports below can be resolved from there
# (presumably the package lives next to this notebook's folder — verify).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from DataProcessing.utils import load_yaml
from DataProcessing.extract_graphstate import (
    extract_documents_for_docstore,
    extract_documents_for_single_store,
    extract_documents_for_vectorstore,
)
from autorag.data.corpus import langchain_documents_to_parquet

# Read the embedding configuration once and unpack the settings this notebook uses.
config = load_yaml("../config/embedding.yaml")
settings = config["settings"]

category_id = settings["category_id"]
filetype = settings["filetype"]
edit_path = settings["edit_path"]
output_path = settings["output_path"]

# Create the configured output directory up front; a no-op when it already exists.
os.makedirs(output_path, exist_ok=True)


def load_json_files(path):
    """Load every ``*.json`` file located directly inside ``path``.

    Files are read in sorted filename order so the returned list (and anything
    built from it) is the same on every run; ``os.listdir`` on its own yields
    entries in arbitrary order.

    Args:
        path: Directory to scan. Sub-directories are not traversed.

    Returns:
        A list holding the parsed content of each JSON file, ordered by filename.
        Empty when the directory contains no ``.json`` files.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    data_list = []
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(path, filename), "r", encoding="utf-8") as file:
            data_list.append(json.load(file))
    return data_list


# Gather the raw JSON payloads of every configured category.
# The setting may hold a single id or a list of ids; wrap a bare string so the
# loop below does not iterate over its individual characters.
category_ids = config["settings"]["category_id"]
if isinstance(category_ids, str):
    category_ids = [category_ids]

data_list = []
for category in category_ids:
    category_path = os.path.join(edit_path, category, "json")
    data_list.extend(load_json_files(category_path))

# Flatten the documents extracted from each payload into one list.
all_documents = []
for data in data_list:
    documents = extract_documents_for_single_store(data)
    all_documents.extend(documents)

# Make sure the target directory exists, and pass `upsert=True` (as the QA
# cell further down does) so re-running this cell refreshes the corpus file
# instead of failing because it already exists.
corpus_path = "./data/use_splited_corpus.parquet"
os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
corpus_df = langchain_documents_to_parquet(all_documents, corpus_path, upsert=True)

In [6]:
import pandas as pd

from langchain_openai import ChatOpenAI

# Load API credentials from a local .env file before building the client, as
# every later LLM cell in this notebook does; without it a fresh kernel only
# works when the key is already exported in the environment.
from dotenv import load_dotenv

load_dotenv()

# Re-read the corpus from disk so this cell can run without rebuilding it.
corpus_df = pd.read_parquet("./data/use_splited_corpus.parquet")
# The following cell uses this model only through `get_num_tokens`.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.56)

In [8]:
def has_usable_length(text):
    """Return True when ``text`` has more than 30 and fewer than 500 tokens."""
    token_count = llm.get_num_tokens(text)
    return 30 < token_count < 500


# Keep only the passages whose token count falls strictly inside the window.
length_mask = corpus_df["contents"].apply(has_usable_length)
filtered_df = corpus_df[length_mask]
filtered_df.to_parquet("./data/filtered_corpus.parquet")
print(f"저장된 행 수: {len(filtered_df)}")

저장된 행 수: 769


# 2. QA generation


## use AutoRAG


In [None]:
import pandas as pd

from llama_index.llms.openai import OpenAI
from autorag.data.qacreation import generate_qa_llama_index, make_single_content_qa
import nest_asyncio
from dotenv import load_dotenv

# Load environment variables (e.g. API credentials) from a local .env file.
load_dotenv()
# Patch asyncio so event loops can be nested inside the notebook kernel.
nest_asyncio.apply()

# Prompt template for question/answer generation. `{{text}}` and
# `{{num_questions}}` are placeholders — presumably filled in by the QA
# creation function that receives this prompt; confirm against AutoRAG.
# The instructions are in English but ask for the result to be written in Korean.
prompt = """The given text is a financial columns written in Korean.
Generate question and answer pairs related to financial events, considering the importance of dates, timelines, and the sequence of events. Focus on financial concepts, the timing of the events, and how specific dates or time periods impact the content.

Passage:
{{text}}

Number of questions to generate: {{num_questions}}

Guidelines:
1. Ensure that the questions are relevant to specific dates or timeframes mentioned in the passage.
2. Include questions that ask about the significance of key financial events, focusing on when they occurred and their subsequent impact.
3. The answers should provide precise information related to both the event and the date/time context.
4. Be clear about the source (whose claim it is)
5. Please write the result in Korean

Example:
[Q]: On what date did the stock market experience a significant drop?
[A]: The stock market experienced a significant drop on March 9, 2020.

[Q]: How did the interest rate change on July 15, 2021, affect the housing market?
[A]: The interest rate increase on July 15, 2021, caused housing prices to stabilize as borrowing costs rose.

Result:
"""

# Start from the token-length-filtered corpus written earlier in this notebook.
corpus_df = pd.read_parquet("./data/filtered_corpus.parquet")
llm = OpenAI(model="gpt-4o-mini", temperature=0.56)


# Seed the sampling so the same 250 candidate passages are drawn on every
# run; without it the candidate pool differs on every run.
qa_df = make_single_content_qa(
    corpus_df.sample(n=250, random_state=42),
    content_size=100,
    qa_creation_func=generate_qa_llama_index,
    llm=llm,
    prompt=prompt,
    question_num_per_content=2,
    output_filepath="./data/use_splited_qa_autorag.parquet",
    upsert=True,
)

## use RAGAS


In [None]:
from autorag.data.qacreation.ragas import generate_qa_ragas
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv

# Load environment variables (e.g. API credentials) from a local .env file and
# patch asyncio so event loops can be nested inside the notebook kernel.
load_dotenv()
nest_asyncio.apply()

# Equal weight for each of the four RAGAS question-evolution types.
distributions = {
    simple: 0.25,
    reasoning: 0.25,
    multi_context: 0.25,
    conditional: 0.25,
}

corpus_df = pd.read_parquet("./data/filtered_corpus.parquet")
# The generator runs at a non-zero temperature; the critic is pinned to 0.0.
generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.56)
critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
# Seed the sampling so the same 200 source passages are drawn on every run;
# without it the passage pool differs on every run.
qa_df = generate_qa_ragas(
    corpus_df.sample(n=200, random_state=42),
    test_size=10,
    distributions=distributions,
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embedding_model=embedding_model,
)
qa_df.to_parquet("./data/use_splited_qa_ragas2.parquet")

# 3. Evaluation


In [None]:
from autorag.evaluator import Evaluator
import nest_asyncio
from dotenv import load_dotenv
import autorag
from langchain_upstage.embeddings import UpstageEmbeddings

# Load environment variables from .env and allow nested event loops in the kernel.
load_dotenv()
nest_asyncio.apply()

# Optional: register the Upstage embedding model with AutoRAG before the trial.
# autorag.embedding_models["upstage_embed"] = autorag.LazyInit(UpstageEmbeddings)

# Inputs of the evaluation run, gathered in one place.
# NOTE(review): the QA file named here is not written by any cell in this
# notebook — confirm it is the intended test set.
evaluator_kwargs = {
    "qa_data_path": "./data/autorag_testset_100_0.parquet",
    "corpus_data_path": "./data/use_splited_corpus.parquet",
    "project_dir": "./benchmark/test_2",
}
evaluator = Evaluator(**evaluator_kwargs)

trial_config_path = "./config/retriever_test.yaml"
evaluator.start_trial(trial_config_path)

# To resume an existing trial instead of starting a new one:
# evaluator.restart_trial(trial_path='your/path/to/trial_path')

In [6]:
from autorag.deploy import extract_best_config

# Trial directory whose best configuration is exported next to it.
# NOTE(review): this points at ./benchmark/test_1 while the evaluation cell
# writes to ./benchmark/test_2 — confirm which trial is meant.
trial_path = "./benchmark/test_1/4"
best_pipeline_path = f"{trial_path}/best_pipeline.yaml"

pipeline_dict = extract_best_config(
    trial_path=trial_path,
    output_path=best_pipeline_path,
)

In [None]:
# Launch the AutoRAG dashboard for the given trial directory (shell command).
!autorag dashboard --trial_dir ./benchmark/test_1/4

In [None]:
# Launch the AutoRAG Streamlit web interface for the given trial (shell command).
!autorag run_web --trial_path ./benchmark/test_1/4