In [1]:
import json
import sys
import os

# Put the parent of the current working directory on the import path; the
# `DataProcessing` imports below are presumably resolved from there.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guarded so that re-running this cell does not stack duplicate entries
# onto sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from langchain_core.documents import Document
from datetime import datetime
from DataProcessing.utils import load_yaml
from DataProcessing.extract_graphstate import (
    extract_documents_for_singlestore,
)

# Read the embedding configuration once and pull out the settings used below.
config = load_yaml("../config/embedding.yaml")
_settings = config["settings"]
category_id = _settings["category_id"]
filetype = _settings["filetype"]
edit_path = _settings["edit_path"]
# The per-category JSON directories are derived from `edit_path` further down,
# when the files are actually loaded.
output_path = _settings["output_path"]
# Create the output directory up front; an existing directory is not an error.
os.makedirs(output_path, exist_ok=True)


def load_json_files(path):
    """Load every ``*.json`` file located directly inside ``path``.

    Files are visited in sorted filename order so the returned list (and
    anything built from it) is the same on every run and platform;
    ``os.listdir`` on its own makes no ordering guarantee.

    Args:
        path: Directory to scan. The scan is not recursive.

    Returns:
        A list holding the parsed content of each JSON file, one entry per
        file, ordered by filename. Empty if the directory has no JSON files.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    data_list = []
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(path, filename), "r", encoding="utf-8") as file:
            data_list.append(json.load(file))
    return data_list


# `category_id` may be configured as a single id or as a list of ids. A bare
# string is wrapped first: iterating it directly would walk it character by
# character and build one bogus directory path per character.
categories = config["settings"]["category_id"]
if isinstance(categories, str):
    categories = [categories]

# Collect the parsed JSON payloads from <edit_path>/<category>/json for every
# configured category.
data_list = []
for category in categories:
    category_path = os.path.join(edit_path, category, "json")
    data_list.extend(load_json_files(category_path))

# Flatten the documents extracted from each payload into one list. The loop
# variable is deliberately not called `data`, which a later cell binds to the
# evaluation-results dict.
all_documents = []
for payload in data_list:
    all_documents.extend(extract_documents_for_singlestore(payload))

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain_upstage.embeddings import UpstageEmbeddings
import uuid
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_upstage.embeddings import UpstageEmbeddings
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from dotenv import load_dotenv
from langchain_community.vectorstores.utils import filter_complex_metadata
from datetime import datetime
from langchain.storage.encoder_backed import EncoderBackedStore
import pickle


def key_encoder(key: int | str) -> str:
    return str(key)


def value_serializer(value: object) -> bytes:
    """Pickle ``value`` into the bytes payload handed to the backing store.

    Args:
        value: Any picklable object.

    Returns:
        The pickled representation of ``value`` as ``bytes``.

    Raises:
        pickle.PicklingError: If ``value`` cannot be pickled (some objects
            raise ``TypeError`` or ``AttributeError`` instead).
    """
    return pickle.dumps(value)


def value_deserializer(serialized_value: bytes) -> object:
    """Rebuild the object stored in a pickled ``bytes`` payload.

    Args:
        serialized_value: Bytes previously produced by ``pickle.dumps``.

    Returns:
        The unpickled object.

    Raises:
        pickle.UnpicklingError: If the payload is not a valid pickle stream.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only point this
    # at stores whose contents were written by trusted code.
    return pickle.loads(serialized_value)


# Load environment variables via python-dotenv before any client is built;
# the embedding client below presumably reads its API key from the environment.
load_dotenv()
# NOTE(review): `time` is not referenced again in this cell — looks like a
# leftover from date-stamped project names; confirm before removing.
time = datetime.now().strftime("%Y.%m.%d")
# NOTE(review): f-string without placeholders; the project name is a fixed
# literal rather than being derived from `time`.
proj_name = f"2024.09.14_for_parentdocument_store"
# File-backed store under a machine-specific absolute path.
# NOTE(review): hard-coded user directory — not portable to other machines.
store = LocalFileStore(
    f"/Users/youngseoklee/Desktop/workplace/MacroAgent-withRAG/cache/{proj_name}/data"
)

# Wrap the file store so keys go through `key_encoder` and values are pickled
# and unpickled by `value_serializer` / `value_deserializer`.
encoder_store = EncoderBackedStore(
    store=store,
    key_encoder=key_encoder,
    value_serializer=value_serializer,
    value_deserializer=value_deserializer,
)


# Splitter handed to the retriever as `child_splitter` (chunk_size=700,
# chunk_overlap=50).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
passage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large-passage")


# Cache document embeddings in `store`, namespaced by the embedding model name.
# NOTE(review): this is the same `store` that backs `encoder_store`, so
# `store.yield_keys()` presumably lists both cached embeddings and stored
# parent documents — confirm that sharing is intended.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=passage_embeddings,
    document_embedding_cache=store,
    namespace=passage_embeddings.model,
)
# NOTE(review): f-string without placeholders; repeats the literal held in
# `proj_name` instead of reusing it, so the two can drift apart.
DB_PATH = f"/Users/youngseoklee/Desktop/workplace/MacroAgent-withRAG/db/2024.09.14_for_parentdocument_store"
# Chroma vector store persisted at DB_PATH, embedding through the cached embedder.
db = Chroma(persist_directory=DB_PATH, embedding_function=cached_embedder)


# Retriever wired to the Chroma vector store, the encoder-backed docstore and
# the child splitter. No parent splitter is passed.
Parent_retriever = ParentDocumentRetriever(
    vectorstore=db,
    docstore=encoder_store,
    child_splitter=child_splitter,
)


# `all_documents` comes from the first cell. The documents are passed through
# `filter_complex_metadata` before indexing (presumably to drop metadata
# values the vector store cannot hold — confirm).
filtered_all_documents = filter_complex_metadata(documents=all_documents)

# Index the filtered documents and also write them to the docstore.
# NOTE(review): with ids=None the ids are presumably generated anew on each
# call, so re-running this cell against the persisted stores may add
# duplicates — confirm.
Parent_retriever.add_documents(
    documents=filtered_all_documents, ids=None, add_to_docstore=True
)

In [None]:
# Number of keys currently held by the file store, counted without building
# an intermediate list.
sum(1 for _key in store.yield_keys())

In [None]:
# Smoke-test the retriever with a sample question
# (Korean for "Tell me about the outlook for the yen").
sample_query = "엔화의 전망에 대해 알려줘"
retrieved_docs = Parent_retriever.invoke(sample_query)
retrieved_docs

In [6]:
from datasets import Dataset
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
import pandas as pd
import sys
import os
import json
from langchain_anthropic import ChatAnthropic

# Put the parent of the current working directory on the import path; the
# `DataProcessing` import below is presumably resolved from there.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guarded so that re-running this cell does not stack duplicate entries
# onto sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from DataProcessing.utils import load_yaml


# Evaluation set read from an Excel sheet; the `question` and `ground_truth`
# columns are used below.
path = "./data/custom_testdataset.xlsx"
# testset = pd.read_csv(path)
testset = pd.read_excel(path)

questions = testset["question"].to_list()
ground_truth = testset["ground_truth"].to_list()

# Column-oriented results: one entry per question in every list.
data = {"question": [], "answer": [], "contexts": [], "ground_truth": ground_truth}

prompt_template = load_yaml("../prompts/Retriever._prompt.yaml")["prompt"]
prompt = PromptTemplate.from_template(prompt_template)
# llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.5)
llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0.5)

# Generation half of the pipeline: takes {"context", "question"} and returns
# the model's answer as a string.
answer_chain = prompt | llm | StrOutputParser()

# End-to-end chain (retrieve, then generate), kept for interactive use.
rag_chain = (
    {"context": Parent_retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


for query in questions:
    # Retrieve once and reuse the result for both the answer and the recorded
    # contexts. Calling `rag_chain` and then the retriever again would run
    # every retrieval twice, and the recorded contexts would not be guaranteed
    # to be the ones the answer was generated from.
    retrieved = Parent_retriever.invoke(query)
    data["question"].append(query)
    data["answer"].append(
        answer_chain.invoke({"context": retrieved, "question": query})
    )
    data["contexts"].append([doc.page_content for doc in retrieved])

# Persist the collected results; `path` now refers to the output file.
path = "./data/customtestset_sonnet_parentdocument.json"
with open(path, "w", encoding="utf-8") as file:
    json.dump(data, file)