In [2]:
from dotenv import load_dotenv
from langchain_teddynote.community.pinecone import init_pinecone_index
from langchain_upstage.embeddings import UpstageEmbeddings
from langchain_teddynote.community.pinecone import PineconeKiwiHybridRetriever
from langchain_teddynote.korean import stopwords
import os

# Pull settings from a local .env file into the process environment
# (PINECONE_API_KEY is read from os.environ below).
load_dotenv()


# Keyword arguments for the Pinecone index initialisation, gathered in one place.
index_settings = dict(
    index_name="globalmacro-chatbot",
    namespace="financical-data-00",
    api_key=os.environ["PINECONE_API_KEY"],
    sparse_encoder_path="../data/sparse_encoder_01.pkl",
    stopwords=stopwords(),
    tokenizer="kiwi",
    embeddings=UpstageEmbeddings(model="solar-embedding-1-large-query"),
    top_k=10,
    # alpha balances dense vs. sparse scores. Per the original note, alpha=0.75
    # would mean 0.75 dense embedding / 0.25 sparse embedding, so 0.4 here
    # presumably weights dense at 0.4 and sparse at 0.6.
    alpha=0.4,
)
pinecone_params = init_pinecone_index(**index_settings)


# Hybrid retriever built from whatever init_pinecone_index returned.
pinecone_retriever = PineconeKiwiHybridRetriever(**pinecone_params)

[init_pinecone_index]
{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {'financical-data-00': {'vector_count': 2012}},
 'total_vector_count': 2012}


# Make test data: generate answers and contexts for each test question


In [5]:
from datasets import Dataset
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import PromptTemplate
import pandas as pd
import sys
import os
import json

# Make the parent of the current working directory importable so the
# project-local DataProcessing package can be resolved from this notebook.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(parent_dir)
from DataProcessing.utils import load_yaml


# Load the hand-written test set; it must provide "question" and
# "ground_truth" columns.
# NOTE(review): this path is relative to "./data" while the sparse encoder and
# the prompt are loaded from "../..." — confirm the notebook's working directory.
path = "./data/custom_testdataset.xlsx"
testset = pd.read_excel(path)

questions = testset["question"].to_list()
ground_truth = testset["ground_truth"].to_list()

# Column-oriented record of the run: one entry per question in every list.
data = {"question": [], "answer": [], "contexts": [], "ground_truth": ground_truth}

prompt_template = load_yaml("../prompts/Retriever._prompt.yaml")["prompt"]
prompt = PromptTemplate.from_template(prompt_template)
# To compare models, swap in: ChatOpenAI(model="gpt-4o-mini", temperature=0.5)
llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0.5)

# Generation tail shared by the full RAG chain and the loop below. It expects
# a mapping with "context" and "question" keys.
answer_chain = prompt | llm | StrOutputParser()

# Full chain (retrieve, then generate), kept for callers that pass only a query.
rag_chain = (
    {"context": pinecone_retriever, "question": RunnablePassthrough()}
    | answer_chain
)


for query in questions:
    # Retrieve once and reuse the documents for both the answer and the stored
    # contexts. This halves the retrieval calls and guarantees the saved
    # contexts are exactly what the answer was generated from.
    docs = pinecone_retriever.invoke(query)
    data["question"].append(query)
    data["answer"].append(answer_chain.invoke({"context": docs, "question": query}))
    data["contexts"].append([doc.page_content for doc in docs])

# Persist the collected records. json.dump escapes non-ASCII by default, so the
# bytes written are unchanged; the explicit encoding just removes the
# dependence on the platform default.
path = "./data/customtestset_sonnet_bm25.json"
with open(path, "w", encoding="utf-8") as file:
    json.dump(data, file)

# Conversation


In [None]:
# Single-query demo: fetch the retrieved passages and the generated answer.
# The query asks (in Korean) for an analysis of the interest-rate trend over
# the past year as of now.
query = "현재를 기준으로 최근 1년동안 금리의 추세를 분석해주세요"
context = pinecone_retriever.invoke(query)
result = rag_chain.invoke(query)

# A bare `context` expression is only rendered when it is the last statement
# of a cell, so print the retrieved passages explicitly before the answer.
for rank, doc in enumerate(context, start=1):
    print(f"[{rank}] {doc.page_content}\n")
print(result)