# 1. Load the document.
# 2. Split the document into chunks.
#     - Passing the whole document exceeds the token limit, so no answer can be generated.
#     - Generation time also grows with the length of the input.
# 3. Embed and save to the vector database.
# 4. Query: perform similarity search on the vector database.
# 5. Pass the documents from the similarity search to the LLM.

In [None]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the Word document, then cut it into overlapping chunks.
loader = Docx2txtLoader("./tax.docx")

# chunk_size / chunk_overlap control how large each chunk may be and how much
# text neighbouring chunks share.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)

# Explicit two-step form: load the document(s), then split them.
document_lists = text_splitter.split_documents(loader.load())

# Number of chunks produced (shown as the cell output).
len(document_lists)

In [None]:
# Show the split chunks as the cell output for inspection.
document_lists

In [None]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load variables from a .env file into the process environment
# (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()

# Embedding model handed to the vector store as its embedding function.
embedding = OpenAIEmbeddings(model='text-embedding-3-large')

In [None]:
from langchain_chroma import Chroma

# Ingestion step: embeds `document_lists` and stores them in the 'chroma_tax'
# collection under ./chroma_db. Presumably run once to populate the store and
# then commented out so the documents are not embedded again — confirm.
# database = Chroma.from_documents(documents=document_lists, embedding=embedding, collection_name='chroma_tax', persist_directory='./chroma_db')
# Open the 'chroma_tax' collection from ./chroma_db with the same embedding function.
database = Chroma(collection_name='chroma_tax', persist_directory='./chroma_db', embedding_function=embedding)

In [None]:
# Question (Korean): "How much income tax does an employee with an annual
# salary of 50 million won have to pay?"
query = '연봉이 5000만원인 직장인은 소득세로 얼마를 내야하나요?'
# The number of retrieved documents was meant to be reduced because of the
# context-length limit.
# NOTE(review): no `k` is passed below, so the store's default result count is
# used — confirm whether an explicit `k` was intended.
# (`returied_docs` is a misspelling of "retrieved_docs"; the name is kept
# because other cells refer to it.)
returied_docs = database.similarity_search(query)

In [None]:
# Show the documents returned by the similarity search as the cell output.
returied_docs

In [None]:
from langchain_openai import ChatOpenAI

# Alternative kept for reference: the same client pointed at a model served on
# localhost through an OpenAI-style `/v1` endpoint.
# llm = ChatOpenAI(base_url="http://localhost:8000/v1", api_key="dummy", model="yanolja/EEVE-Korean-Instruct-2.8B-v1.0", temperature=0.7)
# Chat model created with the library's default settings.
llm = ChatOpenAI()

In [None]:
# Build the context from the text of each retrieved document only.
# Interpolating the list itself would put the repr of every Document object
# (metadata, escaped newlines) into the prompt, which wastes tokens and is
# harder for the model to read.
context_text = "\n\n".join(doc.page_content for doc in returied_docs)

# Prompt text (Korean): "You are a Korean income-tax expert. Answer the user's
# question with reference to [Context]." The context line is labelled
# "income-tax related statutes".
prompt = f"""[identity]
- 당신은 한국 소득세 전문가입니다.
- [Context]를 참고해서 사용자의 질문에 답변해주세요.

[Context]
- 소득세 관련 법령: {context_text}

Question: {query}
"""

In [None]:
# Send the prompt to the chat model and keep the returned message.
ai_msg = llm.invoke(prompt)

In [None]:
# Show the `content` of the model's reply as the cell output.
ai_msg.content

In [None]:
from langchain import hub

# Pull the "rlm/rag-prompt" prompt from the hub.
# Note: this rebinds `prompt`, replacing the f-string prompt built in an earlier cell.
prompt = hub.pull("rlm/rag-prompt")

In [None]:
# Show the current `prompt` object as the cell output.
prompt

In [None]:
# RetrievalQA must be imported explicitly: it is the class used below, and
# without this import the cell fails with NameError.
from langchain.chains import RetrievalQA, create_retrieval_chain

# Question-answering chain: documents come from the vector store's retriever
# and `prompt` is handed to the underlying chain through `chain_type_kwargs`.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=database.as_retriever(),
    chain_type_kwargs={"prompt": prompt},
)

In [None]:
# Run the chain through `invoke`, the same entry point used for the LLM call;
# calling the chain object directly is the deprecated path. The question is
# passed under the "query" key.
ai_msg = qa_chain.invoke({"query": query})

In [None]:
# Show the chain's result as the cell output.
ai_msg