# 1. Load the document.
# 2. Split the document into chunks.
#     - If the whole document is passed at once, the token limit is exceeded and no answer can be generated.
#     - Generation time grows with the length of the text passed to the model.
# 3. Embed and save to the vector database.
# 4. Query: perform similarity search on the vector database.
# 5. Pass the documents from the similarity search to the LLM.

In [None]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split into chunks of at most 1500 units (characters with the splitter's
# default length function), with a 200-unit overlap between neighbouring
# chunks so some context is shared across chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)

loader = Docx2txtLoader("./tax.docx")

# Load first, then split explicitly. This yields the same chunks as
# `loader.load_and_split(text_splitter=...)`, but avoids that helper, which
# the LangChain loader base class documents as deprecated.
document_lists = text_splitter.split_documents(loader.load())

# Last expression of the cell: shows how many chunks were produced.
len(document_lists)

In [None]:
# Display the chunks to inspect how the document was split.
document_lists

In [None]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load environment variables from a local .env file into the process.
load_dotenv()

# Embedding model used both to index the chunks and to embed queries.
# NOTE(review): presumably the OpenAI API key is picked up from the
# environment loaded above — confirm OPENAI_API_KEY is set in .env.
# NOTE(review): the target vector index must have the same dimension as this
# model's output vectors — verify against the index configuration.
embedding = OpenAIEmbeddings(model='text-embedding-3-large')

In [None]:
import hashlib
import os

from dotenv import load_dotenv
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

# Make PINECONE_API_KEY (and any other keys in .env) available via os.getenv.
load_dotenv()

index_name = 'tax-index'
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# NOTE(review): `pc` is not referenced again in the visible cells; it is kept
# because later cells may rely on it. The vector store call below does not
# receive it — presumably it reads the API key from the environment itself.
pc = Pinecone(api_key=pinecone_api_key)

# Deterministic ID per chunk, derived from its source, position and content.
# Without explicit IDs every run of this cell stores the same chunks again
# under fresh IDs, so the index fills up with duplicates and similarity search
# returns the same passage several times. With stable IDs a re-run overwrites
# the existing vectors instead. Including the position keeps IDs unique even
# when two chunks have identical text.
chunk_ids = [
    hashlib.sha256(
        f"{doc.metadata.get('source', '')}:{position}:{doc.page_content}".encode("utf-8")
    ).hexdigest()
    for position, doc in enumerate(document_lists)
]

# Embed every chunk and upsert it into the Pinecone index named above.
database = PineconeVectorStore.from_documents(
    documents=document_lists,
    embedding=embedding,
    index_name=index_name,
    ids=chunk_ids,
)

In [None]:
# User question (Korean): "How much income tax does an employee with an
# annual salary of 50 million won have to pay?"
query = '연봉이 5000만원인 직장인은 소득세로 얼마를 내야하나요?'
# Retrieve the stored chunks most similar to the question from the vector store.
retrieved_docs = database.similarity_search(query)

In [None]:
# Display the retrieved chunks to check that they are relevant to the question.
retrieved_docs

In [None]:
from langchain_openai import ChatOpenAI

# Chat model that will generate the final answer.
# NOTE(review): no model name is passed, so the library default is used —
# consider pinning a model so results do not change with library upgrades.
llm = ChatOpenAI()

In [None]:
from langchain import hub

# Fetch the shared "rlm/rag-prompt" prompt via the LangChain hub helper.
# NOTE(review): presumably this performs a network request — confirm it is
# acceptable wherever this notebook is run.
prompt = hub.pull("rlm/rag-prompt")

In [None]:
from langchain.chains import RetrievalQA

# Build the question-answering chain: the vector store is exposed as a
# retriever, and the hub prompt is handed to the underlying
# document-combining chain through `chain_type_kwargs`.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=database.as_retriever(),
    chain_type_kwargs=dict(prompt=prompt),
)

In [None]:
# Run the chain on the question; the chain is called with a dict keyed by "query".
# NOTE(review): despite the name, the result is presumably a dict rather than
# a message object — verify before accessing fields on it.
ai_msg = qa_chain.invoke({"query": query})

In [None]:
# Display the chain's output.
ai_msg