In [3]:
from langchain.document_loaders import PyPDFLoader

load_target = "The_Adventures_of_Tom_Sawyer.pdf"

loader = PyPDFLoader(load_target)

documents = loader.load()

print(documents[5].page_content[:5000])

Chapter 1    The Fence 
 
Tom Sawyer lived with his aunt because his mother and 
father were dead. Tom didn’t like going to school, and he didn’t like working. He liked playing and having adventures. One Friday, he didn’t go to school—he went to the river. 
Aunt Polly was angry. “You’re a bad boy!” she said. 
“Tomorrow you can’t play with your friends because you didn’t go to school today. Tomorrow you’re going to work for me. You can paint the fence.” 
Saturday morning, Tom was not happy, but he started to 
paint the fence. His friend Jim was in the street. 
Tom asked him, “Do you want to paint?” 
Jim said, “No, I can’t. I’m going to get water.” 
Then Ben came to Tom’s house. He watched Tom and 
said, “I’m going to swim today. You can’t swim because you’re working.” 
Tom said, “This isn’t work. I like painting.” 
“Can I paint, too?” Ben asked. 
“No, you can’t,” Tom answered. “Aunt Polly asked me 
because I’m a very good painter.” 
Ben said, “I’m a good painter, too. Please, can I pain

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

n_chunk_size = 1000
n_chunk_overlap = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = n_chunk_size,
    chunk_overlap = n_chunk_overlap
)
splits = text_splitter.split_documents(documents=documents)
print(f"number of chunks: {len(splits)}")

number of chunks: 56


In [None]:
# from langchain.vectorstores import FAISS
# from langchain_openai import OpenAIEmbeddings

# embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
# vectorstore = FAISS.from_documents(document, embeddings)

from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

In [14]:
text = "마을 무덤에 있던 남자를 죽인 사람은 누구니?"
text_embedding = embeddings.embed_query(text)  
print(text_embedding)
print(len(text_embedding))

[0.007295262534171343, -0.02066235989332199, 0.018254339694976807, -0.02288912981748581, -0.01965254358947277, 0.031148893758654594, -0.025271255522966385, 0.01723157800734043, -0.018526213243603706, -0.0048678237944841385, -0.004935792181640863, 0.00827271118760109, 0.0032527679577469826, 0.0045700580812990665, 0.011140326038002968, 0.023911889642477036, 0.024831080809235573, -0.013056384399533272, 0.021154319867491722, -0.04078097268939018, 0.012952813878655434, 0.0018901657313108444, -0.005738465115427971, 0.0031394874677062035, 0.009120697155594826, 0.039020270109176636, 0.019031120464205742, -0.018060144037008286, 0.014771774411201477, -0.0048775337636470795, 0.02174985222518444, -0.00538891414180398, -0.010538320988416672, -0.024947598576545715, -0.03275424242019653, 0.030475685372948647, 0.007949053309857845, -0.0019969730637967587, -0.0010098145576193929, -0.001849708380177617, 0.00929547194391489, -0.012422013096511364, 0.01092671137303114, -0.03448905050754547, 0.004155775066

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

chat = ChatOpenAI(
    temperature=0.5,
    max_completion_tokens=2048,
    model="gpt-4o-mini"
)

retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=chat
)

query = "마을 무덤에 있던 남자를 죽인 사람은?"
result = qa_chain.invoke({"query": query})
print(f"query: {query}")
print(f"result : {result['result']}")

query: 마을 무덤에 있던 남자를 죽인 사람은?
result : 무덤에 있던 남자를 죽인 사람은 인준 조(Injun Joe)입니다.


In [None]:
# 통합 코드

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA


raw_data = "The_Adventures_of_Tom_Sawyer.pdf"
n_chunk_size = 200
n_chunk_overlap = 50
select_model = "gpt-4o-mini"
max_tokens = 2048
select_temperature = 0.5
select_chaintype = "stuff"
query = "마을 무덤에 있던 남자를 죽인 사람은?"


loader = PyPDFLoader(raw_data)
documents = loader.load()


r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=n_chunk_size, chunk_overlap=n_chunk_overlap
)
splits = r_splitter.split_documents(documents=documents)


embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)
retriever = vectorstore.as_retriever()


chat = ChatOpenAI(
    temperature=select_temperature,
    model=select_model,
    max_completion_tokens=max_tokens,
)


qa_chain = RetrievalQA.from_chain_type(
    chain_type=select_chaintype,
    llm=chat,
    retriever=retriever,
    return_source_documents=True,
)

result = qa_chain.invoke({"query": query})
print(f"query: {query}")
print(f"result : {result['result']}")
print(f'source documents num: {len(result["source_documents"])}')


query: 마을 무덤에 있던 남자를 죽인 사람은?
result : 무덤에서 남자를 죽인 사람은 인준 조(Injun Joe)입니다.
source documents num: 4
