In [None]:
!pip install pypdf

In [None]:

# pdf -> text

from langchain_community.document_loaders import PyPDFLoader

# Source document path, as passed to the loader.
PDF_PATH = "data.pdf"

# Keep the loader around: the indexing cell further down reuses it.
loader = PyPDFLoader(PDF_PATH)

# Load the PDF and split it into documents. No splitter is passed, so the
# loader's own default is used.
pages = loader.load_and_split()

# Sanity check: print the first resulting document.
first_page = pages[0]
print(first_page)

In [None]:
# text -> snippets

from langchain.text_splitter import CharacterTextSplitter

# Chunking parameters handed to the splitter.
SNIPPET_SIZE = 100
SNIPPET_OVERLAP = 0

text_splitter = CharacterTextSplitter(
    chunk_size=SNIPPET_SIZE,
    chunk_overlap=SNIPPET_OVERLAP,
)

# Break the loaded pages into smaller snippet documents.
# NOTE(review): none of the later cells visible in this notebook reference
# `docs`; the index is built from `loader` directly. Confirm whether these
# snippets were meant to be indexed.
docs = text_splitter.split_documents(pages)

# Number of snippets produced (displayed as the cell output).
len(docs)

In [None]:
# embedding
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OpenAIEmbeddings

# Embedding model used to build the vector index.
# Alternative backend: embeddings = OpenAIEmbeddings()
#
# The model name is pinned explicitly instead of relying on the library
# default, so a vector store saved from this notebook can later be reloaded
# with exactly the same embedding model.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import FAISS

# Configure an index creator that embeds text with `embeddings` and keeps the
# vectors in a FAISS vector store.
index_creator = VectorstoreIndexCreator(vectorstore_cls=FAISS, embedding=embeddings)

# Build the index straight from the PDF loader.
# NOTE(review): this reads from `loader`, not from the `docs` snippets created
# in the earlier cell. Confirm that is intended.
index = index_creator.from_loaders([loader])

In [None]:
# save vector db

# Persist the index's vector store to disk under "faiss-nj-pdf-data".
vector_store = index.vectorstore
vector_store.save_local("faiss-nj-pdf-data")

# To restore it later with the same embedding model:
# db = FAISS.load_local("faiss-nj-pdf-data", embeddings=embeddings)

In [None]:
# query
from langchain.chat_models import ChatOpenAI

# Chat model handed to `index.query` in the cells below.
chat = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.9,
)

In [None]:
# Ask the index (in Korean): "Where is the project location?"
location_question = "프로젝트 장소가 어디지?"
index.query(location_question, llm=chat, verbose=True)

In [None]:
# Ask the index (in Korean): "What is the hourly pay?"
hourly_pay_question = "시간당 수당은?"
index.query(hourly_pay_question, llm=chat, verbose=True)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.chat_models import ChatOpenAI

# Expose the index's vector store as a retriever for the chain below.
retriever = index.vectorstore.as_retriever()

# Prompt text is in Korean. It says: "Answer the question based only on the
# following passage: {context}  Question: {question}".
template = """다음 지문에만 근거해서 질문에 답하세요:
{context}

질문: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# Fixed: the original read `model = model = ChatOpenAI(...)`, a redundant
# chained self-assignment.
model = ChatOpenAI(model="gpt-3.5-turbo")
output_parser = StrOutputParser()

# RunnableParallel runs several runnables in parallel. Here the retriever
# fills "context" while RunnablePassthrough forwards the incoming value
# unchanged as "question".
setup_and_retrieval = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
)
chain = setup_and_retrieval | prompt | model | output_parser

# retrieval | prompt | model | output: the basic RAG pipeline.

In [None]:
# Run the RAG chain with the Korean question "How much can one earn in a day?"
daily_income_question = "하루에 얼마를 벌수 있을까?"
chain.invoke(daily_income_question)