In [None]:
%pip install PyPDF2

In [1]:
from dotenv import load_dotenv

# Load the .env file so its variables are available before the later cells run.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI-backed
# classes used further down in the notebook — confirm.
load_dotenv()

True

In [2]:
from PyPDF2 import PdfReader

# Extract the text content of a PDF document.
def get_pdf_text(pdf):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf: Whatever ``PdfReader`` accepts as its source; in this notebook
            the caller passes the object returned by ``st.file_uploader``.

    Returns:
        str: The page texts joined in page order with no separator between
        pages. A page whose extraction yields nothing contributes an empty
        string, so the result is ``""`` when no page has extractable text.
    """
    pdf_reader = PdfReader(pdf)
    # `or ""` keeps a page with no extractable text (a None/empty result, e.g.
    # an image-only page) from aborting the whole document with a TypeError.
    # Joining once also avoids rebuilding the string on every page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


In [3]:
from langchain_text_splitters import CharacterTextSplitter

# Split text into chunks using CharacterTextSplitter.
def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into chunks with a newline-separated character splitter.

    Args:
        text: The text to split.
        chunk_size: Chunk size handed to the splitter (measured with ``len``).
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    splitter_options = dict(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return CharacterTextSplitter(**splitter_options).split_text(text)

In [4]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Create embeddings for the given text chunks and build a FAISS vector store.
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from text chunks using OpenAI embeddings.

    Args:
        text_chunks: The texts to embed and index.

    Returns:
        The object returned by ``FAISS.from_texts`` for these chunks.
    """
    embedding_model = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embedding_model)

In [5]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        str: All page contents in order, joined with two newlines.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

def build_chain(retriever, llm):
    """Compose the retrieval chain: retrieve, fill the prompt, call the LLM.

    Args:
        retriever: Object piped into ``format_docs`` to produce the prompt's
            ``context`` value.
        llm: Model object the filled prompt is piped into.

    Returns:
        The composed chain ending in ``StrOutputParser()``.
    """
    # Pull the prompt from the hub.
    rag_prompt = hub.pull("rlm/rag-prompt")

    # Input mapping: "context" comes from the retriever piped through
    # format_docs, "question" is the chain input passed through unchanged.
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }

    # Assemble the chain: inputs -> prompt -> llm -> string output.
    return chain_inputs | rag_prompt | llm | StrOutputParser()

In [None]:
import streamlit as st
from langchain_openai import ChatOpenAI

# Streamlit page: upload a PDF, index its text, and show an LLM summary.
st.title("PDF 요약하기")
st.divider()

pdf = st.file_uploader("PDF파일을 업로드해주세요", type="pdf")

if pdf is not None:
    text = get_pdf_text(pdf)
    chunks = split_text(text)

    if not chunks:
        # Nothing was extracted (e.g. a scanned, image-only PDF). Building a
        # vector store from an empty chunk list is expected to fail with an
        # obscure error, so tell the user instead of showing a traceback.
        st.warning("PDF에서 텍스트를 추출할 수 없습니다. 텍스트가 포함된 PDF를 업로드해주세요.")
    else:
        vectorstore = get_vectorstore(chunks)

        # Fixed request asking the LLM to summarise the PDF in about 3-5
        # sentences. It is a constant, so no truthiness check is needed.
        query = "업로드된 PDF 파일의 내용을 약 3~5문장으로 요약해주세요."

        llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0.1)
        retriever = vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 3},
        )

        # Build the chain and run the summary request through it.
        chain = build_chain(retriever, llm)
        response = chain.invoke(query)

        st.subheader("요약 결과")
        st.write(response)
