In [1]:
from dotenv import load_dotenv
import os

# Load the variables declared in the .env file before reading any setting.
load_dotenv()

# Each constant is None when its variable is missing from the environment.
API_KEY, ACCESS_KEY, SERVER = (
    os.getenv("API_KEY"),
    os.getenv("ACCESS_KEY"),
    os.getenv("SERVER"),
)

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# Split the source file on newlines into chunks sized by tiktoken tokens.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600,
    chunk_overlap=100,
    separator="\n",
)
loader = UnstructuredFileLoader("./files/subway.txt")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with a file-backed store rooted at ./.cache/.
cache_dir = LocalFileStore("./.cache/")
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

# Index the chunks in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=docs, embedding=cached_embeddings)

In [8]:
# Look up the chunks the vector store ranks closest to the query and dump them.
query = "문서의 주제가 무엇인가요?"
results = vectorstore.similarity_search(query)
print(results)

[Document(page_content='"2412","우이신설경전철","408","우이신설선","4709","북한산보국문"\n"2412","우이신설경전철","408","우이신설선","4710","정릉"\n"2412","우이신설경전철","408","우이신설선","4711","성신여대입구(돈암)"\n"2412","우이신설경전철","408","우이신설선","4712","보문"\n"2412","우이신설경전철","408","우이신설선","4713","신설동"\n"2127","국토교통부(한국철도공사)","114","서해선","4804","소사"\n"2127","국토교통부(한국철도공사)","114","서해선","4805","소새울"\n"2127","국토교통부(한국철도공사)","114","서해선","4806","시흥대야"\n"2127","국토교통부(한국철도공사)","114","서해선","4807","신천"', metadata={'source': './files/subway.txt'}), Document(page_content='"2111","한국철도공사","102","경인선","1803","역곡"\n"2111","한국철도공사","102","경인선","1804","부천"\n"2111","한국철도공사","102","경인선","1805","송내"\n"2111","한국철도공사","102","경인선","1806","부평"\n"2111","한국철도공사","102","경인선","1807","백운"\n"2111","한국철도공사","102","경인선","1808","동암"\n"2111","한국철도공사","102","경인선","1809","주안"\n"2111","한국철도공사","102","경인선","1810","제물포"\n"2111","한국철도공사","102","경인선","1811","동인천"\n"2111","한국철도공사","102","경인선","1812","인천"\n"2111","한국철도공사","102","경인선","1813","구일"\n"2111","한국철도공사","102","경인선"

In [6]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Chat model that answers the questions (low sampling temperature).
llm = ChatOpenAI(temperature=0.1)

# Split the source file on newlines into chunks sized by tiktoken tokens.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600,
    chunk_overlap=100,
    separator="\n",
)
loader = UnstructuredFileLoader("./files/subway.txt")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with a file-backed store rooted at ./.cache/.
cache_dir = LocalFileStore("./.cache/")
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

# Index the chunks in an in-memory FAISS vector store.
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)


# Retriever definition: the vector store is used as the retriever.
retriever = vectorstore.as_retriever()
# Backward-compatible alias for the earlier misspelled name.
retriver = retriever


def format_docs(documents):
    """Join the text of the retrieved documents into a single context string.

    Args:
        documents: iterable of objects exposing a ``page_content`` string
            (the documents returned by the retriever).

    Returns:
        The ``page_content`` values separated by blank lines; an empty string
        when ``documents`` is empty.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# {context} receives the retrieved text, {question} receives the text passed
# to chain.invoke() unchanged.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """당신은 훌륭한 어시스턴트입니다.
            이어지는 context만 사용해서 질문에 답변해주세요.
            만약 정답을 모를 경우에는, 답을 지어내지 말아주세요.:\n\n{context}""",
        ),
        ("human", "{question}"),
    ]
)

chain = (
    {
        # Piping into a plain function coerces it to a runnable, so the prompt
        # gets clean text instead of the repr of a list of Document objects.
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

chain.invoke("문서의 내용을 30글자 내로 요약해주세요")

AIMessage(content='서울과 인천 지하철 노선과 역 정보.')

In [9]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-document step and the final answer step.
llm = ChatOpenAI(temperature=0.1)

# Split the source file on newlines into chunks sized by tiktoken tokens.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600,
    chunk_overlap=100,
    separator="\n",
)
loader = UnstructuredFileLoader("./files/subway.txt")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with a file-backed store rooted at ./.cache/.
cache_dir = LocalFileStore("./.cache/")
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

# Index the chunks in FAISS and expose the index through the retriever interface.
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)
retriever = vectorstore.as_retriever()


# System message for the per-document step: {context} is filled with the text
# of one document, and the model is asked to return the relevant passage or ''.
map_doc_system_message = (
    "system",
    """
            주어진 긴 문서를 사용하여, 질문을 답하기 위해 관련이 있는 텍스트를 찾아주세요. 
            관련 있는 텍스트를 반환해주세요. 만약 관련 있는 텍스트가 없으면 ''를 반환해주세요.
            -------
            {context}
            """,
)
map_doc_human_message = ("human", "{question}")

map_doc_prompt = ChatPromptTemplate.from_messages(
    [map_doc_system_message, map_doc_human_message]
)

# Per-document chain: fill the prompt, then send it to the chat model.
map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    """Run the per-document chain on every document and merge the extracts.

    Each document's text is sent to ``map_doc_chain`` together with the
    question. The map prompt asks the model to answer ``''`` when a document
    holds nothing relevant, so responses that are blank or consist only of
    quote characters are dropped instead of being passed on as context.

    Args:
        inputs: mapping with a ``"documents"`` entry (iterable of objects
            exposing ``page_content``) and a ``"question"`` entry.

    Returns:
        The remaining extracts, whitespace-trimmed and separated by blank
        lines; an empty string when nothing relevant was found.

    Raises:
        KeyError: if ``"documents"`` or ``"question"`` is missing.
    """
    documents = inputs["documents"]
    question = inputs["question"]

    extracts = []
    for doc in documents:
        response = map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        )
        text = response.content.strip()
        # Skip the "nothing relevant" marker: empty text or bare quotes.
        if text.strip("'\"").strip():
            extracts.append(text)
    return "\n\n".join(extracts)


# The incoming question is sent to the retriever (to fetch documents) and also
# forwarded unchanged; map_docs then receives both under these keys.
map_inputs = {"documents": retriever, "question": RunnablePassthrough()}
map_chain = map_inputs | RunnableLambda(map_docs)

# System message for the final answer step: {context} is filled with the
# merged output of the per-document step.
final_system_message = (
    "system",
    """
            긴 문서 하나와 질문이 주어지면, 답안을 만들어 주세요. 
            답을 모를 경우, 모른다고 답해주세요. 답을 지어내지 말아주세요. 
            ------
            {context}
            """,
)
final_human_message = ("human", "{question}")

final_prompt = ChatPromptTemplate.from_messages(
    [final_system_message, final_human_message]
)

# Full pipeline: build the context with map_chain, forward the question as-is,
# fill the final prompt, and ask the chat model.
final_inputs = {"context": map_chain, "question": RunnablePassthrough()}
chain = final_inputs | final_prompt | llm

chain.invoke("문서의 주제가 무엇인가요")

AIMessage(content='한국의 철도 및 지하철 노선에 관한 정보입니다.')

In [13]:
# NOTE(review): the answer is built only from the documents the retriever
# returns, so a whole-file counting question may not be answerable — confirm.
most_stations_question = "역의 개수가 가장 많은 호선은 몇호선인가요?"
chain.invoke(most_stations_question)

AIMessage(content='분당선입니다.')

In [16]:
# Same question as the preceding cell; the recorded outputs of the two runs
# differ ('분당선입니다.' vs '7호선'), so the answer is not stable across runs.
most_stations_question = "역의 개수가 가장 많은 호선은 몇호선인가요?"
chain.invoke(most_stations_question)

AIMessage(content='7호선')

In [18]:
# NOTE(review): the count in the answer presumably reflects only the retrieved
# documents, not the whole file — verify against the source data.
bundang_count_question = "분당선에는 몇개의 역이 있나요?"
chain.invoke(bundang_count_question)

AIMessage(content='"분당선"에는 총 7개의 역이 있습니다.')