In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# Load -> Transform -> Embed -> Store -> Retrieve

cache_dir = LocalFileStore("./.cache/")

# Method 1: RecursiveCharacterTextSplitter
# splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=200, # Should be small enough, but not to distort the meaning or context
#     chunk_overlap=True,
# )

# Method 2: CharacterTextSplitter
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=200, # Should be small enough, but not to distort the meaning or context
    chunk_overlap=True,
)

loader = TextLoader("./files/promises.txt", encoding="utf-8")
docs = loader.load_and_split(text_splitter=splitter)
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)
vectorstore = Chroma.from_documents(docs, embeddings)

Created a chunk of size 427, which is longer than the specified 200
Created a chunk of size 353, which is longer than the specified 200
Created a chunk of size 355, which is longer than the specified 200
Created a chunk of size 208, which is longer than the specified 200
Created a chunk of size 224, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 201, which is longer than the specified 200
Created a chunk of size 243, which is longer than the specified 200
Created a chunk of size 288, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 229, which is longer than the specified 200
Created a chunk of size 373, which is longer than the specified 200
Created a chunk of size 236, which is longer tha

Created a chunk of size 222, which is longer than the specified 200
Created a chunk of size 241, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 205, which is longer than the specified 200
Created a chunk of size 405, which is longer than the specified 200
Created a chunk of size 420, which is longer than the specified 200
Created a chunk of size 308, which is longer than the specified 200
Created a chunk of size 210, which is longer than the specified 200
Created a chunk of size 228, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 222, which is longer than the specified 200
Created a chunk of size 235, which is longer than the specified 200
Created a chunk of size 215, which is longer than the specified 200
Created a chunk of size 238, which is longer than the specified 200
Created a chunk of size 531, which is longer tha

In [17]:
# results = vectorstore.similarity_search("what does the party called 더불어민주당 say?")
# print(results)

[Document(page_content='● 더불어민주당: 지배권을 가진 대주주와 소액주주 사이 힘의 불균형이 너무 커. 소액주주에 대한 차별을 시정해 권리를 강화해야 해. 회사가 인수합병할 때 소액주주의 피해를 최소화하는 의무공개매수제, 주주가 자신의 의결권을 잘 사용하도록 절차를 간소화한 전자투표제를 도입할게.', metadata={'source': './files/promises.txt'}), Document(page_content='“부동산 가격, 오를 거다 vs. 내릴 거다”, “전세사기, 또 벌어졌다” 이런 뉴스 들으면 “난 대체 어디서 살아야 해!” 싶었던 뉴니커 주목! 이번 총선에 나서는 정당들이 “내 집 마련, 우리가 도울게!” 라면서 앞다투어 주거 공약을 내놨어요. 집을 어떻게 공급할 생각인지, 주거비 지원은 얼마나 해 줄 건지, 전·월세에 사는 사람들을 위한 공약은 뭐가 있는지 싹 정리했어요.', metadata={'source': './files/promises.txt'}), Document(page_content='*\n정당 공약집과 보도자료, 공식 답변 등을 기초 자료로 작성되었으며, 선거 전까지 지속적으로 업데이트 됩니다.\n01\n대학이 뭐길래 🧑\u200d🎓\n배경 알아보기', metadata={'source': './files/promises.txt'}), Document(page_content='*\n정당 공약집과 보도자료, 공식 답변 등을 기초 자료로 작성되었으며, 선거 전까지 지속적으로 업데이트 됩니다.\n01\n집이 있어야 살지 🏠!\n배경 알아보기', metadata={'source': './files/promises.txt'})]
