In [38]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Pipeline: Load -> Transform (split) -> Embed -> Store -> Retrieve

llm = ChatOpenAI()

# On-disk byte store that backs the embedding cache created below.
cache_dir = LocalFileStore("./.cache/")

# Method 1: RecursiveCharacterTextSplitter
# splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=200, # Should be small enough, but not to distort the meaning or context
#     chunk_overlap=20,
# )

# Method 2: CharacterTextSplitter
# Splits only on `separator`, so a single line longer than `chunk_size` tokens
# is kept whole; that is what the "Created a chunk of size N" warnings report.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=200,  # Should be small enough, but not to distort the meaning or context
    # `chunk_overlap` is a token count, not an on/off flag: passing `True`
    # would silently be treated as an overlap of 1 token.
    chunk_overlap=20,
)

loader = TextLoader("./files/promises.txt", encoding="utf-8")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()
# Wrap the embedder so document vectors are stored in / read from `cache_dir`.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)

# Pass the cache-backed embedder (not the raw one); otherwise the cache above
# is never consulted and every re-run re-embeds all documents via the API.
vectorstore = Chroma.from_documents(docs, cached_embeddings)

# results = vectorstore.similarity_search("what does the party called 더불어민주당 say?")
# print(results)

# "stuff" chain type: the retrieved documents are placed into a single prompt.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)


Created a chunk of size 427, which is longer than the specified 200
Created a chunk of size 353, which is longer than the specified 200
Created a chunk of size 355, which is longer than the specified 200
Created a chunk of size 208, which is longer than the specified 200
Created a chunk of size 224, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 201, which is longer than the specified 200
Created a chunk of size 243, which is longer than the specified 200
Created a chunk of size 288, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 229, which is longer than the specified 200
Created a chunk of size 373, which is longer than the specified 200
Created a chunk of size 236, which is longer tha

In [34]:
# Bind the prompt to a name, then run it through the retrieval QA chain.
# The call is left as the cell's last expression so the answer is displayed.
question = "What is the promise of 녹색정의당? Answer in Korean and explain in bullet point, organize by themes"
chain.run(question)

'**주거 환경**\n- 저소득층 주거안정을 위한 정부지원 주택 공급 확대\n- 임대차보호제 확대 및 전월세 평균가격 안정화\n\n**도시 환경**\n- 생활 환경 향상을 위한 도시재생사업 및 자치구단위 민관합작사업 활성화\n- 대중교통 확충 및 녹지공간 확대\n\n**자원 환경**\n- 에너지 신산업 육성과 신재생 에너지 활성화\n- 지속가능한 자원운용을 위한 자원순환시스템 구축\n\n**기타**\n- 장애인 보장사회 실현을 위한 장애인정책 추진\n- 소상공인과 중소기업 활성화를 위한 지원책 마련'