In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, TextLoader, UnstructuredFileLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)

In [2]:
# Chat model handle, configured with a low sampling temperature (0.1).
llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0.1)

  warn_deprecated(


In [3]:
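# Vectorization
# A vector is created for each document (chunk) -> this is called "embedding".
# OpenAI's embedding models return vectors with more than 1,000 dimensions
# (1,536 for the model used in this notebook, as the printed lengths show).

# So what exactly gets embedded? -> The text itself: words are mapped to vectors.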

In [4]:
# Source document to be loaded and split later on.
loader = UnstructuredFileLoader("./files/chapter_one.docx")

# Newline-separated splitter whose chunk size (600) and overlap (100) are
# counted with the tiktoken encoder rather than in raw characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)

In [5]:
# Embedding model, plus the pieces used in later cells: the cache-backed
# embedding wrapper, the Chroma vector store, and a local file store.
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# OpenAI embedding client constructed with the library defaults.
embeddings = OpenAIEmbeddings()

  warn_deprecated(


In [6]:
# embed_documents expects a list of strings and returns one vector per string.
# (For a single string, embeddings.embed_query("Hi") returns the vector for "Hi".)
vector = embeddings.embed_documents(["Hi", "how", "are", "you", "?", "My name is nico"])

In [7]:
# Number of vectors, then the dimensionality of the first two (1536 each).
print(*(len(item) for item in (vector, vector[0], vector[1])))

6 1536 1536


In [8]:
# Do the documents really have to be embedded again every time they are used?
# No: store the embeddings once and load them back when needed.
#
# That requires a vector store, which is a kind of database: embeddings are
# saved into it and later retrieved via search. The stored entries stay as
# they are for as long as the document embeddings do not change.
# LangChain supports many vector stores; some are paid/cloud services, while
# others can be used for free.
#
# Chroma is a vector store that runs locally rather than in the cloud.

# Load the file and split it into chunks using the splitter defined earlier.
docs = loader.load_and_split(splitter)

# Build a Chroma store from the chunks, embedding them with `embeddings`.
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

In [9]:
# Look up the document chunks most similar to the question; these chunks are
# what could later be handed to a prompt.
result = vectorstore.similarity_search(query="where does winston live")

result

[Document(metadata={'source': './files/chapter_one.docx'}, page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He too

In [10]:
# Left as in the previous cells, the store has to be rebuilt (and the documents
# re-embedded) on every run, which costs money -- so the embeddings are cached.

# Directory-backed store that holds the cached embeddings.
cache_dir = LocalFileStore("./.cache/")

# Wrap `embeddings` so its results are cached in `cache_dir`: when no cached
# embedding exists at that path it is computed, saved and returned; when one
# already exists it is simply loaded.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

In [11]:
# Rebuild the vector store, this time going through the cache-backed embeddings:
# 1. Whenever embedding is requested, the cache is checked for an existing entry.
# 2. If there is none, Chroma.from_documents uses OpenAIEmbeddings on the docs,
#    and the resulting embeddings are then written to the cache.
# 3. From the second call onwards, the embeddings already in the cache are reused.
#
# NOTE(review): `vectorstore` was already built from these same docs in an
# earlier cell. Depending on the Chroma version, in-memory stores may share the
# default collection, in which case this call would add duplicate chunks --
# confirm, or pass a distinct collection_name.
vectorstore = Chroma.from_documents(documents=docs, embedding=cached_embeddings)

In [12]:
# Same kind of lookup as before, now against the store built with cached embeddings.
results = vectorstore.similarity_search(query="where does winston live?")

results

[Document(metadata={'source': './files/chapter_one.docx'}, page_content='Part 1, Chapter 1\nPart One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up,