# RAG
Retrieval Augmented Generation (검색증강생성)

# data 준비

In [1]:
# 아래와 같이 파일들을 준비합니다
# 구글드라이브 사용자는 자신의 구글드라이브 공간에 생성해두시길 바랍니다

# 출처는  조지오웰의 소설 '1984' Part1 Chapter1
#  http://www.george-orwell.org/1984/0.html

# 너무 길거나, 너무 짧지 않으면 좋습니다
# 파일이 너무 길면 나중에 임베딩 과정에서 비용지출이 발생.

In [2]:
import os
base_path = r'D:\NLP2501\dataset\files'

In [3]:
from langchain_openai.chat_models.base import ChatOpenAI

In [4]:
llm = ChatOpenAI(temperature=0.1)

# DataLoaders

## Retrieve 란

https://python.langchain.com/v0.1/docs/modules/data_connection/

![](https://python.langchain.com/v0.1/assets/images/data_connection-95ff2033a8faa5f3ba41376c0f6dd32a.jpg)


In [None]:
# RAG 의 첫번째 단계인 Retrieval 의 일반적인 과정
# - data source 에서 데이터 load
# - 데이터는 split 하면서 transform
# - transform 한 데이터를 embed.
# - embed 된 데이터를 store 에 저장.


## DataLoader 란

In [None]:
# 랭체인에서 제공하는 다양한 document loader 들이 있다
# CSV, File Directory, HTML, JSON, Markdown, PDF 등
# ※그 밖에도 3rd party loader 들이 있다.

In [None]:
"""
Data Loader 는 소스에서 데이터를 추출하고 langchain 에 가져다 주는 코드다.

정말 많은 document loader source 들이 제공된다. (함 보자 ↓)
https://python.langchain.com/docs/integrations/document_loaders/#all-document-loaders

GitHub, Figma, Facebook Chat, MS power point, slack, telegram, trello, Twitter 등..
전부다 랭체인에서 활용해볼수 있다는 것이다.

다양한 Data Loader 이지만 거의 동일한 API 인터페이스로 설계되어 있다.
"""
None

## TextLoader

In [5]:
# v0.3
from langchain_community.document_loaders.text import TextLoader
# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.text.TextLoader.html
# Load text file.


In [6]:
loader = TextLoader(os.path.join(base_path, 'chapter_one.txt'))

In [7]:
loader.load()

# ↓ List[Document] 객체 리턴
# [Document(
#   metadata={..},
#   page_content="...",
# )]

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.txt'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. Th

## PyPDFLoader

In [8]:
# v0.3
from langchain_community.document_loaders.pdf import PyPDFLoader

# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html
# PyPDFLoader document loader integration

In [9]:
loader = PyPDFLoader(os.path.join(base_path, 'chapter_one.pdf'))
loader.load()

[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-01-30T23:19:00+09:00', 'author': 'Yeonchul Sung', 'moddate': '2025-01-30T23:19:00+09:00', 'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Part 1, Chapter 1 \n \n \nPart One \n \n \n1 \nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his \nchin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through \nthe glass doors of Victory Mansions, though not quickly enough to prevent a swirl of \ngritty dust from entering along with him. \n \nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured \nposter, too large for indoor display, had been tacked to the wall. It depicted simply an \nenormous face, more than a metre wide: the face of a man of about forty-five, with a \nheavy black moustache and ruggedly handsome 

## UnstructuredFileLoader

In [10]:
# 위와 같이 서로 다른 포맷의 데이터를 따로따로 읽어올수도 있지만
# UnstructuredFileLoader 를 통해 다양한 포맷의 파일을 읽어올수도 있다.

In [11]:
# v0.3
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.unstructured.UnstructuredFileLoader.html

In [12]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.pdf'))
loader.load()

  loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.pdf'))
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.pdf'}, page_content="Part 1, Chapter 1\n\nPart One\n\n1\n\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his\n\nchin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through\n\nthe glass doors of Victory Mansions, though not quickly enough to prevent a swirl of\n\ngritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured\n\nposter, too large for indoor display, had been tacked to the wall. It depicted simply an\n\nenormous face, more than a metre wide: the face of a man of about forty-five, with a\n\nheavy black moustache and ruggedly handsome features. Winston made for the stairs. It\n\nwas no use trying the lift. Even at the best of times it was seldom working, and at\n\npresent the electric current was cut off during daylight hours. It was part of the economy\n\ndrive in pr

In [13]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.txt'))
loader.load()

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.txt'}, page_content="Part 1, Chapter 1\n\nPart One\n\n1 It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The f

In [14]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.docx'))
loader.load()

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. T

# Splitter

## data 를 split 해야 하는 이유

In [15]:
len(loader.load())
# loader.load() 의 리턴값을 보면 'Document로 이루어진 list' 다.
# 지금의 경우는 전체 챕터가 '하나의 Document' 에 들어가 있다.


1

In [None]:
# 특정 질문에 답하기 위해서, 필요한 '파일의 일부분' 만을 전달해야 할 수도 있다.

#  그래서 문서를 쪼개두어야(split) 한다

# 가령: "Ministry of peace" 를 찾고자 한다면.
# 해당 키워드가 있는 문서(들)만 모델에 넘겨주면 된다.

# 작은 조각들로 쪼개어 두면 필요한 것들을 찾기가 용이해진다.


In [None]:
"""
TextSplitter 계층도

BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter  # Example: CharacterTextSplitter
                                             RecursiveCharacterTextSplitter -->  <name>TextSplitter

https://python.langchain.com/api_reference/text_splitters/index.html

"""
None

## RecursiveCharacterTextSplitter

In [16]:
# v0.3
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
# https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.RecursiveCharacterTextSplitter.html

# Splitting text by recursively look at characters.
# Recursively tries to split by different characters to find one that works.
# Create a new TextSplitter.

In [19]:
splitter = RecursiveCharacterTextSplitter()

# RecursiveCharacterTextSplitter 는 파일을 split 해주는데
# 문단 구분(\n\n), 줄바꿈(\n), 공백 순으로 구분자를 시도하여 가능한 한 큰 단위에서 끊어준다.
# 최대한 문장 중간에서 split 되지 않도록 하려 한다. (단, chunk_size 가 작으면 문장 중간에서 끊길 수 있다)
# 문장 중간에 잘림으로써 의미있는 문장들을 잃고 싶지 않다.

# ↓ splitter 사용방법은 두가지 가 있다.

In [17]:
docs = loader.load()  # -> List[Document]

In [20]:
# 방법1
documents = splitter.split_documents(docs)
documents

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. T

In [21]:
len(documents)  # 여러개의 Document 들로 split

11

In [22]:
# 방법2 
documents = loader.load_and_split(text_splitter=splitter)
documents

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. T

In [23]:
len(documents)

11

In [24]:
print(documents[0].page_content)

Part 1, Chapter 1

Part One


1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.

The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his righ

### chunk_size=

In [25]:
# 좀 더 작은 Document 를 만들 필요가 있다.
# Context Window 가 크지 않은 경우라든지
# chunk_size= 값으로 조정해보자

In [26]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200  # controls how large each split chunk is
)
# NOTE(review): chunk_overlap is not passed here, so the library default applies.
# The near-duplicate chunks in the output below suggest that default is about as
# large as chunk_size (200) — confirm against the RecursiveCharacterTextSplitter docs.

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

3498


In [27]:
documents[:5]

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='Part 1, Chapter 1\n\nPart One'),
 Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='1'),
 Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors'),
 Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of'),
 Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzz

### chunk_overlap=

In [None]:
"""
↑ 보다시피 Document 한 덩어리가 작아진 걸 확인할수 있다.

그러나 자세히 보라!  문제가 발생했다! => 문장의 중간을 잘라버렸다.
이런식으로 잘라먹으면 그닥 쓸만하지 않다. <- 문장을 파괴해버린셈이다. (의미상 말이 안되는 문장들이 나온다)

작은 덩어리이면서도 중간을 잘라먹지 않는 방법은 없을까?
=> chunk_overlap=
    이 속성은 문장이나 문단을 분할할 때 앞 조각 일부분을 가져오게 만든다.
    앞 조각의 끝부분을 조금 가져와서 다음 조각에 연결시키는 거다.
    이 경우 Document 사이에는 겹치는 부분이 생길수 있다. (중복된 부분)
    어떤 Document 의 끝부분이 다른 Document 의 시작점이 되는 거다.
"""
None

In [28]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

250


In [30]:
for document in documents[10:15]:
    print('🔷', document.page_content)

🔷 move. BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
🔷 Inside the flat a fruity voice was reading out a list of figures which had something to do with the production of pig-iron. The voice came from an oblong metal plaque like a dulled mirror which
🔷 an oblong metal plaque like a dulled mirror which formed part of the surface of the right-hand wall. Winston turned a switch and the voice sank somewhat, though the words were still distinguishable.
🔷 though the words were still distinguishable. The instrument (the telescreen, it was called) could be dimmed, but there was no way of shutting it off completely. He moved over to the window: a
🔷 it off completely. He moved over to the window: a smallish, frail figure, the meagreness of his body merely emphasized by the blue overalls which were the uniform of the party. His hair was very


In [31]:
# ↑ Document 간에 겹치는 부분이 있다.
# 앞 Document 의 뒷부분을 가져다가 다음 Document 의 앞에 넣었다.
# 이렇게 함으로써 문장의 (의미적) 구조를 해치지 않도록 split 했다.


## CharacterTextSplitter

In [32]:
# v0.3
from langchain_text_splitters.character import CharacterTextSplitter
# https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.CharacterTextSplitter.html

# Splitting text that looks at characters.
# Create a new TextSplitter.

In [33]:
# CharacterTextSplitter 도 동작방식은 비슷하다
# separator=  : 특정 문자열 찾은 다음 이를 기준으로 분할한다.


In [34]:
splitter = CharacterTextSplitter(
    separator='\n',   # 줄바꿈 단락별로 split
    chunk_size=600,
    chunk_overlap=100,
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

for document in documents[0:5]:
  print('🔷', document.page_content)

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 881, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 671, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1289, which is longer than the specified 600
Created a chunk of size 1605, which is longer than the specified 600
Created a chunk of size 1900, which is longer 

46
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
🔷 The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his r

# TikToken

In [35]:
"""
기본적으로 모든 splitter 들은 텍스트의 length 를 계산해서
한 덩어리(chunk) 의 크기를 알아낸다.
그 작업에는 파이썬의 내장 함수인 len() 을 사용한다. (디폴트)

Splitter 에는 length 를 계산하는 함수를 제공해줄수도 있다
  바로 length_function= 속성이다
"""
None

In [36]:
splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100,
    length_function=len,  # default length function: len() measures a chunk's size in characters
)

# 그러나, LLM 에서 말하는 token 은 문자(letter) 와는 다르다

## OpenAI Tokenizer 예시 

In [37]:
"""
OpenAI 에서의 token 예시
https://platform.openai.com/tokenizer
↓ model 의 관점에서, 몇개의 token 을 사용하는지 확인해 볼수 있다.
"""
None

## from_tiktoken_encoder()

In [38]:
# tiktoken 은 OpenAI 에 의해 만들어진거다.
# https://github.com/openai/tiktoken   <- 아까 위의 Tokenizer 페이지 하단에 보면 이 링크가 있다.

# 아래의 from_tiktoken_encoder() 을 사용하면 tiktoken 패키지가 동작하는 것이다.

In [40]:
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100,    
)

documents = loader.load_and_split(text_splitter=splitter)
print(f'💚 {len(documents)} 개')

for document in documents[0:5]:
  print('🔷', document.page_content)

💚 17 개
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his

# Vectors

https://python.langchain.com/v0.1/docs/modules/data_connection/

![](https://python.langchain.com/v0.1/assets/images/data_connection-95ff2033a8faa5f3ba41376c0f6dd32a.jpg)


## Embedding 과 Vector

In [41]:
# Embedding 은 사람이 읽는 텍스트를 컴퓨터가 이해(연산)할 수 있는 숫자들로 변환하는 작업이다.
# 우리가 만든 Document 마다 각각의 벡터를 만들어 주게 될겁니다.
# OpenAI 는 크기가 최소 1000차원 이상!의 벡터를 제공해준다.

In [42]:
"""
3개의 차원을 정의해보자

첫번째 차원을 Masculinity (남성성)
두번째 차원을 Femininity (여성성)
세번째 차원을 Royalty (왕족스러움)

이제 특정 단어에 대한 차원 값(점수)를 줘보자

        Masculinity | Femininity  | Royalty
king  | 0.9         | 0.1         | 1.0
queen | 0.1         | 0.9         | 1.0
man   | 0.9         | 0.1         | 0.0
woman | 0.1         | 0.9         | 0.0

이렇게 3차원 벡터에 점수를 매겨 보았다.

단어를 이렇게 벡터로 점수를 매기면 연산을 할수 있게 된다.
king - man <- 이런거

king - man = 0.0    |  0.0       | 1.0 ==> 이러면 'royal' 이 되겠네요 ㅋ

royal + woman = 0.1  | 0.9 | 1.0 ==> 이러면 'queen' 이 되겠네요.

단어를 숫자화(벡터화) 하니까 의미에 대한 연산이 가능해진다.

"""
None


![](https://miro.medium.com/v2/resize:fit:2000/1*SYiW1MUZul1NvL1kc1RxwQ.png)

In [None]:
"""
단어를 입력하면 비슷한 벡터를 가진 단어들을 보여주는 곳. (word2vec)

https://turbomaze.github.io/word2vecjson/
"""
None

# Vector Store

## OpenAIEmbeddings

In [43]:
from langchain_openai.embeddings.base import OpenAIEmbeddings
# https://python.langchain.com/api_reference/openai/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings
# OpenAI embedding models.

In [44]:
embedder = OpenAIEmbeddings()   # OpenAI 의 Embedding model

In [45]:
"""
OpenAIEmbeddings 를 통해
 embed_documents()  <- 문서를 embed 하는것 뿐만 아니라
 embed_query()      <- query 도 embed 하는 것이 가능하다.
"""
None

In [46]:
vector = embedder.embed_query("Hi")
vector  # -> "Hi" 에 대한 벡터를 받아온다 -> List[Float]

[-0.03629858046770096,
 -0.007224537897855043,
 -0.03371885418891907,
 -0.02866363152861595,
 -0.02686564065515995,
 0.03460482135415077,
 -0.012318846769630909,
 -0.007752209436148405,
 0.0019380523590371013,
 -0.0027018729597330093,
 0.024781012907624245,
 -0.002477124100551009,
 -0.00573272630572319,
 -0.002905449829995632,
 0.006677323020994663,
 -0.00303248199634254,
 0.033849142491817474,
 -0.001503212028183043,
 0.02109382674098015,
 -0.008996471762657166,
 -0.02171921543776989,
 0.01038405206054449,
 0.006244111340492964,
 0.007081219926476479,
 -0.012312332168221474,
 0.0008998099947348237,
 0.005876044277101755,
 -0.009888952597975731,
 -0.0030731973238289356,
 -0.024572549387812614,
 0.010742347687482834,
 -0.01381065882742405,
 -0.024429231882095337,
 -0.01411032397300005,
 0.0024347801227122545,
 -0.018878910690546036,
 0.0005618723225779831,
 -0.011270018294453621,
 0.018110202625393867,
 -0.009967125952243805,
 0.01302892342209816,
 -0.011328648775815964,
 -0.00913327559

In [47]:
len(vector)  # 1536 dimensions: one 1536-long vector for the whole input text, not one per token

1536

In [48]:
# document 를 embed 해보자
vectors = embedder.embed_documents([
    "hi",
    "how are you",
    "good to meet you",
]) # -> List[List[Float]]

In [49]:
len(vectors)

3

In [50]:
for vector in vectors:
    print(len(vector))

1536
1536
1536


In [None]:
"""
이제 실제 우리 문서를 embed 해보자

직접 embed_documents() 를 호출하진 않을거다
코드를 실행할때마다 '매번' 문서 embedding 을 반복해서 수행하는건 매우 비효율적이다
 => 시간 소요 + 또한 비용 지출

대신! 그 embedded 된 결과들을 '저장'해 줄겁니다.
LangChain 은 embedding 한것들을 캐싱하는 기능을 제공해준다

Document는 이와 같이 한번만 embedding 해주는게 좋다. (Document 가 변경되지 않는한.)
"""
None

## Vector Store 란?

In [None]:
# 벡터들을 저장하는 일종의 저장소

# 일단 벡터를 만들고 나서, 그것들을 캐시해주고, vector store 에 넣어주면,
# 우리가 검색을 할수 있다.
# 그리하여, 관련있는 문서들만 찾아낼수 있게 되는 거다

# 랭체인은 다양한 vector store 를 제공한다,  어떤거는 cloud 형태이고, 어떤건 유료이기도 하다.

# 우리는 예제에서 무료로 사용할수 있고 로컬로 저장되는 Chroma 라는 것을 사용해볼겁니다

In [None]:
# 로컬 지원 벡터 스토어 : Chroma, FAISS...
# 클라우드 벡터 스토어 : pinecone...

## Chroma vector store

In [51]:
# v0.3
from langchain_community.vectorstores.chroma import Chroma
# https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.chroma.Chroma.html
# https://python.langchain.com/docs/integrations/vectorstores/chroma/

# ChromaDB vector store.
# To use, you should have the chromadb python package installed.


In [None]:
# ↓이 ChromaDB 에 'split 된 문서' 와 'OpenAI embedding model' 을 전달해야 한다

# OpenAIEmbeddings 의 옵션에 model= 이 있다. 여기에 원하는 모델 지정가능 (지정안하면 default 동작)

# ★ embedding 모델을 사용하는것도 비용이 발생한다!

# 참고) OpenAI 사에서 제공하는 embedding 모델 정보
#   https://platform.openai.com/docs/guides/embeddings
#   2025.1 현재 : text-embedding-3-small 과 text-embedding-3-large 이 최신 임베딩모델

In [52]:
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100,    
)

documents = loader.load_and_split(text_splitter=splitter)

# embedding 모델 
embeddings = OpenAIEmbeddings()

# 여기에 'split 된 문서'와 '임베딩 모델'을 건네준다
vectorstore = Chroma.from_documents(documents, embeddings)

In [54]:
# vectorstore 를 사용하여 유사도 검색을 해보자.

results = vectorstore.similarity_search("where does winston live")  # => List[Document] 리턴

results

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorro

In [56]:
len(results)  # 주어진 query  와 유사도가 높은 Document 들이 리턴

4

In [57]:
results[0].page_content

"The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of colourless liquid with a plain white l

In [58]:
results[1].page_content

'Part 1, Chapter 1\nPart One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his rig

## embedding cache

In [None]:
# 다시 실행하면 임베딩 결과는 다 사라진다. 재실행하면 다시 재계산 발생 (비용발생!)
# 그래서 embedding 을 캐싱해주자

In [59]:
# v0.3
from langchain.embeddings.cache import CacheBackedEmbeddings
# https://python.langchain.com/api_reference/langchain/embeddings/langchain.embeddings.cache.CacheBackedEmbeddings.html

# Interface for caching results from embedding models.
# The interface works with any store that implements the abstract store interface accepting keys of type str and values of list of floats.

In [60]:
# v0.3
from langchain.storage.file_system import LocalFileStore
# https://python.langchain.com/api_reference/langchain/storage/langchain.storage.file_system.LocalFileStore.html
# BaseStore interface that works on the local file system.

In [62]:
## Directory-backed store for the cache; computed embeddings are saved under '.cache'.
cache_dir = LocalFileStore('.cache')

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,   # underlying embedding model
    cache_dir,    # where the embeddings are stored (cached)
    # Key cache entries by model name so that vectors produced by a different
    # embedding model are never served for the same text.
    # NOTE(review): entries cached earlier without a namespace will presumably
    # not be reused after this change (one-time re-embedding) — confirm.
    namespace=embeddings.model,
)

vectorstore = Chroma.from_documents(documents, cached_embeddings)
# From now on, Chroma.from_documents() is given the cache-backed embeddings
# instead of the bare OpenAIEmbeddings.
#
# When the documents are embedded again:
# 1. First, the cache is checked for an already existing embedding.
# 2. Only if it is missing is OpenAIEmbeddings used to embed the documents
#    passed to the vector store (Chroma.from_documents).

results = vectorstore.similarity_search("where does winston live")
results

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorro