# API Key

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
os.getenv('OPENAI_API_KEY')[:20]

'sk-proj-iKU13YeoxNgF'

# import

In [3]:
from langchain_openai.chat_models.base import ChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.runnables.passthrough import RunnablePassthrough

# RAG
Retrieval Augmented Generation (검색증강생성)

In [None]:
# 사전학습된 모델은 이미 많은 데이터를 통해 학습한 상태이긴 하나..
# 개인 DB 나 회사내 문서 와 같이 'private 한 데이터' 들에는 접근할수 없다
# 그래서 RAG 를 사용한다!

In [4]:
# 1. Retrieval 단계
# private 으로부터 제공된 data 를 사용하거나 탐색함으로써
# language model 의 능력을 더 '확장(augment)'

# 2. Augmented Generation
# Model 로 하여금 '우리가 보낸 문서 data 만'을 가지고 답변하도록 할수도 있다.
# (경우에 따라, 우리 문서가 더 최신 data 일수도 있기 때문이다)
# 이를 통해 Model 이 과거에 학습한 data 를 참조하지 않게도 할수 있다.

In [None]:
# RAG 는 특정 라이브러리나 프레임워크 이름이 아니라
# 위와 같은 작업을 하는 '기법'을 일반적으로 통칭하는 용어

# RAG 를 수행하는 방법은 굉~장히 많고 다양.

In [None]:
# 어떤 방식으로 RAG 를 구현할른지는

# - 우리가 얼마나 많은 문서들을 가지고 있는지
# - 우리가 얼마나 많은 비용으로 운영할지 (어떤 모델, 가용한 token 개수등..)

# 등에 따라 결정될 문제다.

# 1.1 DataLoaders

![](https://miro.medium.com/v2/resize:fit:1100/format:webp/1*qyXS4oRtrW2NhhMRBxsdQQ.png)

In [5]:
# RAG 의 첫번째 단계인 Retrieval 의 일반적인 과정
# - data source 에서 데이터 load
# - 데이터는 split 하면서 transform
# - transform 한 데이터를 embed.
# - embed 된 데이터를 store 에 저장.
# - 검색(질의) 가 입력되면 store 에서 관련 문서들을 retrieve!

## DataLoader 란

In [7]:
# 랭체인에서 제공하는 다양한 document loader 들이 있다
# CSV, File Directory, HTML, JSON, Markdown, PDF 등
# ※그 밖에서도 3rd party loader 들도 있다.

# v0.3 ★
# https://python.langchain.com/docs/integrations/document_loaders/


## 파일 준비

In [None]:
# 출처는  조지오웰의 소설 '1984' Part1 Chapter1
#  http://www.george-orwell.org/1984/0.html

# 너무 길거나, 너무 짧지 않으면 좋습니다
# 파일이 너무 길면 나중에 임베딩 과정에서 비용지출이 발생.


In [8]:
llm = ChatOpenAI(temperature=0.1)
base_path = r'D:\LANG2509\dataset'

## TextLoader

In [9]:
from langchain_community.document_loaders.text import TextLoader

In [10]:
loader = TextLoader(os.path.join(base_path, 'chapter_one.txt'))

In [12]:
docs = loader.load()
print(len(docs))

docs

# List[Document] 가 리턴된다.  

1


[Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.txt'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat

## PyPDFLoader

In [13]:
from langchain_community.document_loaders.pdf import PyPDFLoader

In [15]:
loader = PyPDFLoader(os.path.join(base_path, 'chapter_one.pdf'))
docs = loader.load()
print(len(docs))

docs

15


[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-01-30T23:19:00+09:00', 'author': 'Yeonchul Sung', 'moddate': '2025-01-30T23:19:00+09:00', 'source': 'D:\\LANG2509\\dataset\\chapter_one.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Part 1, Chapter 1 \n \n \nPart One \n \n \n1 \nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his \nchin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through \nthe glass doors of Victory Mansions, though not quickly enough to prevent a swirl of \ngritty dust from entering along with him. \n \nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured \nposter, too large for indoor display, had been tacked to the wall. It depicted simply an \nenormous face, more than a metre wide: the face of a man of about forty-five, with a \nheavy black moustache and ruggedly handsome featur

## UnstructuredFileLoader

In [None]:
# 서로 다른 타입의 문서를 읽어오기 위해 각각의 DataLoader 를 사용하기 보다
# UnstructuredFileLoader 라는 것도 사용해볼수 있다. -> 꽤 다양한 포맷의 파일을 읽어올 수 있다

In [16]:
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

In [17]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.pdf'))
docs = loader.load()
print(len(docs))

docs

  loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.pdf'))


1


[Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.pdf'}, page_content="Part 1, Chapter 1\n\nPart One\n\n1\n\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his\n\nchin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through\n\nthe glass doors of Victory Mansions, though not quickly enough to prevent a swirl of\n\ngritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured\n\nposter, too large for indoor display, had been tacked to the wall. It depicted simply an\n\nenormous face, more than a metre wide: the face of a man of about forty-five, with a\n\nheavy black moustache and ruggedly handsome features. Winston made for the stairs. It\n\nwas no use trying the lift. Even at the best of times it was seldom working, and at\n\npresent the electric current was cut off during daylight hours. It was part of the economy\n\ndrive in preparat

In [18]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.docx'))
docs = loader.load()
print(len(docs))

docs

1


[Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The fla

# 1.2.Splitter

## data 를 split 해야 하는 이유.

In [None]:
# loader.load() 의 리턴값을 보면 'Document로 이루어진 list' 다.
# 지금의 경우는 전체 챕터가 '하나의 Document' 에 들어가 있다.

In [19]:
len(loader.load())

1

In [None]:
# 특정 질문에 답해야 하기 위해서, 필요한 '파일의 일부분' 만들 전달해야 할 수도 있다.

#  그래서 문서를 쪼개두어야(split) 한다

# 가령: "Ministry of peace" 를 찾고자 한다면.
# 해당 키워드가 있는 문서(들)만 모델에 넘겨주면 된다.

# 작은 조각들로 쪼개어 두면 필요한 것들을 찾기가 용이해진다.
#  - prompt 도 짧아질거다 (적은 token 사용, 적은 비용.)

# split 하는 방법은 다양하다.


In [None]:
"""
TextSplitter 계층도

BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter  # Example: CharacterTextSplitter
                                             RecursiveCharacterTextSplitter -->  <name>TextSplitter

https://python.langchain.com/api_reference/text_splitters/index.html

"""
None

## RecursiveCharacterTextSplitter

In [20]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

In [21]:
splitter = RecursiveCharacterTextSplitter()

# RecursiveCharacterTextSplitter 는 파일을 split 해주는데
# 문장의 끝이나, 문단의 끝부분마다 끊어준다.
# 문장 중간을 끊지는 않는다.  최대한 문장 중간에서 split 되지 않도록 하려 한다.
# 문장 중간에 짤림으로 의미있는 문장들을 잃고 싶지 않다.

# ↓ splitter 사용방법은 두가지 가 있다.

In [22]:
docs = loader.load()

In [24]:
# 방법1
documents = splitter.split_documents(docs)
print(len(documents))

documents

11


[Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The fla

In [25]:
# 방법2
documents = loader.load_and_split(text_splitter=splitter) 
print(len(documents))

documents   # List[Document]

11


[Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The fla

In [28]:
# 첫번째 Document
print(documents[0].page_content)

Part 1, Chapter 1

Part One


1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.

The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his righ

### chunk_size=

In [29]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,   # chunk 의 단위는 splitter 마다 다르다.
    # CharacterTextSplitter 의 경우 chunk_size 는 문자의 개수.
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

documents[:5]


3498


[Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.docx'}, page_content='Part 1, Chapter 1\n\nPart One'),
 Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.docx'}, page_content='1'),
 Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.docx'}, page_content='It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors'),
 Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.docx'}, page_content='was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of'),
 Document(metadata={'source': 'D:\\LANG2509\\dataset\\chapter_one.docx'}, page_content='cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effo

In [None]:
# ↑ 문제점: 문단의 중간이 잘려버렸다 -> 문장의 의미가 파괴된다.

# 작은 덩어리이면서 문장의 중간을 잘라먹지 않는 방법은?
# chunk_overlap=
#    split 할때 앞 조각의 일부를 가져와서 연결해준다.
#    Document 간의 겹치는 부분 생길수 있다.

### chunk_overlap=

In [30]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,   
    chunk_overlap=50,
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

for document in documents[10:15]:
    print('🔷', document.page_content)


250
🔷 move. BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
🔷 Inside the flat a fruity voice was reading out a list of figures which had something to do with the production of pig-iron. The voice came from an oblong metal plaque like a dulled mirror which
🔷 an oblong metal plaque like a dulled mirror which formed part of the surface of the right-hand wall. Winston turned a switch and the voice sank somewhat, though the words were still distinguishable.
🔷 though the words were still distinguishable. The instrument (the telescreen, it was called) could be dimmed, but there was no way of shutting it off completely. He moved over to the window: a
🔷 it off completely. He moved over to the window: a smallish, frail figure, the meagreness of his body merely emphasized by the blue overalls which were the uniform of the party. His hair was very


In [None]:
# ↑ Document 간에 겹치는 부분이 있다.
# 앞 Document 의 뒷부분을 가져다가 다음 Document 의 앞에 넣었다.
# 이렇게 하므로 문장의 (의미적) 구조를 크게 해치지 않도록 split 했다.

## CharacterTextSplitter

In [31]:
from langchain_text_splitters.character import CharacterTextSplitter

In [32]:
# 특정 문자열 단위로 split 한다.

In [33]:
splitter = CharacterTextSplitter(
    separator='\n',  # 줄바꿈 문자 단위로 쪼갬.
    chunk_size=600,   
    chunk_overlap=100,
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

for document in documents[0:5]:
    print('🔷', document.page_content)



46
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
🔷 The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his r

# 2.TikToken

## length_function=

In [None]:
# splitter 에 lenth 를 계산하는 함수를 제공해줄수 있다.
#  length_function=  

In [34]:
splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100,
    length_function=len,    # 기본적인 chunk 개수 카운트 함수.
)


In [None]:
# 디폴트로 len() 함수가 동작함. CharacterTextSplitter 에선 '글자의 개수'를 chunk 카운트 함.
# 그러나 LLM 에서 말하는 token 은 문자(letter) 와는 다르다.
# 어떤 경우에는 문자 두개, 혹은 세개...  가 한개의 token 으로 카운트 된다.


## OpenAI Tokenizer 예시

In [None]:
"""
OpenAI 에서의 token 예시
https://platform.openai.com/tokenizer
↓ model 의 관점에서, 몇개의 token 을 사용하는지 확인해 볼수 있다.
"""
None

In [None]:
# OpenAI 모델의 tokenizer 를 우리 splitter 에 사용할수 있다!

## from_tiktoken_encoder()

In [37]:
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,  
    chunk_overlap=100, 
)

documents = loader.load_and_split(text_splitter=splitter)
print(f'💚 {len(documents)} 개')

for document in documents[0:5]:
  print('🔷', document.page_content)

💚 17 개
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his

# 3.Vectors

## Embedding 과 Vector

In [38]:
# Embedding 은 사람이 읽는 텍스트를 컴퓨터가 이해(연산)할 수 있는 숫자들(벡터)로 변환하는 작업이다.
# 우리가 만든 Document 마다 각각의 벡터를 만들어 주게 될겁니다.
# OpenAI 는 크기가 최소 1000차원 이상!의 벡터를 제공해준다.

In [None]:
"""
3개의 차원을 정의해보자

첫번째 차원을 Masculinity (남성성)
두번째 차원을 Femininity (여성성)
세번째 차원을 Royalty (왕족스러움)

이제 특정 단어에 대한 차원 값(점수)를 줘보자

        Masculinity | Femininity  | Royalty
king   | 0.9        | 0.1         | 1.0        
queen  | 0.1        | 0.9         | 1.0
man    | 0.9        | 0.1         | 0.0
woman  | 0.1        | 0.9         | 0.0

이렇게 3차원 벡터에 점수를 매겨 보았다.

단어를 이렇게 벡터로 점수를 매기면 연산을 할수 있게 된다.

king - man = 0.0  | 0.0   | 1.0  => 'royal' 만 남는다.

royal + woman = 0.1 |  0.9  | 1.0 => 'queen' 디 된다.

단어를 벡터화(숫자화) 하니까 의미에 대한 연산이 가능해진다.

"""
None

![](https://miro.medium.com/v2/resize:fit:2000/1*SYiW1MUZul1NvL1kc1RxwQ.png)

## word2vec 예시

https://turbomaze.github.io/word2vecjson/

## OpenAIEmbeddings

In [39]:
from langchain_openai.embeddings.base import OpenAIEmbeddings

In [40]:
embedder = OpenAIEmbeddings()   # OpenAI 의 embedding model 

In [41]:
embedder.model

'text-embedding-ada-002'

In [None]:
# OpenAIEmbeddings 를 통해
#  embed_documents()  <- 문서를 embed 하는것 뿐만 아니라
#  embed_query()      <- query 도 embed 하는 것이 가능하다.


In [43]:
vector = embedder.embed_query("Hi")  # 모델 호출 발생, "Hi" 에 대한 embedding vector 를 얻는다.
print(len(vector))  
print(vector)

1536
[-0.03629858046770096, -0.007224537897855043, -0.03371885418891907, -0.02866363152861595, -0.02686564065515995, 0.03460482135415077, -0.012318846769630909, -0.007752209436148405, 0.0019380523590371013, -0.0027018729597330093, 0.024781012907624245, -0.002477124100551009, -0.00573272630572319, -0.002905449829995632, 0.006677323020994663, -0.00303248199634254, 0.033849142491817474, -0.001503212028183043, 0.02109382674098015, -0.008996471762657166, -0.02171921543776989, 0.01038405206054449, 0.006244111340492964, 0.007081219926476479, -0.012312332168221474, 0.0008998099947348237, 0.005876044277101755, -0.009888952597975731, -0.0030731973238289356, -0.024572549387812614, 0.010742347687482834, -0.01381065882742405, -0.024429231882095337, -0.01411032397300005, 0.0024347801227122545, -0.018878910690546036, 0.0005618723225779831, -0.011270018294453621, 0.018110202625393867, -0.009967125952243805, 0.01302892342209816, -0.011328648775815964, -0.009133275598287582, -0.009654432535171509, -0.02

In [None]:
# 각 토큰/문장 마다 1536개의 차원 값을 가지고 있게 된다.

In [44]:
# document 를 embed 해보자
vectors = embedder.embed_documents([
    "hi",
    "how are you",
    "goot to meet you",
])  # -> List[List[Float]]

In [47]:
len(vectors)

3

In [48]:
# 각각은 동일한 차원을 가지고 있다
for vector in vectors:
    print(len(vector))

1536
1536
1536
