# RAG
Retrieval Augmented Generation (검색증강생성)

# data 준비

In [2]:
# 아래와 같이 파일들을 준비합니다
# 구글드라이브 사용자는 자신의 구글드라이브 공간에 생성해두시길 바랍니다

# 출처는  조지오웰의 소설 '1984' Part1 Chapter1
#  http://www.george-orwell.org/1984/0.html

# 너무 길거나, 너무 짧지 않으면 좋습니다
# 파일이 너무 길면 나중에 임베딩 과정에서 비용지출이 발생.

In [3]:
import os
base_path = r'D:\NLP2501\dataset\files'

In [4]:
from langchain_openai.chat_models.base import ChatOpenAI

In [5]:
llm = ChatOpenAI(temperature=0.1)

# DataLoaders

## Retrieve 란

https://python.langchain.com/v0.1/docs/modules/data_connection/

![](https://python.langchain.com/v0.1/assets/images/data_connection-95ff2033a8faa5f3ba41376c0f6dd32a.jpg)


In [6]:
# RAG 의 첫번째 단계인 Retrieval 의 일반적인 과정
# - data source 에서 데이터 load
# - 데이터는 split 하면서 transform
# - transform 한 데이터를 embed.
# - embed 된 데이터를 store 에 저장.


## DataLoader 란

In [7]:
# 랭체인에서 제공하는 다양한 document loader 들이 있다
# CSV, File Directory, HTML, JSON, Markdown, PDF 등
# ※그 밖에도 3rd party loader 들이 있다.

In [8]:
"""
Data Loader 는 소스에서 데이터를 추출하고 langchain 에 가져다 주는 코드다.

정말 많은 document loader source 들이 제공된다. (함 보자 ↓)
https://python.langchain.com/docs/integrations/document_loaders/#all-document-loaders

GitHub, Figma, Facebook Chat, MS PowerPoint, Slack, Telegram, Trello, Twitter 등..
전부다 랭체인에서 활용해볼수 있다는 것이다.

다양한 Data Loader 이지만 거의 동일한 API 인터페이스로 설계되어 있다.
"""
None

## TextLoader

In [9]:
# v0.3
from langchain_community.document_loaders.text import TextLoader
# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.text.TextLoader.html
# Load text file.


In [10]:
loader = TextLoader(os.path.join(base_path, 'chapter_one.txt'))

In [11]:
loader.load()

# ↓ List[Document] 객체 리턴
# [Document(
#   metadata={..},
#   page_content="...",
# )]

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.txt'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. Th

## PyPDFLoader

In [12]:
# v0.3
from langchain_community.document_loaders.pdf import PyPDFLoader

# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html
# PyPDFLoader document loader integration

In [13]:
loader = PyPDFLoader(os.path.join(base_path, 'chapter_one.pdf'))
loader.load()

[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-01-30T23:19:00+09:00', 'author': 'Yeonchul Sung', 'moddate': '2025-01-30T23:19:00+09:00', 'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Part 1, Chapter 1 \n \n \nPart One \n \n \n1 \nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his \nchin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through \nthe glass doors of Victory Mansions, though not quickly enough to prevent a swirl of \ngritty dust from entering along with him. \n \nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured \nposter, too large for indoor display, had been tacked to the wall. It depicted simply an \nenormous face, more than a metre wide: the face of a man of about forty-five, with a \nheavy black moustache and ruggedly handsome 

## UnstructuredFileLoader

In [14]:
# 위와 같이 서로 다른 포맷의 데이터를 따로따로 읽어올수도 있지만
# UnstructuredFileLoader 를 통해 다양한 포맷의 파일을 읽어올수도 있다.

In [15]:
# v0.3
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.unstructured.UnstructuredFileLoader.html

In [16]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.pdf'))
loader.load()

  loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.pdf'))
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.pdf'}, page_content="Part 1, Chapter 1\n\nPart One\n\n1\n\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his\n\nchin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through\n\nthe glass doors of Victory Mansions, though not quickly enough to prevent a swirl of\n\ngritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured\n\nposter, too large for indoor display, had been tacked to the wall. It depicted simply an\n\nenormous face, more than a metre wide: the face of a man of about forty-five, with a\n\nheavy black moustache and ruggedly handsome features. Winston made for the stairs. It\n\nwas no use trying the lift. Even at the best of times it was seldom working, and at\n\npresent the electric current was cut off during daylight hours. It was part of the economy\n\ndrive in pr

In [17]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.txt'))
loader.load()

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.txt'}, page_content="Part 1, Chapter 1\n\nPart One\n\n1 It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The f

In [18]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.docx'))
loader.load()

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. T

# Splitter

## data 를 split 해야 하는 이유

In [19]:
len(loader.load())
# loader.load() 의 리턴값을 보면 'Document로 이루어진 list' 다.
# 지금의 경우는 전체 챕터가 '하나의 Document' 에 들어가 있다.


1

In [20]:
# 특정 질문에 답하기 위해서, 필요한 '파일의 일부분' 만을 전달해야 할 수도 있다.

#  그래서 문서를 쪼개두어야(split) 한다

# 가령: "Ministry of peace" 를 찾고자 한다면.
# 해당 키워드가 있는 문서(들)만 모델에 넘겨주면 된다.

# 작은 조각들로 쪼개어 두면 필요한 것들을 찾기가 용이해진다.


In [21]:
"""
TextSplitter 계층도

BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter  # Example: CharacterTextSplitter
                                             RecursiveCharacterTextSplitter -->  <name>TextSplitter

https://python.langchain.com/api_reference/text_splitters/index.html

"""
None

## RecursiveCharacterTextSplitter

In [22]:
# v0.3
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
# https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.RecursiveCharacterTextSplitter.html

# Splitting text by recursively look at characters.
# Recursively tries to split by different characters to find one that works.
# Create a new TextSplitter.

In [23]:
splitter = RecursiveCharacterTextSplitter()

# RecursiveCharacterTextSplitter 는 파일을 split 해주는데
# 문장의 끝이나, 문단의 끝부분마다 끊어준다.
# 문장 중간을 끊지는 않는다.  최대한 문장 중간에서 split 되지 않도록 하려 한다.
# 문장 중간에 짤림으로 의미있는 문장들을 잃고 싶지 않다.

# ↓ splitter 사용방법은 두가지 가 있다.

In [24]:
docs = loader.load()  # -> List[Document]

In [25]:
# 방법1
documents = splitter.split_documents(docs)
documents

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. T

In [26]:
len(documents)  # 여러개의 Document 들로 split

11

In [27]:
# 방법2 
documents = loader.load_and_split(text_splitter=splitter)
documents

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. T

In [28]:
len(documents)

11

In [29]:
print(documents[0].page_content)

Part 1, Chapter 1

Part One


1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.

The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his righ

### chunk_size=

In [30]:
# 좀 더 작은 Document 를 만들 필요가 있다.
# Context Window 가 크지 않은 경우라든지
# chunk_size= 값으로 조정해보자

In [31]:
# Build a splitter that produces smaller chunks than the default configuration.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200  # upper bound on the size of each split chunk
)
# NOTE: chunk_overlap is not passed, so the library default applies
# (200 in langchain_text_splitters at the time of writing - confirm against
# the installed version). That equals chunk_size here, so consecutive chunks
# overlap almost completely: neighbouring chunks differ by roughly one word,
# which is why the count printed below is so large.

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

3498


In [32]:
documents[:5]

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='Part 1, Chapter 1\n\nPart One'),
 Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='1'),
 Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors'),
 Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of'),
 Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content='cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzz

### chunk_overlap=

In [33]:
"""
↑ 보다시피 Document 한 덩어리가 작아진 걸 확인할수 있다.

그러나 자세히 보라!  문제가 발생했다! => 문단의 중간을 잘라버렸다.
이런 식으로 잘라먹으면 그다지 쓸만하지 않다. <- 문장을 파괴해버린 셈이다. (의미상 말이 안되는 문장들이 나온다)

작은 덩어리이면서도 중간을 잘라먹지 않는 방법은 없을까?
=> chunk_overlap=
    이 속성은 문장이나 문단을 분할할 때 앞 조각 일부분을 가져오게 만든다.
    앞 조각의 끝부분을 조금 가져와서 다음 조각에 연결시키는 거다.
    이 경우 Document 사이에는 겹치는 부분이 생길 수 있다. (중복된 부분)
    어떤 Document 의 끝부분이 다른 Document 의 시작점이 되는 거다.
"""
None

In [34]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

250


In [35]:
for document in documents[10:15]:
    print('🔷', document.page_content)

🔷 move. BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
🔷 Inside the flat a fruity voice was reading out a list of figures which had something to do with the production of pig-iron. The voice came from an oblong metal plaque like a dulled mirror which
🔷 an oblong metal plaque like a dulled mirror which formed part of the surface of the right-hand wall. Winston turned a switch and the voice sank somewhat, though the words were still distinguishable.
🔷 though the words were still distinguishable. The instrument (the telescreen, it was called) could be dimmed, but there was no way of shutting it off completely. He moved over to the window: a
🔷 it off completely. He moved over to the window: a smallish, frail figure, the meagreness of his body merely emphasized by the blue overalls which were the uniform of the party. His hair was very


In [36]:
# ↑ Document 간에 겹치는 부분이 있다.
# 앞 Document 의 뒷부분을 가져다가 다음 Document 의 앞에 넣었다.
# 이렇게 하므로 문장의 (의미적) 구조를 해치지 않도록 split 했다.


## CharacterTextSplitter

In [37]:
# v0.3
from langchain_text_splitters.character import CharacterTextSplitter
# https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.CharacterTextSplitter.html

# Splitting text that looks at characters.
# Create a new TextSplitter.

In [38]:
# CharacterTextSplitter 도 동작방식은 비슷하다
# separator=  : 특정 문자열 찾은 다음 이를 기준으로 분할한다.


In [39]:
# Split on single line breaks, aiming for chunks of at most 600 characters
# with 100 characters shared between neighbouring chunks.
# A line longer than chunk_size cannot be broken up by this separator, so
# oversized chunks are possible (the run logs "Created a chunk of size ...").
splitter = CharacterTextSplitter(chunk_size=600, chunk_overlap=100, separator='\n')

# Load and split in one step, then report how many chunks were produced.
documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

# Preview the first five chunks.
for chunk in documents[:5]:
    print('🔷', chunk.page_content)

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 881, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 671, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1289, which is longer than the specified 600
Created a chunk of size 1605, which is longer than the specified 600
Created a chunk of size 1900, which is longer 

46
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
🔷 The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his r

# TikToken

In [40]:
"""
기본적으로 모든 splitter 들은 텍스트의 length 를 계산해서
한 덩어리(chunk) 의 크기를 알아낸다.
그 작업에는 파이썬 표준 라이브러리가 지원하는 표준 len() 함수를 사용한다. (디폴트)

Splitter 에는 length 를 계산하는 함수를 제공해줄수도 있다
  바로 length_function= 속성이다
"""
None

In [41]:
# Same splitter settings as the previous example, with the length function
# spelled out. length_function measures how long a piece of text is; with the
# built-in len that is its number of characters (not a count of chunks).
splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=600,
    separator='\n',
)

# 그러나, LLM 에서 말하는 token 은 문자(letter) 와는 다르다

## OpenAI Tokenizer 예시 

In [42]:
"""
OpenAI 에서의 token 예시
https://platform.openai.com/tokenizer
↓ model 의 관점에서, 몇개의 token 을 사용하는지 확인해 볼수 있다.
"""
None

## from_tiktoken_encoder()

In [43]:
# tiktoken 은 OpenAI 에 의해 만들어진거다.
# https://github.com/openai/tiktoken   <- 아까 위의 Tokenizer 페이지 하단에 보면 이 링크가 있다.

# 아래의 from_tiktoken_encoder() 을 사용하면 tiktoken 패키지가 동작하는 것이다.

In [44]:
# Build the splitter through the tiktoken-based constructor so that
# chunk_size / chunk_overlap are measured in tokens instead of characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100, separator='\n',
)

# Load and split in one step, then report the number of chunks.
documents = loader.load_and_split(text_splitter=splitter)
print(f'💚 {len(documents)} 개')

# Preview the first five chunks.
for chunk in documents[:5]:
    print('🔷', chunk.page_content)

💚 17 개
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his

# Vectors

https://python.langchain.com/v0.1/docs/modules/data_connection/

![](https://python.langchain.com/v0.1/assets/images/data_connection-95ff2033a8faa5f3ba41376c0f6dd32a.jpg)


## Embedding 과 Vector

In [45]:
# Embedding 은 사람이 읽는 텍스트를 컴퓨터가 이해(연산)할 수 있는 숫자들로 변환하는 작업이다.
# 우리가 만든 Document 마다 각각의 벡터를 만들어 주게 될겁니다.
# OpenAI 는 크기가 최소 1000차원 이상!의 벡터를 제공해준다.

In [46]:
"""
3개의 차원을 정의해보자

첫번째 차원을 Masculinity (남성성)
두번째 차원을 Femininity (여성성)
세번째 차원을 Royalty (왕족스러움)

이제 특정 단어에 대한 차원 값(점수)를 줘보자

        Masculinity | Femininity  | Royalty
king  | 0.9         | 0.1         | 1.0
queen | 0.1         | 0.9         | 1.0
man   | 0.9         | 0.1         | 0.0
woman | 0.1         | 0.9         | 0.0

이렇게 3차원 벡터에 점수를 매겨 보았다.

단어를 이렇게 벡터로 점수를 매기면 연산을 할수 있게 된다.
king - man <- 이런거

king - man = 0.0    |  0.0       | 1.0 ==> 이러면 'royal' 이 되겠네요 ㅋ

royal + woman = 0.1  | 0.9 | 1.0 ==> 이러면 'queen' 이 되겠네요.

단어를 숫자화(벡터화) 하니까 의미에 대한 연산이 가능해진다.

"""
None


![](https://miro.medium.com/v2/resize:fit:2000/1*SYiW1MUZul1NvL1kc1RxwQ.png)

In [47]:
"""
단어를 입력하면 비슷한 벡터를 가진 단어들을 보여주는 곳. (word2vec)

https://turbomaze.github.io/word2vecjson/
"""
None

# Vector Store

## OpenAIEmbeddings

In [48]:
from langchain_openai.embeddings.base import OpenAIEmbeddings
# https://python.langchain.com/api_reference/openai/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings
# OpenAI embedding models.

In [49]:
embedder = OpenAIEmbeddings()   # OpenAI 의 Embedding model

In [50]:
"""
OpenAIEmbeddings 를 통해
 embed_documents()  <- 문서를 embed 하는것 뿐만 아니라
 embed_query()      <- query 도 embed 하는 것이 가능하다.
"""
None

In [51]:
vector = embedder.embed_query("Hi")
vector  # -> "Hi" 에 대한 벡터를 받아온다 -> List[Float]

[-0.03629858046770096,
 -0.007224537897855043,
 -0.03371885418891907,
 -0.02866363152861595,
 -0.02686564065515995,
 0.03460482135415077,
 -0.012318846769630909,
 -0.007752209436148405,
 0.0019380523590371013,
 -0.0027018729597330093,
 0.024781012907624245,
 -0.002477124100551009,
 -0.00573272630572319,
 -0.002905449829995632,
 0.006677323020994663,
 -0.00303248199634254,
 0.033849142491817474,
 -0.001503212028183043,
 0.02109382674098015,
 -0.008996471762657166,
 -0.02171921543776989,
 0.01038405206054449,
 0.006244111340492964,
 0.007081219926476479,
 -0.012312332168221474,
 0.0008998099947348237,
 0.005876044277101755,
 -0.009888952597975731,
 -0.0030731973238289356,
 -0.024572549387812614,
 0.010742347687482834,
 -0.01381065882742405,
 -0.024429231882095337,
 -0.01411032397300005,
 0.0024347801227122545,
 -0.018878910690546036,
 0.0005618723225779831,
 -0.011270018294453621,
 0.018110202625393867,
 -0.009967125952243805,
 0.01302892342209816,
 -0.011328648775815964,
 -0.00913327559

In [52]:
len(vector)  # 1536 dimensions: one 1536-element vector for the whole input text (not one per token)

1536

In [53]:
# document 를 embed 해보자
vectors = embedder.embed_documents([
    "hi",
    "how are you",
    "good to meet you",
]) # -> List[List[Float]]

In [54]:
len(vectors)

3

In [55]:
for vector in vectors:
    print(len(vector))

1536
1536
1536


In [56]:
"""
이제 실제 우리 문서를 embed 해보자

직접 embed_documents() 를 호출하진 않을거다
코드를 실행할때마다 '매번' 문서 embedding 을 반복해서 수행하는건 매우 비효율적이다
 => 시간 소요 + 또한 비용 지출

대신! 그 embedded 된 결과들을 '저장'해 줄겁니다.
LangChain 은 embedding 한것들을 캐싱하는 기능을 제공해준다

Document는 이와 같이 한번만 embedding 해주는게 좋다. (Document 가 변경되지 않는한.)
"""
None

## Vector Store 란?

In [57]:
# 벡터들을 저장하는 일종의 저장소

# 일단 벡터를 만들고 나서, 그것들을 캐시해주고, vector store 에 넣어주면,
# 우리가 검색을 할수 있다.
# 그리하여, 관련있는 문서들만 찾아낼수 있게 되는 거다

# 랭체인은 다양한 vector store 를 제공한다,  어떤거는 cloud 형태이고, 어떤건 유료이기도 하다.

# 우리는 예제에서 무료로 사용할수 있고 로컬로 저장되는 Chroma 라는 것을 사용해볼겁니다

In [58]:
# 로컬 지원 벡터 스토어 : Chroma, FAISS...
# 클라우드 벡터 스토어 : pinecone...

## Chroma vector store

In [59]:
# v0.3
from langchain_community.vectorstores.chroma import Chroma
# https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.chroma.Chroma.html
# https://python.langchain.com/docs/integrations/vectorstores/chroma/

# ChromaDB vector store.
# To use, you should have the chromadb python package installed.


In [60]:
# ↓이 ChromaDB 에 'split 된 문서' 와 'OpenAI embedding model' 을 전달해야 한다

# OpenAIEmbeddings 의 옵션에 model= 이 있다. 여기에 원하는 모델 지정가능 (지정안하면 default 동작)

# ★ embedding 모델을 사용하는것도 비용이 발생한다!

# 참고) OpenAI 사에서 제공하는 embedding 모델 정보
#   https://platform.openai.com/docs/guides/embeddings
#   2025.1 현재 : text-embedding-3-small 과 text-embedding-3-large 이 최신 임베딩모델

In [61]:
# Token-based splitter, same settings as the tiktoken example earlier in the notebook.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100, separator='\n',
)
documents = loader.load_and_split(text_splitter=splitter)

# Embedding model; no model= argument is given, so the library default is used.
embeddings = OpenAIEmbeddings()

# Hand the split documents and the embedding model to Chroma to build the
# vector store. Embedding the documents calls the OpenAI API (this costs money).
vectorstore = Chroma.from_documents(documents, embeddings)

In [62]:
# vectorstore 를 사용하여 유사도 검색을 해보자.

results = vectorstore.similarity_search("where does winston live")  # => List[Document] 리턴

results

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorro

In [63]:
len(results)  # 주어진 query  와 유사도가 높은 Document 들이 리턴

4

In [64]:
results[0].page_content

"The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of colourless liquid with a plain white l

In [65]:
results[1].page_content

'Part 1, Chapter 1\nPart One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his rig

## embedding cache

In [66]:
# 다시 실행하면 임베딩 결과는 다 사라진다. 재실행하면 다시 재계산 발생 (비용발생!)
# 그래서 embedding 을 캐싱해주자

In [67]:
# v0.3
from langchain.embeddings.cache import CacheBackedEmbeddings
# https://python.langchain.com/api_reference/langchain/embeddings/langchain.embeddings.cache.CacheBackedEmbeddings.html

# Interface for caching results from embedding models.
# The interface works with any store that implements the abstract store interface, accepting keys of type str and values of type list of floats.

In [68]:
# v0.3
from langchain.storage.file_system import LocalFileStore
# https://python.langchain.com/api_reference/langchain/storage/langchain.storage.file_system.LocalFileStore.html
# BaseStore interface that works on the local file system.

In [69]:
# Directory-backed byte store; computed embeddings are cached here on disk.
cache_dir = LocalFileStore('.cache')

embeddings = OpenAIEmbeddings()

# Wrap the embedding model with a cache:
#   1. for each document text, the cache is checked first;
#   2. only texts that are not cached yet are sent to OpenAIEmbeddings,
#      and the resulting vectors are then written to cache_dir.
# Re-running this cell therefore does not pay again for unchanged documents.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,   # underlying embedding model
    cache_dir,    # where the vectors are cached
    # Key the cache by model name so vectors produced by a different embedding
    # model are never returned for the same text. Entries cached earlier
    # without a namespace are not reused and will be embedded once more.
    namespace=embeddings.model,
)

# Pass the cache-backed wrapper (not the raw model) to the vector store.
# NOTE(review): an earlier cell already called Chroma.from_documents with the
# same documents; if both calls land in the same default in-memory collection,
# search results may contain duplicates - confirm, and pass a distinct
# collection_name if so.
vectorstore = Chroma.from_documents(documents, cached_embeddings)

results = vectorstore.similarity_search("where does winston live")
results

[Document(metadata={'source': 'D:\\NLP2501\\dataset\\files\\chapter_one.docx'}, page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorro

In [70]:
# 벡터파일 하나만 보기

import glob

# Print the contents of one cached vector file (first match only);
# nothing is printed when the cache directory has no entries.
cache_files = glob.glob(os.path.join('.cache', '*'))
if cache_files:
    with open(cache_files[0], 'r') as f:
        print(f.read())

[-0.023815609514713287, -0.009824610315263271, -0.0004901385400444269, -0.01809018664062023, -0.025858482345938683, 0.000790017656981945, -0.006427660584449768, -0.019797060638666153, -0.020374979823827744, -0.038196366280317307, -0.0031046306248754263, 0.04034676030278206, -0.003279350232332945, -0.01424635760486126, 0.012344603426754475, 0.017727306112647057, 0.04596466198563576, 0.02725623920559883, 0.030562467873096466, -0.03580405190587044, -0.01424635760486126, 0.0008416774799115956, -0.014340437017381191, 0.006179021671414375, -0.03204086422920227, -0.008796453475952148, 0.015899471938610077, -0.022955451160669327, 0.0020865537226200104, -0.00772125693038106, 0.003443989669904113, -0.022888250648975372, 0.0017219948349520564, 0.006585580296814442, -0.03975540027022362, -0.02638264186680317, -0.007741416804492474, -0.01804986596107483, 0.006817419547587633, -0.00946845207363367, 0.013977558352053165, -0.0013498759362846613, -0.004502386320382357, 0.0008462974801659584, -0.0306431

In [71]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain.memory.summary_buffer import ConversationSummaryBufferMemory
from langchain_core.runnables.passthrough import RunnablePassthrough
from langchain_core.prompts.chat import MessagesPlaceholder

# Chat model; the same instance is handed to the memory object below.
llm = ChatOpenAI(temperature=0.1)

# Prompt layout: a fixed system message, then the stored conversation
# (injected under the "history" variable), then the new user question.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI talking to a human"),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{question}"),
])

# Conversation memory configured with a 120-token limit and message-object
# (rather than plain string) history.
memory = ConversationSummaryBufferMemory(
    return_messages=True,
    max_token_limit=120,
    llm=llm,
)

def load_memory(_):
    """Return whatever the memory currently stores under its "history" key.

    The single positional argument is accepted but ignored; the function
    always reads from the module-level ``memory`` object.
    """
    memory_variables = memory.load_memory_variables({})
    return memory_variables["history"]

# Extend the incoming mapping with a "history" entry computed by load_memory,
# then pass the result through the prompt and on to the chat model.
chain = RunnablePassthrough.assign(history=load_memory) | prompt | llm


def invoke_chain(question):
    """Run the chain for one question, record the exchange, and print the reply.

    Args:
        question: Text placed into the prompt's ``{question}`` slot.

    The question and the reply's ``content`` are saved to the module-level
    ``memory``; the full reply object is printed. Nothing is returned.
    """
    response = chain.invoke({"question": question})  # run the chain
    exchange_input = {"input": question}
    exchange_output = {"output": response.content}
    memory.save_context(exchange_input, exchange_output)
    print(response)

# Two consecutive turns: the second question refers back to the first
# exchange, which invoke_chain saved to memory after the first call.
invoke_chain("My name is John")
invoke_chain("What is my name?")


  memory = ConversationSummaryBufferMemory(


content='Hello John! How can I assist you today?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 24, 'total_tokens': 34, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-BkU5uemUuSIdvv2onlym5QNpNSAjn', 'finish_reason': 'stop', 'logprobs': None} id='run--ad32fd3e-7100-4713-8353-348f6c49f6b1-0' usage_metadata={'input_tokens': 24, 'output_tokens': 10, 'total_tokens': 34, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
content='Your name is John.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 47, 'total_tokens': 52, 'completion_tokens_details': {'accepted_prediction_to

# Langsmith

https://www.langchain.com/langsmith

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRejzbKjev2a8d-EKtUU06p84fh5NX_S7dDLA&s)


In [72]:
# LangSmith 는 (LLM) 기반 애플리케이션의
# 개발, 디버깅, 테스트, 평가, 모니터링, 배포를 지원하는 통합 플랫폼 입니다
# https://www.langchain.com/langsmith

# LangSmith를 사용하면 우리의 체인이 무엇을 하고 있는지 시각적으로 볼수 있다.

# ★ 우선 위 사이트에 회원가입하고 API Key 받아옵니다 ★


In [73]:
"""
↓ .env 파일에 환경변수 입력 (추가)

OPENAI_API_KEY=xxxx

LANGCHAIN_TRACING_V2=true
LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
LANGCHAIN_API_KEY=xxxx
"""
None

In [74]:
"""
그리고 환경변수를 다시 loading 해주자

이걸로 설정은 끝이다
"""
None

# Retrieval QA

## Stuff documents chain 

https://js.langchain.com/v0.1/docs/modules/chains/document/stuff/

![](https://js.langchain.com/v0.1/assets/images/stuff-818da4c66ee17911bc8861c089316579.jpg)

In [75]:
# v0.3
from langchain.chains.retrieval_qa.base import RetrievalQA
# https://python.langchain.com/api_reference/langchain/chains/langchain.chains.retrieval_qa.base.RetrievalQA.html
# Chain for question-answering against an index.


In [76]:
# Chat model used by the RetrievalQA chains below.
llm = ChatOpenAI()

# Load the chapter text and split it on newlines into token-counted chunks
# (600 per chunk, 100 of overlap between neighbouring chunks).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.txt'))
docs = loader.load_and_split(text_splitter=splitter)

# Embedding model wrapped with a file-backed store under <base_path>/.cache.
embeddings = OpenAIEmbeddings()
cache_dir = LocalFileStore(os.path.join(base_path, ".cache"))
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

# Index the chunks in a Chroma vector store through the cached embedder.
vectorstore = Chroma.from_documents(documents=docs, embedding=cached_embeddings)

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


In [77]:
# Create the RetrievalQA object (i.e. build a chain).
chain = RetrievalQA.from_chain_type(
    # retriever= is a class interface:
    #  Documents can be retrieved (selected and fetched) from many sources
    #  (not only a vector store, but also a DB, docs, the cloud, etc.).
    #  A retriever can be pulled out of the existing vectorstore
    #  with .as_retriever().
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",  # "stuff" is the default
    llm=llm,
)

# Call the chain
chain.invoke("Where does Winston live?")


{'query': 'Where does Winston live?',
 'result': 'Winston lives in Victory Mansions, which is a place described as having a grim atmosphere with a colored poster of a face, and he resides on the seventh floor of the building.'}

In [78]:
# Ask the RetrievalQA chain currently bound to `chain` a second question.
chain.invoke("Describe Victory Mansions")

{'query': 'Describe Victory Mansions',
 'result': 'Victory Mansions is a building with a hallway that smells of boiled cabbage and old rag mats. The building has a coloured poster at one end depicting an enormous face of a man about forty-five with a heavy black moustache. The flat in the building is seven flights up, and the protagonist, Winston, lives there. The building is described as not having a reliable lift, and there is a constant reminder that "BIG BROTHER IS WATCHING YOU" displayed on a poster on each landing. The interior of the flat seems to be equipped with a telescreen that cannot be completely shut off.'}

## Refine document chain

https://js.langchain.com/v0.1/docs/modules/chains/document/refine/

![](https://js.langchain.com/v0.1/assets/images/refine-a70f30dd7ada6fe5e3fcc40dd70de037.jpg)

## Map Reduce document chain

https://js.langchain.com/v0.1/docs/modules/chains/document/map_reduce/

![](https://js.langchain.com/v0.1/assets/images/map_reduce-c65525a871b62f5cacef431625c4d133.jpg)



## Map re-rank documents chain

![](https://www.jiniai.biz/wp-content/uploads/2023/11/image-8-1024x380.png)

In [79]:
# Rebuild the RetrievalQA chain with a different document-combining strategy.
chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    # One advantage of RetrievalQA: chain_type= can be swapped easily.
    chain_type="refine",  # Refine document chain
    llm=llm,
)

# Call the chain
chain.invoke("Where does Winston live?")

{'query': 'Where does Winston live?',
 'result': "The additional context provided further highlights Winston's living situation in Victory Mansions. Despite the constant surveillance from the telescreen in his living room, Winston finds a way to sit in a specific alcove where he can avoid being directly seen by the screen. This detail showcases Winston's attempts to find some semblance of privacy and autonomy within the oppressive environment of the Party-controlled apartment building. \n\nThe mention of the book that Winston acquires from a junk shop adds another layer to his living situation. The fact that owning such a book is considered a compromising possession in a society where free thought and individuality are suppressed underscores the restrictions on personal freedoms that Winston faces in his living space. The antique nature of the book and the risk he takes in obtaining it further emphasize the oppressive nature of his living conditions and the lengths he is willing to go 

## FAISS vector store

- FAISS (Facebook AI Similarity Search) 는 Facebook AI에서 개발한 고속 벡터 검색 및 유사도 검색 라이브러리.
- 대량의 고차원 벡터 데이터를 빠르게 검색할 수 있도록 최적화되어 있다.
- AI, 자연어 처리(NLP), 이미지 검색, 추천 시스템 등에서 자주 사용됨.

In [80]:
"""
Chroma 는 내 컴퓨터에서 local 로 실행되는 vector store 다.

FAISS 라는 local로 실행되는 vector store 도 있다. (가끔은 이게 더 좋은 성능을 보이기도 한다)

※ FAISS 나 Chroma 는 무료로 사용할수 있는 로컬 vector store 다
  나중에 실제 서비스에서 사용한다면 cloud 기반의 vector store 를 찾아야 한다
"""
None


In [81]:
# v0.3
from langchain_community.vectorstores.faiss import FAISS
# FAISS vector store integration.

# https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.faiss.FAISS.html

In [None]:
# 사용하려면 faiss-cpu, 혹은 faiss-gpu 필요
# !pip install faiss-cpu

In [82]:
# Build the FAISS index through the cache-backed embedder defined earlier in
# this notebook, so document vectors already stored in the local cache are
# reused instead of being requested (and billed) again on every run.
vectorstore = FAISS.from_documents(documents, cached_embeddings)

In [83]:
# Same "refine" RetrievalQA chain as before, now reading from whatever
# `vectorstore` currently refers to.
chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="refine",
    llm=llm,
)

# Call the chain
chain.invoke("Where does Winston live?")

{'query': 'Where does Winston live?',
 'result': 'Winston lives in a small apartment in Victory Mansions, a dilapidated building in a dystopian society controlled by a totalitarian regime led by Big Brother. The Ministry of Love, a terrifying place impossible to enter except on official business, looms nearby with its heavily guarded perimeters. Winston carefully navigates his movements within his apartment, always mindful of the telescreen watching his every action, knowing that even the smallest slip-up could lead to severe repercussions from the oppressive regime. He finds solace in his forbidden book and secret moments of defiance, such as indulging in Victory Gin and cigarettes, as he grapples with the suffocating control and constant surveillance imposed on every aspect of his existence.'}

In [84]:
# Second question for the "refine" chain built in the previous cell.
chain.invoke("Describe Victory Mansions")

{'query': 'Describe Victory Mansions',
 'result': "The new context provided further description of Winston's living quarters within Victory Mansions, highlighting the unique layout of the room with the telescreen placed in an unusual position. This positioning allows Winston to sit in an alcove and remain out of sight from the telescreen, enabling him to engage in prohibited activities without immediate detection. Additionally, the description of the book Winston acquires, an old and rare item, adds to the forbidden nature of his actions, as possession of such items is considered compromising and could lead to severe consequences if discovered by the Party.\n\nVictory Mansions, with its deteriorating infrastructure and strict surveillance, serves as a symbol of the oppressive regime under which Winston lives. The ominous presence of the telescreens in every room, including Winston's living space, illustrates the constant monitoring and control imposed by the Party. Despite the bleak en

In [86]:
# RetrievalQA with the Map Reduce document chain.
chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="map_reduce",  # Map Reduce document chain
    llm=llm,
)

# Call the chain for both questions, with a visual separator in between.
print(chain.invoke("Where does Winston live?"))
print('🟨' * 20)
print(chain.invoke("Describe Victory Mansions"))

{'query': 'Where does Winston live?', 'result': 'Winston lives in Victory Mansions, near the Ministry of Love. The specific address or location of his flat is not mentioned in the extracted parts of the document.'}
🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨
{'query': 'Describe Victory Mansions', 'result': 'Victory Mansions is described as a dilapidated, run-down apartment complex in the novel 1984 by George Orwell. The building is characterized by its crumbling infrastructure, lack of basic amenities, and constant state of disrepair. Surrounding the building are remnants of bombed-out houses, adding to the bleak and desolate atmosphere of the area. The living conditions in Victory Mansions are squalid and oppressive, reflecting the overall oppressive and authoritarian society depicted in the novel.'}


In [87]:
# RetrievalQA with the Map Re-rank document chain.
chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="map_rerank",  # Map Re-rank document chain
    llm=llm,
)

# Call the chain for both questions, with a visual separator in between.
print(chain.invoke("Where does Winston live?"))
print('🟨' * 20)
print(chain.invoke("Describe Victory Mansions"))



{'query': 'Where does Winston live?', 'result': 'Victory Mansions'}
🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨




{'query': 'Describe Victory Mansions', 'result': 'Victory Mansions is a dilapidated building with a gritty, smelly hallway that smells like boiled cabbage and old rag mats. The building has seven flights of stairs, with a broken lift that is seldom working. The hallway features a large coloured poster of a man\'s face with a caption that reads "BIG BROTHER IS WATCHING YOU." Inside the flat, there is a telescreen that constantly broadcasts figures related to the production of pig-iron. The protagonist, Winston, lives in one of the flats in Victory Mansions.'}


# Stuff LCEL Chain

## Retriever
Retriever 도 Chain 을 구성하는 component 다

https://python.langchain.com/docs/concepts/retrievers/

- Retriever 의 입력
  - 질문이나, 그와 관련성이 있는 Document 를 얻기위한 query (한개의 string)

- Retriever 의 출력
  - Document 들의 List

![](https://python.langchain.com/assets/images/retriever_concept-1093f15a8f63ddb90bd23decbd249ea5.png)


In [88]:
# Re-create the chat model with temperature 0.1 for the LCEL chains below
# (the RetrievalQA cells above used ChatOpenAI() without arguments).
llm = ChatOpenAI(temperature=0.1)

In [89]:
# Retriever view over whatever `vectorstore` currently refers to; it is used
# as a chain component in the cells below.
retriever = vectorstore.as_retriever()

In [91]:
from langchain_core.prompts.chat import ChatPromptTemplate

In [92]:
# Inside a chain, the retriever returns a list of Documents.
# Those documents have to be fed into the template,
# and the resulting prompt then has to be passed to the LLM.

# Two-message prompt: the system message carries the retrieved {context},
# the human message carries the user's {question}.
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """
        You are a helpful assistant.
        Answer questions using only the following context.
        If you don't know the answer just say you don't know,
        don't make it up:\n\n{context}
        """,
    ),
    ("human", "{question}"),    
])

In [93]:
# Naive first attempt: pipe the retriever straight into the prompt, so the
# retriever's output becomes the prompt's input as-is.
chain = retriever | prompt | llm

In [94]:
# Deliberate failure demo: the prompt expects a mapping but receives the
# retriever's list output, which raises TypeError. The error is caught and
# printed so that running the notebook top to bottom does not stop here.
try:
    chain.invoke("Describe Victory Mansions")
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Expected mapping type as input to ChatPromptTemplate. Received <class 'list'>.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT 

In [None]:
# chain.invoke("Describe Victory Mansions")   # <-- 에러

# ↑↑↑↑↑↑
# 1. 이 문자열 query가 retriever 에 전달되는거다.
# 2. retriever 는 List[Document] 를 리턴할거고 prompt 의 {context} 로 전달된다.
#    또한, query 는 prompt 의 {question}으로 입력되어야 하는데..

#  당연히 지금은 작동하지 않는다.  (알아서 동작해주는게 아니다.)


In [95]:
#  prompt 의 {context} property 는 retriever 로부터 오도록 하고
#  retriever 는 invoke 에 입력해준 query 를 받아 호출(call)되어야 한다

# 또한 query 는 prompt 의 {question} property 로 전달되게 해줘야 한다
# 어떻게? RunnablePassthrough 를 사용!

from langchain_core.runnables.passthrough import RunnablePassthrough

# RunnablePassthrough 는 간단한 기능의 class 다.
#  입력값을 말그대로 통과하게 (pass through) 해준다
#  "question": RunnablePassthrough()  =>  "question": "Describe Victory Mansions"

In [96]:
# The query string is fanned out to both prompt variables: the retriever
# supplies {context}, RunnablePassthrough forwards the query as {question}.
chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building where Winston Smith resides. It is described as having glass doors, a hallway that smells of boiled cabbage and old rag mats, and being seven flights up. The building has a faulty lift, and the electricity is cut off during daylight hours as part of an economy drive in preparation for Hate Week. Inside the flat, there is a telescreen that cannot be completely shut off, and a fruity voice reading out figures related to the production of pig-iron. The building is also adorned with a poster depicting an enormous face with the caption "BIG BROTHER IS WATCHING YOU."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 124, 'prompt_tokens': 2104, 'total_tokens': 2228, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-01

In [100]:
# Same chain as above with one additional mapping key, "xxxx", which the
# prompt template does not reference.
chain = dict(
    context=retriever,
    question=RunnablePassthrough(),
    xxxx=RunnablePassthrough(),
) | prompt | llm

chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building where Winston Smith resides. It is described as having glass doors through which gritty dust can enter. The hallway smells of boiled cabbage and old rag mats. The building has a faulty lift due to the electricity being cut off during daylight hours as part of an economy drive. Winston\'s flat in Victory Mansions is on the seventh floor, and the building has a large colored poster of an enormous face with the caption "BIG BROTHER IS WATCHING YOU" displayed inside.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 101, 'prompt_tokens': 2104, 'total_tokens': 2205, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-BkUsLFEkShbtN7lx0QKoy4iVIunGN', 'finish_reason': 's

# Map Reduce LCEL Chain

https://js.langchain.com/v0.1/docs/modules/chains/document/map_reduce/

![](https://js.langchain.com/v0.1/assets/images/map_reduce-c65525a871b62f5cacef431625c4d133.jpg)



In [None]:
"""
> query 가 주어지면 retriever 로부터  List[Document] 를 얻어낸다
  list of docs

> 각각의 Document 를 위한 prompt 를 만들어 준뒤 llm 에 전달할거다.
     for doc in list of docs | prompt | llm

  ↑ 이때 prompt 는 '이 Document 를 읽고, 사용자의 질문에 답변하기에 적절한(관련있는) 정보를 추출하세요'
  그러면, 이를 전달받은 LLM 은 응답을 출력할거다

> 그리고 LLM 으로 부터 받은 response 들을 취합해 하나의 Document 를 만들어낼거다

   for response in list of llm responses | put them all together

> 그렇게 만들어진 단 하나의 최종 Document가, LLM 을 위한 prompt 로 전달될거다.

  final doc | prompt | llm

"""
None

In [101]:
# ▶ list of docs

# ▶ for doc in list of docs | prompt | llm

# ▶ for response in list of llm responses | put them all together

# ▶ final doc | prompt | llm

# v0.3
from langchain_core.runnables.base import RunnableLambda
# https://python.langchain.com/api_reference/core/runnables/langchain_core.runnables.base.RunnableLambda.html
# RunnableLambda converts a python callable into a Runnable.

# ---------------------------------------------------------------
# map_doc_chain 은 저 아래 map_docs() 안에서 각 document 에 대해 실행될 chain 이다
# Per-document prompt: the system message asks for the text in {context}
# that is relevant to the question (or '' when nothing is relevant); the
# human message carries the {question}.
map_doc_prompt = ChatPromptTemplate.from_messages([
    (
      "system",
      """
      Use the following portion of a long document to see if any of the text is relevant to answer the question.
      Return any relevant text verbatim.
      If there is no relevant text, return : ''
      -------
      {context}
      """,
    ),
    ("human", "{question}"),    
])

# Chain run once per retrieved document inside map_docs().
map_doc_chain = map_doc_prompt | llm


# ---------------------------------------------------------------
#  map_docs(inputs) 함수는
#    저 아래에 있는 map_chain 에서의 출력을 받아서 수행할 함수.
#    리턴값은 한개의 string 이어야 한다.  (※아래 map_chain 에서 설명)
def map_docs(inputs):
    """Run map_doc_chain over every retrieved document and merge the replies.

    Args:
        inputs: Mapping with two keys:
            "documents" - iterable of objects exposing ``page_content``;
            "question"  - the user's question, passed along with each document.

    Returns:
        One string: the ``content`` of each per-document reply, joined with
        blank lines in the same order as the input documents. An empty
        document list yields an empty string.
    """
    documents = inputs['documents']
    question = inputs['question']

    # Submit all per-document requests together through Runnable.batch so the
    # LLM calls can overlap instead of running strictly one after another.
    # batch() returns its results in input order, so the joined text is laid
    # out exactly as a sequential loop would produce it.
    responses = map_doc_chain.batch(
        [
            {"context": doc.page_content, "question": question}
            for doc in documents
        ]
    )
    return "\n\n".join(response.content for response in responses)


# ---------------------------------------------------------------
# map_chain 은 최종 chain 내부 에서 호출될거다.
# map_chain 은 두개의 데이터 필요
# - documents : documents 는 retriever 를 사용해 얻을수 있다.
# - question : map_chain 은 사용자 질문이 필요.  그래야 LLM 에게 요청할 수 있다.
#         '각 document' 를 살펴보면서 사용자 질문에 대답하는데 필요한 정보가
#         담겨져 있는지 알아보고 추출해달라고 하는 거다.
#
# The query string is sent to the retriever (yielding the documents) and is
# also forwarded unchanged as the question. The resulting mapping is then
# handed to map_docs(), which RunnableLambda wraps as a chain step.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)

# ---------------------------------------------------------
# Final-answer prompt: {context} receives the text assembled by map_chain,
# and the human message carries the original {question}.
final_prompt = ChatPromptTemplate.from_messages([
    (
      "system",
      """
      Given the following extracted parts of a long document and a question, create a final answer.
      If you don't know the answer, just say that you don't know. Don't try to make up an answer.
      ------
      {context}
      """,
    ),
    ("human", "{question}"),    
])


# Full map-reduce pipeline: map_chain condenses the retrieved documents into
# the {context} text, while the query itself is forwarded as {question}.
chain = (
    {
        "context": map_chain,
        "question": RunnablePassthrough(),
    }
    | final_prompt
    | llm
)

chain.invoke("Describe Victory Mansions")  # <- the query that is finally executed

AIMessage(content='Victory Mansions is a dilapidated and run-down apartment building in a dystopian society where Winston resides. The building is characterized by cramped living spaces, lack of basic amenities, and a general sense of decay. It has glass doors letting in gritty dust, a hallway smelling of boiled cabbage, and an old rag mat. A colored poster of an enormous face with the caption "BIG BROTHER IS WATCHING YOU" is tacked to the wall. The building has a faulty lift, intermittent electricity, and a telescreen for surveillance. It is located in a slummy quarter of the town and is where all four Ministries of the government can be seen simultaneously.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 137, 'prompt_tokens': 490, 'total_tokens': 627, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cach