In [1]:
# LLM , 사전학습 모델을 
#  나만의 데이터를 다룰수 있도록 하는 방법은

# 방법 1. 전이학습 (그중의 하나 Fine tunning (미세조정))
# 방벙 2. RAG : Retrieval Augmented Generation (검색증강생성)

# RAG
Retrieval Augmented Generation (검색증강생성)

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from langchain_openai.chat_models.base import ChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.runnables.passthrough import RunnablePassthrough

In [4]:
# 사전학습된 모델은 이미 많은 데이터를 통해 학습한 상태이긴 하나..
# 개인 DB 나 회사내 문서 와 같이 'private 한 데이터' 들에는 접근할수 없다
# 그래서 RAG 를 사용한다!

In [5]:
# 1. Retrieval 단계
# private 으로부터 제공된 data 를 사용하거나 탐색함으로써
# language model 의 능력을 더 '확장(augment)'

# 2. Augmented Generation
# Model 로 하여금 '우리가 보낸 문서 data 만'을 가지고 답변하도록 할수도 있다.
# (경우에 따라, 우리 문서가 더 최신 data 일수도 있기 때문이다)
# 이를 통해 Model 이 과거에 학습한 data 를 참조하지 않게도 할수 있다.

In [6]:
# RAG 는 특정 라이브러리나 프레임워크 이름이 아니라
# 위와 같은 작업을 하는 '기법'을 일반적으로 통칭하는 용어

# RAG 를 수행하는 방법은 굉~장히 많고 다양.

# Retrieve 란

https://python.langchain.com/v0.1/docs/modules/data_connection/

![](https://miro.medium.com/v2/resize:fit:1100/format:webp/1*qyXS4oRtrW2NhhMRBxsdQQ.png)

In [7]:
# RAG 의 첫번째 단계인 Retrieval 의 일반적인 과정
# - data source 에서 데이터 load
# - 데이터는 split 하면서 transform
# - transform 한 데이터를 embed.
# - embed 된 데이터를 store 에 저장.
# - 검색(질의) 가 입력되면 store 에서 관련 문서들을 retrieve!


# DataLoader

In [8]:
# 랭체인에서 제공하는 다양한 document loader 들이 있다
# CSV, File Directory, HTML, JSON, Markdown, PDF 등
# ※그 밖에서도 3rd party loader 들도 있다.

# v0.3 ★
# https://python.langchain.com/docs/integrations/document_loaders/

## 파일준비

In [9]:
# 아래와 같이 파일들을 준비합니다

# 출처는  조지오웰의 소설 '1984' Part1 Chapter1
#  http://www.george-orwell.org/1984/0.html

# 너무 길거나, 너무 짧지 않으면 좋습니다
# 파일이 너무 길면 나중에 임베딩 과정에서 비용지출이 발생.

In [10]:
llm = ChatOpenAI(temperature=0.1)

base_path = r'D:\NLP2507\dataset'

## TextLoader
텍스트 데이터 로더

In [11]:
from langchain_community.document_loaders.text import TextLoader

In [12]:
loader = TextLoader(os.path.join(base_path, 'chapter_one.txt'))

In [13]:
loader.load() # => List[Document] 리턴

[Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.txt'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat 

## PyPDFLoader

In [14]:
from langchain_community.document_loaders.pdf import PyPDFLoader

In [15]:
loader = PyPDFLoader(os.path.join(base_path, 'chapter_one.pdf'))
loader.load()

[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-01-30T23:19:00+09:00', 'author': 'Yeonchul Sung', 'moddate': '2025-01-30T23:19:00+09:00', 'source': 'D:\\NLP2507\\dataset\\chapter_one.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Part 1, Chapter 1 \n \n \nPart One \n \n \n1 \nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his \nchin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through \nthe glass doors of Victory Mansions, though not quickly enough to prevent a swirl of \ngritty dust from entering along with him. \n \nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured \nposter, too large for indoor display, had been tacked to the wall. It depicted simply an \nenormous face, more than a metre wide: the face of a man of about forty-five, with a \nheavy black moustache and ruggedly handsome feature

## UnstructuredFileLoader

In [16]:
# 서로 다른 타입의 문서를 읽어오기 위해 각각의 DataLoader 를 사용하기 보다
# UnstructuredFileLoader 라는 것도 사용해볼수 있다. -> 꽤 다양한 포맷의 파일을 읽어올 수 있다


In [17]:
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

In [18]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.pdf'))
loader.load()

  loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.pdf'))
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.pdf'}, page_content="Part 1, Chapter 1\n\nPart One\n\n1\n\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his\n\nchin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through\n\nthe glass doors of Victory Mansions, though not quickly enough to prevent a swirl of\n\ngritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured\n\nposter, too large for indoor display, had been tacked to the wall. It depicted simply an\n\nenormous face, more than a metre wide: the face of a man of about forty-five, with a\n\nheavy black moustache and ruggedly handsome features. Winston made for the stairs. It\n\nwas no use trying the lift. Even at the best of times it was seldom working, and at\n\npresent the electric current was cut off during daylight hours. It was part of the economy\n\ndrive in preparati

In [19]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.txt'))
loader.load()

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


[Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.txt'}, page_content="Part 1, Chapter 1\n\nPart One\n\n1 It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was

In [20]:
loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.docx'))
loader.load()

[Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat

# Splitter

## Data 를 Split 해야 하는 이유

In [21]:
# loader.load() 의 리턴값을 보면 'Document로 이루어진 list' 다.
# 지금의 경우는 전체 챕터가 '하나의 Document' 에 들어가 있다.


In [22]:
len(loader.load())

1

In [23]:
# 특정 질문에 답해야 하기 위해서, 필요한 '파일의 일부분' 만들 전달해야 할 수도 있다.

#  그래서 문서를 쪼개두어야(split) 한다

# 가령: "Ministry of peace" 를 찾고자 한다면.
# 해당 키워드가 있는 문서(들)만 모델에 넘겨주면 된다.

# 작은 조각들로 쪼개어 두면 필요한 것들을 찾기가 용이해진다.
#  - prompt 도 짧아질거다 (적은 token 사용, 적은 비용.)

# split 하는 방법은 다양하다.

In [24]:
"""
TextSplitter 계층도

BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter  # Example: CharacterTextSplitter
                                             RecursiveCharacterTextSplitter -->  <name>TextSplitter

https://python.langchain.com/api_reference/text_splitters/index.html

"""
None

## RecursiverCharacterTextSplitter

In [25]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

In [26]:
splitter = RecursiveCharacterTextSplitter()

In [27]:
# RecursiveCharacterTextSplitter 는 파일을 split 해주는데
# 문장의 끝이나, 문단의 끝부분마다 끊어준다.
# 문장 중간을 끊지는 않는다.  최대한 문장 중간에서 split 되지 않도록 하려 한다.
# 문장 중간에 짤림으로 의미있는 문장들을 잃고 싶지 않다.

# ↓ splitter 사용방법은 두가지 가 있다.

In [28]:
docs = loader.load()  # List[Document]

In [29]:
# 방법1
documents = splitter.split_documents(docs)  # => list[Document]
documents

[Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat

In [30]:
len(documents)
# 11개로 split 된 Document 들

11

In [31]:
# 첫번째 Document
print(documents[0].page_content)

Part 1, Chapter 1

Part One


1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.

The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his righ

### chunk_size=

In [32]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,  # 얼마나 큰 덩어리로 나눌지 지정.  (아까보다 굉장히 잘개 쪼개질거다)
                        # chunk_size 의 단위는 splitter 마다 다르다.
                    # CharacterTextSplitter 의 경우 chunk_size 는 문자개수
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

documents[:5]


3498


[Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content='Part 1, Chapter 1\n\nPart One'),
 Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content='1'),
 Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content='It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors'),
 Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content='was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of'),
 Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content='cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to

In [33]:
# ↑ 문제점: 문단의 중간이 잘려버렸다 -> 문장의 의미가 파괴된다.

# 작은 덩어리이면서 문장의 중간을 잘라먹지 않는 방법은?
# chunk_overlap=
#    split 할때 앞 조각의 일부를 가져와서 연결해준다.
#    Document 간의 겹치는 부분 생길수 있다.

In [34]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,  
    chunk_overlap=50,
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

for document in documents[10:15]:
    print('🔷', document.page_content)

250
🔷 move. BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
🔷 Inside the flat a fruity voice was reading out a list of figures which had something to do with the production of pig-iron. The voice came from an oblong metal plaque like a dulled mirror which
🔷 an oblong metal plaque like a dulled mirror which formed part of the surface of the right-hand wall. Winston turned a switch and the voice sank somewhat, though the words were still distinguishable.
🔷 though the words were still distinguishable. The instrument (the telescreen, it was called) could be dimmed, but there was no way of shutting it off completely. He moved over to the window: a
🔷 it off completely. He moved over to the window: a smallish, frail figure, the meagreness of his body merely emphasized by the blue overalls which were the uniform of the party. His hair was very


## CharacterTextSplitter

In [35]:
from langchain_text_splitters.character import CharacterTextSplitter

In [36]:
# CharacterTextSplitter 도 동작방식은 비슷하다
# separator=  : 특정 문자열 찾은 다음 이를 기준으로 분할한다.

splitter = CharacterTextSplitter(
    separator='\n',   # 줄바꿈 문자 기준으로 쪼갬.
    chunk_size=600,  
    chunk_overlap=100,
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

for document in documents[0:5]:
    print('🔷', document.page_content)

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 881, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 671, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1289, which is longer than the specified 600
Created a chunk of size 1605, which is longer than the specified 600
Created a chunk of size 1900, which is longer 

46
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
🔷 The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his r

### length_function=

In [37]:
# splitter 에 lenth 를 계산하는 함수를 제공해줄수 있다.
#  length_function=   
#    기본적으론 파이썬의 len() 을 사용한다 (디폴트)

In [38]:
splitter = CharacterTextSplitter(
    separator='\n',   # 줄바꿈 문자 기준으로 쪼갬.
    chunk_size=600,  
    chunk_overlap=100,
    length_function=len,  # (디폴트)
)

documents = loader.load_and_split(text_splitter=splitter)
print(len(documents))

for document in documents[0:5]:
    print('🔷', document.page_content)

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 881, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 671, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1289, which is longer than the specified 600
Created a chunk of size 1605, which is longer than the specified 600
Created a chunk of size 1900, which is longer 

46
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
🔷 The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his r

# TikToken

## OpenAI Tokenizer

In [39]:
# OpenAI 에서의 token 예시
# https://platform.openai.com/tokenizer
# ↓ model 의 관점에서, 몇개의 token 을 사용하는지 확인해 볼수 있다.


## from_tiktoken_encoder()

In [40]:
# tiktoken 은 OpenAI 에 의해 만들어진거다.
# https://github.com/openai/tiktoken   <- 아까 위의 Tokenizer 페이지 하단에 보면 이 링크가 있다.

# 아래의 from_tiktoken_encoder() 을 사용하면 tiktoken 패키지가 동작하는 것이다.


splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,   # 600 글자가 아니라 600 token 단위로 쪼개짐!
    chunk_overlap=100,
)

# 방법2
documents = loader.load_and_split(text_splitter=splitter)
print(f'💚 {len(documents)} 개')

for document in documents[0:5]:
  print('🔷', document.page_content)

💚 17 개
🔷 Part 1, Chapter 1
Part One
1
It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.
The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his

# Vectors

## Embedding 과 Vector

In [41]:
# Embedding 은 사람이 읽는 텍스트를 컴퓨터가 이해(연산)할 수 있는 숫자들(벡터)로 변환하는 작업이다.
# 우리가 만든 Document 마다 각각의 벡터를 만들어 주게 될겁니다.
# OpenAI 는 크기가 최소 1000차원 이상!의 벡터를 제공해준다.


In [42]:
"""
3개의 차원을 정의해보자

첫번째 차원을 Masculinity (남성성)
두번째 차원을 Femininity (여성성)
세번째 차원을 Royalty (왕족스러움)

이제 특정 단어에 대한 차원 값(점수)를 줘보자

        Masculinity | Femininity  | Royalty
king    | 0.9       | 0.1         | 1.0
queen   | 0.1       | 0.9         | 1.0
man     | 0.9       | 0.1         | 0.0
woman   | 0.1       | 0.9         | 0.0

이렇게 3차원 벡터에 점수를 매겨 보았다.

단어를 이렇게 벡터로 점수를 매기면 '연산'을 할수 있게 된다.

king - man =  0.0    |    0.0      |  1.0   ==> 'Royalty'
royal + woman =  0.1   | 0.9        | 1.0   ===> 'queen'

★텍스트를 벡터화 하니까 '의미에 대한 연산'이 가능해진다!

"""
None

![](https://miro.medium.com/v2/resize:fit:2000/1*SYiW1MUZul1NvL1kc1RxwQ.png)

## word2vec 예시

https://turbomaze.github.io/word2vecjson/

In [43]:
# 

# Vector Store

## OpenAIEmbeddings

In [44]:
from langchain_openai.embeddings.base import OpenAIEmbeddings

In [45]:
embedder = OpenAIEmbeddings()   # OpenAI 사의 embedding model

In [46]:
embedder.model

# 'text-embedding-ada-002'
# https://platform.openai.com/docs/models/text-embedding-ada-002
# 1M token 당 $0.1

'text-embedding-ada-002'

In [47]:
# OpenAIEmbeddings 를 통해
#  embed_documents()  <- 문서를 embed 하는것 뿐만 아니라
#  embed_query()      <- query 도 embed 하는 것이 가능하다.


In [48]:
vector = embedder.embed_query("Hi")  # <- 모델 호출 발생했다!
print(len(vector))  # 1536 차원의 벡터
print(vector)  # -> List[Float]

1536
[-0.03629858046770096, -0.007224537897855043, -0.03371885418891907, -0.02866363152861595, -0.02686564065515995, 0.03460482135415077, -0.012318846769630909, -0.007752209436148405, 0.0019380523590371013, -0.0027018729597330093, 0.024781012907624245, -0.002477124100551009, -0.00573272630572319, -0.002905449829995632, 0.006677323020994663, -0.00303248199634254, 0.033849142491817474, -0.001503212028183043, 0.02109382674098015, -0.008996471762657166, -0.02171921543776989, 0.01038405206054449, 0.006244111340492964, 0.007081219926476479, -0.012312332168221474, 0.0008998099947348237, 0.005876044277101755, -0.009888952597975731, -0.0030731973238289356, -0.024572549387812614, 0.010742347687482834, -0.01381065882742405, -0.024429231882095337, -0.01411032397300005, 0.0024347801227122545, -0.018878910690546036, 0.0005618723225779831, -0.011270018294453621, 0.018110202625393867, -0.009967125952243805, 0.01302892342209816, -0.011328648775815964, -0.009133275598287582, -0.009654432535171509, -0.02

In [49]:
vectors = embedder.embed_documents([
    "hi",
    "how are you",
    "good to meet you",    
])  # List[List[Float]]

In [50]:
len(vectors)

3

In [51]:
for vector in vectors:
    print(len(vector))  # 각각 동일한 차원으로 임베딩됨.

1536
1536
1536


In [52]:
# 코드를 실행할때마다 '매번' 문서 embedding 을 반복해서 수행하는건 매우 비효율적이다
#  => 시간 소요 + 또한 비용 지출

# 대신! 그 embeded 된 결과들을 '저장'해 줄겁니다.
# LangChain 은 embedding 한것들을 캐싱하는 기능을 제공해준다

# 동일 Document 는 가급적 한번만 embedding 해주는게 좋다.

## Vector Store

In [53]:
# 일단 벡터를 만들고 나서, 그것들을 캐시해주고, vector store 에 넣어주면,
# 우리가 '검색'을 할수 있다.
# 그리하여, '관련있는 문서'들만 찾아낼수 있게 되는 거다

# 랭체인은 다양한 vector store 를 제공한다,  어떤거는 cloud 형태이고, 어떤건 유료이기도 하다.

# 우리는 예제에서 무료로 사용할수 있고 로컬로 저장되는 Chroma 라는 것을 사용해볼겁니다


## Chroma vector store

In [54]:
from langchain_community.vectorstores.chroma import Chroma

In [55]:
# ↓이 ChromaDB 에 'split 된 문서' 와 'OpenAI embedding model' 을 전달해야 한다

# OpenAIEmbeddings 의 옵션에 model= 이 있다. 여기에 원하는 모델 지정가능 (지정안하면 default 동작)

# ★ embedding 모델을 사용하는것도 비용이 발생한다!

# 참고) OpenAI 사에서 제공하는 embedding 모델 정보
#   https://platform.openai.com/docs/guides/embeddings
#   2025.1 현재 : text-embedding-3-small 과 text-embedding-3-large 이 최신 임베딩모델

In [56]:
# 아래 코드 복사
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader(os.path.join(base_path, 'chapter_one.docx'))

docs = loader.load_and_split(text_splitter=splitter)

In [57]:
embeddings = OpenAIEmbeddings()

In [58]:
# 여기에 '문서' 와 '임베딩모델' 을 건네준다
vectorstore = Chroma.from_documents(docs, embeddings)


In [59]:
# vector store 를 사용하여 '유사도 검색' 을 할수 있다.
results = vectorstore.similarity_search("where does winston live")  # -> List[Document] 리턴
print(len(results), '개')

results

4 개


[Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's bre

In [60]:
results[0].page_content

"The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of colourless liquid with a plain white l

In [61]:
results[1].page_content

'Part 1, Chapter 1\nPart One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his rig

## embedding cache.

In [62]:
# 위 코드들을 다시 실행하면 임베딩 결과들 다 사라지고 다시 임베딩 실행 (모델 호출!)
# embedding 결과를 캐시해두자!

In [63]:
from langchain.embeddings.cache import CacheBackedEmbeddings

In [64]:
# 파일의 형태로 캐시해볼거다
from langchain.storage.file_system import LocalFileStore

In [65]:
# 캐시 경로 지정
cache_dir = LocalFileStore(os.path.join(base_path, '.cache'))

In [66]:
embeddings = OpenAIEmbeddings()

In [67]:
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,  # embedder (모델)
    cache_dir,   # embedding 을 저장할 장소
)

In [68]:
# Chroma.from_documents(docs, embeddings)
vectorstore = Chroma.from_documents(
    docs, 
    cached_embeddings,
    persist_directory=os.path.join(base_path, '.cache'),
)

In [69]:
# ↑ 이렇게 하면,  
#  최초에 Chroma.from_documents() 를 호출할때는 
#     OpenAIEmbeddings 을 사용하여 임베딩. 결과는 cache 함
#  다음에 Chroma.from_documents() 를 호출할때는
#      OpenAIEmbeddings 대신에 미리 cache 되어 있듣 embeddings 를 전달할거다.

# 위코드를 실행하여 우리가 또 파일 embedding 작업을 할때는,
# 1.첫번째로!
#   캐시에 embeddings 가 이미 존재하는지 확인할거다.
# 2.만약 없다면!
#    vector store(Chroma.from_documents) 를 호출할 때
#   문서들(docs) 과 함께 OpenAIEmbeddings 를 사용할거다.
#

In [70]:
results = vectorstore.similarity_search("where does winston live")
results

[Document(metadata={'source': 'D:\\NLP2507\\dataset\\chapter_one.docx'}, page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's bre

In [71]:
from glob import glob
for cached_file in glob(os.path.join(base_path, '.cache', '*')):
    with open(cached_file, 'r') as f:
        print(f.read())
        break

[-0.023815609514713287, -0.009824610315263271, -0.0004901385400444269, -0.01809018664062023, -0.025858482345938683, 0.000790017656981945, -0.006427660584449768, -0.019797060638666153, -0.020374979823827744, -0.038196366280317307, -0.0031046306248754263, 0.04034676030278206, -0.003279350232332945, -0.01424635760486126, 0.012344603426754475, 0.017727306112647057, 0.04596466198563576, 0.02725623920559883, 0.030562467873096466, -0.03580405190587044, -0.01424635760486126, 0.0008416774799115956, -0.014340437017381191, 0.006179021671414375, -0.03204086422920227, -0.008796453475952148, 0.015899471938610077, -0.022955451160669327, 0.0020865537226200104, -0.00772125693038106, 0.003443989669904113, -0.022888250648975372, 0.0017219948349520564, 0.006585580296814442, -0.03975540027022362, -0.02638264186680317, -0.007741416804492474, -0.01804986596107483, 0.006817419547587633, -0.00946845207363367, 0.013977558352053165, -0.0013498759362846613, -0.004502386320382357, 0.0008462974801659584, -0.0306431

# 5.Langsmith

https://www.langchain.com/langsmith

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRejzbKjev2a8d-EKtUU06p84fh5NX_S7dDLA&s)

In [72]:
# LangSmith 는 (LLM) 기반 애플리케이션의
# 개발, 디버깅, 테스트, 평가, 모니터링, 배포를 지원하는 통합 플랫폼 입니다
# https://www.langchain.com/langsmith

# LangSmith를 사용하면 우리의 체인이 무엇을 하고 있는지 시각적으로 볼수 있다.

# ★ 우선 위 사이트에 회원가입하고 API Key 받아옵니다 ★

In [73]:
"""
↓ .env 파일에 환경변수 입력 (추가)

OPENAI_API_KEY=xxxx
LANGCHAIN_TRACING_V2=true
LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
LANGCHAIN_API_KEY=xxxx
"""
None

In [74]:
# 환경변수 다시 로딩
load_dotenv()

True

In [75]:
os.getenv('LANGCHAIN_API_KEY')[:10]

'lsv2_pt_9a'

In [76]:
# 메모리 예제

from langchain_openai.chat_models import ChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain.memory.summary_buffer import ConversationSummaryBufferMemory
from langchain_core.runnables.passthrough import RunnablePassthrough
from langchain_core.prompts.chat import MessagesPlaceholder

llm = ChatOpenAI(temperature=0.1)

memory = ConversationSummaryBufferMemory(
    llm=llm,
    max_token_limit=120,
    return_messages=True,
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful AI talking to a human"),
        MessagesPlaceholder(variable_name="history"),
        ("human", "{question}"),
    ]
)

def load_memory(_):
    return memory.load_memory_variables({})["history"]

chain = RunnablePassthrough.assign(history=load_memory) | prompt | llm


def invoke_chain(question):
    result = chain.invoke({"question": question})  # ★ 체인 실행!
    memory.save_context(
        {"input": question},
        {"output": result.content},
    )
    print(result)

invoke_chain("My name is John")
invoke_chain("What is my name?")



  memory = ConversationSummaryBufferMemory(


content='Hello John! How can I assist you today?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 24, 'total_tokens': 34, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-ClXsqUnPFDYNS2g7jIXmzIw9HinKo', 'finish_reason': 'stop', 'logprobs': None} id='run--51e1bd79-e30c-45c2-928f-818f61ea8534-0' usage_metadata={'input_tokens': 24, 'output_tokens': 10, 'total_tokens': 34, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
content='Your name is John.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 47, 'total_tokens': 52, 'completion_tokens_details': {'accepted_prediction_to

In [77]:
# 다양한 RAG 구현하는 document chain 을 사용
# 방법1: RetrievalQA 사용 (off-the-shelf chain)  (현재는 deprecated)
# 방법2: LCEL 직접 구현 (권장*)


# RetrievalQA

## Stuff Document chain 

In [79]:
from langchain.chains.retrieval_qa.base import RetrievalQA

In [80]:
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',    # 기본값은 'stuff'
    
    # retriever= 는 class interface 다
    #  Document들을 여러 소스로부터부터 retrieve(선별해서 가져오기) 할 수 있다.
    #  (vector store 뿐 아니라, DB, doc, cloud, 등 다른 곳에서부터도 가능)
    #  기존에 만들어 놓은 vectorstore로부터 retriever 를 뽑아올수 있다. =>  .as_retriever()
    retriever=vectorstore.as_retriever(),
)

chain.invoke("Where does Winston live?")

{'query': 'Where does Winston live?',
 'result': 'Winston lives in Victory Mansions, which is a building with seven flights of stairs.'}

In [81]:
chain.invoke("Describe Victory Mansions")

{'query': 'Describe Victory Mansions',
 'result': 'Victory Mansions is a building where Winston Smith lives. It has glass doors that let in gritty dust, and the hallway smells of boiled cabbage and old rag mats. The building has a faulty lift that is rarely working, and the electric current is cut off during daylight hours as part of an economy drive. Inside the flat, there is a telescreen on the wall that cannot be completely shut off, and a fruity voice reading out figures related to pig-iron production. The building is described as having seven flights of stairs, with a poster of an enormous face with the caption "BIG BROTHER IS WATCHING YOU" on each landing.'}

## Refine document chain

In [83]:
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='refine', # chain type 변경!
    retriever=vectorstore.as_retriever(),
)

print(chain.invoke("Where does Winston live?"))
print('🟨' * 20)
print(chain.invoke("Describe Victory Mansions"))

{'query': 'Where does Winston live?', 'result': "In addition to living in Victory Mansions on the seventh floor, Winston also has a unique living situation within his flat. He has positioned himself in a shallow alcove in the living room, away from the telescreen's direct line of sight, allowing him some privacy from its constant surveillance. Despite the oppressive environment, Winston finds solace in a forbidden book he acquired from a junk shop, which is a rare and beautiful item that he feels compelled to possess. This book becomes a symbol of rebellion and individuality in a society where such thoughts are strictly prohibited."}
🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨
{'query': 'Describe Victory Mansions', 'result': 'Victory Mansions, where Winston Smith resides, is a stark and oppressive apartment building in a dystopian society controlled by a totalitarian regime. The building is dilapidated, with cramped apartments and shabby furnishings, reflecting the bleak environment in which Winston lives. T

## FAISS vector store

- FAISS (Facebook AI Similarity Search) 는 Facebook AI에서 개발한 고속 벡터 검색 및 유사도 검색 라이브러리.
- 대량의 고차원 벡터 데이터를 빠르게 검색할 수 있도록 최적화되어 있다,
- AI, 자연어 처리(NLP), 이미지 검색, 추천 시스템 등에서 자주 사용됨.
- 기본적으로 메모리 기반으로 동작한다.

In [84]:
from langchain_community.vectorstores.faiss import FAISS

In [85]:
vectorstore = FAISS.from_documents(documents, embeddings)

In [86]:
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='refine', # chain type 변경!
    retriever=vectorstore.as_retriever(),   # 변경된 retriever
)

print(chain.invoke("Where does Winston live?"))
print('🟨' * 20)
print(chain.invoke("Describe Victory Mansions"))

{'query': 'Where does Winston live?', 'result': "Winston lives in Victory Mansions on the seventh floor, where he strategically positioned himself in an alcove to remain outside the range of the telescreen in his living room. The oppressive atmosphere of constant surveillance and control by the Party, symbolized by the posters of Big Brother and INGSOC, permeates his living space. Winston's cautious actions and awareness of being monitored at all times reflect the pervasive fear and paranoia instilled by the totalitarian regime. The Ministry of Love, a place of extreme fear and control, looms ominously in the background, serving as a constant reminder of the Party's power and reach. Despite the scarcity of food and the harsh living conditions, Winston finds solace in Victory Gin and Victory Cigarettes, temporary escapes from the harsh reality of his existence."}
🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨
{'query': 'Describe Victory Mansions', 'result': 'Victory Mansions is a building complex where Winston S

## Map Reduce document chain

In [87]:
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='map_reduce',  # Map Reduce document chain
    retriever=vectorstore.as_retriever(),
)

print(chain.invoke("Where does Winston live?"))
print('🟨' * 20)
print(chain.invoke("Describe Victory Mansions"))

{'query': 'Where does Winston live?', 'result': "Winston Smith lives in Victory Mansions on the seventh floor, in a flat with a telescreen in the living-room. The exact physical address or location of Winston's residence is not explicitly mentioned in the provided text."}
🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨
{'query': 'Describe Victory Mansions', 'result': 'Victory Mansions is a dilapidated apartment building located in London, the chief city of Airstrip One in Oceania, in George Orwell\'s novel "1984." The building is described as having a shabby appearance, with broken elevators, faulty plumbing, and dimly lit corridors. It is overcrowded, with poor sanitation and facilities. The apartments are small and cramped, lacking basic amenities and comfort. Victory Mansions is a symbol of the oppressive and bleak living conditions under the totalitarian regime of the Party in the novel.'}


## Map re-rank documents chain

In [88]:
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='map_rerank',  # Map re-rank document chain
    retriever=vectorstore.as_retriever(),
)

print(chain.invoke("Where does Winston live?"))
print('🟨' * 20)
print(chain.invoke("Describe Victory Mansions"))



{'query': 'Where does Winston live?', 'result': 'Victory Mansions'}
🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨🟨




{'query': 'Describe Victory Mansions', 'result': 'Victory Mansions is a place where Winston lives. It is a rundown apartment complex with a telescreen in every room. The building is in disrepair, with no food available except for dark-colored bread. The residents are constantly under surveillance and live in fear of the Ministry of Love. The living conditions are poor, with Winston having to drink Victory Gin, which is described as having a sickly, oily smell. Overall, Victory Mansions is a bleak and oppressive place.'}


# LCEL chain - Stuff

## Retriever 의 입출력
Retriever 도 Chain 을 구성하는 component 다

https://python.langchain.com/docs/concepts/retrievers/

- Retriever 의 입력
  - 질문이나, 그와 관련성이 있는 Document 를 얻기위한 query (한개의 string)

- Retriever 의 출력
  - Document 들의 List

In [89]:
llm = ChatOpenAI(temperature=0.1)

In [90]:
retriever = vectorstore.as_retriever()

In [91]:
# chain 에서 retreiver 는 Document 의 list 를 리턴하게 된다
# 그리고 그 document 들은 template 에 입력되어야 한다
# 그렇게 만들어진 prompt 가 LLM 에 전달되어야 한다.

prompt = ChatPromptTemplate.from_messages([
    ('system', """
        You are a helpful assistant.
        Answer questions using only the following context.
        If you don't know the answer just say you don't know,
        don't make it up:\n\n{context}    
    """),
    ('human', "{question}"),
])


In [93]:
chain = retriever | prompt | llm

# chain.invoke("Describe Victory Mansions")  # <- 에러!

# ↑↑↑↑↑↑
# 1. 이 문자열 query기 retriever 에 전달되는거다.
# 2. retriever 는 List[Document] 를 리턴할거고 prompt 의 {context} 로 전달되어야 하는데, ...
#    또한, query 는 prompt 의 {question}으로 입력되어야 하는데..

#  당연히 지금은 작동하지 않는다.  (알아서 동작해주는게 아니다.)

# TypeError: Expected mapping type as input to ChatPromptTemplate. Received <class 'list'>.

In [94]:
#  prompt 의 {context} property 는 retriever 로부터 받아 오도록 하고
#  retriever 는 invoke 에 입력해준 query 를 받아 호출(call)되어야 한다

# 또한 query 는 prompt 의 {question} property 로 전달되게 해줘야 한다
# 어떻게? RunnablePassthrough 를 사용!

from langchain_core.runnables.passthrough import RunnablePassthrough

# RunnablePassthrough 는 간단한 기능의 class 다.
#  입력값을 그대로 통과하게 (pass through) 해준다
#  "question": RunnablePassthrough()  =>  "question": "Descrive Victory Mansions"

In [95]:
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building where Winston Smith resides. It is described as having glass doors, a hallway that smells of boiled cabbage and old rag mats, and being seven flights up. The building has a faulty lift, and the electric current is cut off during daylight hours as part of an economy drive in preparation for Hate Week. Inside the flat, there is a telescreen that cannot be completely shut off, and there is a poster with the caption "BIG BROTHER IS WATCHING YOU" on each landing.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 106, 'prompt_tokens': 2096, 'total_tokens': 2202, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-ClYy2rA48IIQzjLhLMIb0D9x3yfoQ', 'finish_reason': 'stop',

In [99]:
chain = ( 
    {
        "context": retriever, 
        "question": RunnablePassthrough(),
        "xxxx": RunnablePassthrough(),
    } 
    | prompt 
    | llm
)

chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building where Winston Smith resides. It is described as having glass doors, a hallway that smells of boiled cabbage and old rag mats, and being seven flights up. The building has a faulty lift, with an enormous face poster that reads "BIG BROTHER IS WATCHING YOU" on each landing opposite the lift-shaft. Inside the flat, there is a telescreen that cannot be completely shut off, and the building is part of the city of London in Airstrip One.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 105, 'prompt_tokens': 2096, 'total_tokens': 2201, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-ClZ0vuC0hmZMiDZcv1amOm1KrIyEP', 'finish_reason': 'stop', 'logprobs': None}, id='run

# LCEL chain - Map Reduce

In [100]:
# [Map 단계]
# - query 를 입력하면, documents 들을 입력받아서 각각의 요약작업을 수행함 -> '각각의 요약결과물' 출력

# [Reduce 단계]
# - 체인 출력을 새 문서로 처리합니다.
# - 그런 다음 모든 새 문서를 별도의 결합 문서 체인으로 전달하여 '단일 출력'을 얻는다.

In [None]:
"""
stuff 방식과 map reduce 방식.  어는 것을 사용하는 것이 더 효율적인가?

정답은!
원하는 prompt 의 크기와 검색할 document 의 수에 따라 달라진다.
만약 retriever 가 검색 결과로 천 개 이상의 Document 를 리턴한다면 stuff 방식은 부적절하다.
왜냐하면 stuff 는 prompt 에 그 document 들을 모두 넣을 수 없기 때문이다. 부적합하다.

바로 위와 같은 상황이 (document 가 아주 많은!) 바로 map reduce 방식이 빛을 발하는 순간이다.

※ 후에 우리가 MeetingGPT(회의록 요약) 를 만들때에도 이런 형식의 코드로 구현할겁니다.
  왜냐하면 우리는 일종의 회의 기록을 요약하는 기능을 구현 할 것이기 때문에
  각 Document 를 하나하나 읽으면서 내용을 요약하는 로직이 필요합니다.
  비록 retriever 검색같은건 없지만, 개별 document 에 접근해서 내용을 요약한다.
  그렇게 하면 회의 내용을 요약할수 있게 될겁니다!

이제 LCEL 을 사용해 이 로직을 구현해봅시다
"""
None

In [101]:
# ▶ list of docs

# ▶ for doc in list of docs | prompt | llm

# ▶ for resposne in list of llms responee | put them all together

# ▶ final doc | prompt | llm

from langchain_core.runnables.base import RunnableLambda


# ---------------------------------------------------------------
# map_doc_chain 은 저 아래 map_docs() 안에서 각 document 에 대해 실행될 chain 이다
map_doc_prompt = ChatPromptTemplate.from_messages([
    ('system', """
      Use the following portion of a long document to see if any of the text is relevant to answer the question.
      Return any relevant text verbatim.
      If there is no relevant text, return : ''
      -------
      {context}    
    """),
    ('human', '{question}')
])

map_doc_chain = map_doc_prompt | llm

# ---------------------------------------------------------------
#  map_docs(inputs) 함수는
#    저 아래에 있는 map_chain 에서의 출력을 받아서 수행할 함수.
#    리턴값은 한개의 string 이어야 한다.  (※아래 map_chain 에서 설명)

def map_docs(inputs):
    documents = inputs['documents']
    question = inputs['question']

    # 각각의 Document 에 대해 또 다른 체인
    # map_doc_chain 을 수행한 결과를 join 하여 하나의 string 으로 리턴!
    return "\n\n".join(
        map_doc_chain.invoke(
            {
                'context': doc.page_content,
                'question': question
            }
        ).content
        for doc in documents
    )

# ---------------------------------------------------------------
# map_chain 은 최종 chain 내부 에서 호출될거다.
# map_chain 은 두개의 데이터 필요
# - documents : documents 는 retriever 를 사용해 얻을수 있다.
# - question : map_chain 은 사용자 질문이 필요.  그래야 LLM 에게 요청할 수 있다.
#         '각 document' 를 살펴보면서 사용자 질문에 대답하는데 필요한 정보가
#         담겨져 있는지 알아보고 추출해달라고 하는 거다.
#

map_chain = (
    {
        "documents": retriever, 
        "question": RunnablePassthrough()
    }

    # 이제 위 결과를 map_docs() 의 input 으로 주어 호출할거다
    # 이를 위해 RunnableLambda 사용★
    # RunnableLambda 는 'chain 과 그 내부 어디에서든' function 을 호출할수 있도록 해준다

    # 이때 건내주는 map_docs() 의 리턴값은?
    #   map_chain의 리턴값은 최종 chain 의 "context" 값으로 전달되어야 한다
    #   따라서 map_chain의 목표는 한개의 string 을 리턴해야 하는 거다.
    #   그 string 은 사용자의 question 에 대한
    #   답변 생성에 관련이 있는 정보를 포함한 document 의 일부나 전체다.
    #   따라서 map_docs() 의 리턴값은 string 이어야 한다.
    
    | RunnableLambda(map_docs)
)


# ---------------------------------------
final_prompt = ChatPromptTemplate.from_messages([
    ('system', """
      Given the following extracted parts of a long document and a question, create a final answer.
      If you don't know the answer, just say that you don't know. Don't try to make up an answer.
      ------
      {context}    
    """),
    ('human', "{question}"),
])


#  최종 chain
# {context} 에는 위에서 만든 map_chain 호출결과 output 이 넘겨진다.
# map_chain 호출시 invoke(query) 의 query 가 map_chain 의 input 으로 넘겨진다는 사실을 잊지 말자.
chain = {"context": map_chain, "question": RunnablePassthrough()} | final_prompt | llm


chain.invoke("Describe Victory Mansions")  # <- 최종적으로 실행할 query

AIMessage(content='Victory Mansions is a dilapidated and run-down apartment building in a dystopian society where Winston resides in the novel "1984" by George Orwell. The building has no elevators, poorly maintained staircases, small and cramped apartments with peeling wallpaper and faulty plumbing. It has glass doors letting in gritty dust, a hallway smelling of boiled cabbage, and an old rag mat. A colored poster of an enormous face with the caption "BIG BROTHER IS WATCHING YOU" is tacked to the wall. The building has a faulty lift, with electricity cut off during daylight hours for an economy drive. Winston\'s flat is located seven flights up, with a telescreen that cannot be completely shut off. The building allows a view of all four Ministries of government simultaneously. The living-room contains a telescreen in an unusual position, and there is a shallow alcove where Winston can sit to avoid being seen by the telescreen.', additional_kwargs={'refusal': None}, response_metadata=

In [102]:
chain.invoke("How many ministries are mentioned") 

AIMessage(content='Three ministries are mentioned: the Ministry of Love, the Ministry of Plenty, and the Ministry of Truth.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 140, 'total_tokens': 161, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-ClZkyWHBEv2tjhecV8Cx5k5TN0KMQ', 'finish_reason': 'stop', 'logprobs': None}, id='run--3315d76c-63e3-41ec-b6bd-faed7bc3e9a5-0', usage_metadata={'input_tokens': 140, 'output_tokens': 21, 'total_tokens': 161, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})