# 7 文档转换器
加载文档后，您通常需要转换它们以更好地适合您的应用程序。

最简单的示例是，您可能希望将长文档拆分为可以放入模型上下文窗口的较小块。

LangChain 有许多内置的文档转换器，可以轻松拆分、组合、过滤和以其他方式操作文档。



当您想要处理长文本时，有必要将该文本拆分为块。

听起来很简单，但这里有很多潜在的复杂性。

理想情况下，您希望将语义相关的文本片段放在一起。

“语义相关”的含义可能取决于文本的类型。本笔记本展示了实现此目的的几种方法。

默认推荐的文本拆分器是 RecursiveCharacterTextSplitter。此文本拆分器接受一个分隔符（字符）列表作为参数。它首先尝试按列表中的第一个分隔符拆分文本来创建块；如果某个块仍然太大，就改用下一个分隔符，依此类推。默认情况下，它依次尝试的分隔符是 ["\n\n", "\n", " ", ""]。

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# This is a long document we can split up.
# Read it with an explicit encoding so the result does not depend on the
# platform's default locale encoding (assumes the file is stored as UTF-8).
with open('documentstore/state_of_the_union.txt', encoding='utf-8') as f:
    state_of_the_union = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,  # chunk length is measured with len()
    add_start_index=True,  # each chunk's metadata gets a 'start_index' entry
)
# create_documents takes a list of texts and returns the resulting chunks.
texts = text_splitter.create_documents([state_of_the_union])
print(texts[0])
print(texts[1])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and of' metadata={'start_index': 0}
page_content='of Congress and of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' metadata={'start_index': 82}


In [2]:
# Example of passing metadata along with the texts: one metadata dict is
# supplied per input text, and the chunks produced from a text carry that
# text's metadata.
metadatas = [{"document": 1}, {"document": 2}]
documents = text_splitter.create_documents(
    [state_of_the_union, state_of_the_union],
    metadatas=metadatas,
)
print(documents[0])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and of' metadata={'document': 1, 'start_index': 0}


In [3]:
# split_text works on a raw string; display only the first resulting chunk.
text_splitter.split_text(state_of_the_union)[0]

'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and of'

In [4]:
# Splitting source code with language support: import the Language enum,
# whose members identify the language to split.
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)
# Full list of supported languages
[e.value for e in Language]

['cpp',
 'go',
 'java',
 'js',
 'php',
 'proto',
 'python',
 'rst',
 'ruby',
 'rust',
 'scala',
 'swift',
 'markdown',
 'latex',
 'html',
 'sol']

In [5]:
# You can also see the separators used for a given language
# (here: the separator list for Python source code).
RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)

['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']

In [6]:
# Example: splitting Python source code with a splitter built for
# Language.PYTHON.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""
# Build the splitter via from_language, with small chunks and no overlap.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)
python_docs = python_splitter.create_documents([PYTHON_CODE])
# Display the resulting chunks.
python_docs

[Document(page_content='def hello_world():\n    print("Hello, World!")', metadata={}),
 Document(page_content='# Call the function\nhello_world()', metadata={})]

In [7]:
# Example: splitting JavaScript source code with a splitter built for
# Language.JS.
JS_CODE = """
function helloWorld() {
  console.log("Hello, World!");
}

// Call the function
helloWorld();
"""

# Build the splitter via from_language, with small chunks and no overlap.
js_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.JS, chunk_size=60, chunk_overlap=0
)
js_docs = js_splitter.create_documents([JS_CODE])
# Display the resulting chunks.
js_docs

[Document(page_content='function helloWorld() {\n  console.log("Hello, World!");\n}', metadata={}),
 Document(page_content='// Call the function\nhelloWorld();', metadata={})]

In [8]:
# Example: splitting Markdown text with a splitter built for
# Language.MARKDOWN.
markdown_text = """
# 🦜️🔗 LangChain

⚡ Building applications with LLMs through composability ⚡

## Quick Install

```bash
# Hopefully this code block isn't split
pip install langchain
```

As an open source project in a rapidly developing field, we are extremely open to contributions.
"""
# NOTE: with chunk_size=60 the fenced code block above does end up spread
# over several chunks in the recorded output of this cell.
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0
)
md_docs = md_splitter.create_documents([markdown_text])
# Display the resulting chunks.
md_docs

[Document(page_content='# 🦜️🔗 LangChain', metadata={}),
 Document(page_content='⚡ Building applications with LLMs through composability ⚡', metadata={}),
 Document(page_content='## Quick Install\n\n```bash', metadata={}),
 Document(page_content="# Hopefully this code block isn't split", metadata={}),
 Document(page_content='pip install langchain', metadata={}),
 Document(page_content='```', metadata={}),
 Document(page_content='As an open source project in a rapidly developing field, we', metadata={}),
 Document(page_content='are extremely open to contributions.', metadata={})]

In [11]:
# Recursive splitting by character
from langchain.text_splitter import RecursiveCharacterTextSplitter

# This is a long document we can split up.
# An explicit encoding keeps the read independent of the platform's default
# locale encoding (assumes the file is stored as UTF-8).
with open('documentstore/state_of_the_union.txt', encoding='utf-8') as f:
    state_of_the_union = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,  # chunk length is measured with len()
)
texts = text_splitter.create_documents([state_of_the_union])
print(texts[0])
print(texts[1])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and of' metadata={}
page_content='of Congress and of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' metadata={}


In [12]:
# Display the first two chunks produced from the raw string.
text_splitter.split_text(state_of_the_union)[:2]

['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and of',
 'of Congress and of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.']

In [13]:
# Split by tokens
from langchain.text_splitter import CharacterTextSplitter

# An explicit encoding keeps the read independent of the platform's default
# locale encoding (assumes the file is stored as UTF-8).
with open('documentstore/state_of_the_union.txt', encoding='utf-8') as f:
    state_of_the_union = f.read()

# NOTE(review): from_tiktoken_encoder presumably measures chunk_size in
# tiktoken tokens rather than characters -- confirm against the LangChain docs.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=0
)
texts = text_splitter.split_text(state_of_the_union)
print(texts[0])

Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.


In [14]:
# A tiktoken-based splitter can also be loaded directly.
from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

# Split the previously loaded text and print only the first chunk.
texts = text_splitter.split_text(state_of_the_union)
print(texts[0])

Madam Speaker, Madam Vice President, our


无论模型采用何种架构，当您包含 10 个以上检索到的文档时，性能都会大幅下降。

简而言之：当模型必须访问位于长上下文中间的相关信息时，往往会忽略所提供的文档。

In [17]:
# Import the required libraries and modules
import os
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_transformers import LongContextReorder
from langchain.chains import StuffDocumentsChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

# Embedding model used to vectorize the texts
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# A small set of text documents; only some of them mention the Celtics
texts = [
    "Basquetball is a great sport.",
    "Fly me to the moon is one of my favourite songs.",
    "The Celtics are my favourite team.",
    "This is a document about the Boston Celtics",
    "I simply love going to the movies",
    "The Boston Celtics won the game by 20 points",
    "This is just a random text.",
    "Elden Ring is one of the best games in the last 15 years.",
    "L. Kornet is one of the best Celtics players.",
    "Larry Bird was an iconic NBA player.",
]

# Index the texts in a Chroma vector store using the embedding model ...
vector_store = Chroma.from_texts(texts, embedding=embeddings)
# ... and expose it as a retriever configured with k=10
retriever = vector_store.as_retriever(search_kwargs={"k": 10})

# The query to run
query = "What can you tell me about the Celtics?"

# Retrieve the documents relevant to the query (per the original author,
# they come back ordered by relevance score) and display them
docs = retriever.get_relevant_documents(query)
docs


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

[Document(page_content='This is a document about the Boston Celtics', metadata={}),
 Document(page_content='The Celtics are my favourite team.', metadata={}),
 Document(page_content='L. Kornet is one of the best Celtics players.', metadata={}),
 Document(page_content='The Boston Celtics won the game by 20 points', metadata={}),
 Document(page_content='Larry Bird was an iconic NBA player.', metadata={}),
 Document(page_content='Elden Ring is one of the best games in the last 15 years.', metadata={}),
 Document(page_content='Basquetball is a great sport.', metadata={}),
 Document(page_content='I simply love going to the movies', metadata={}),
 Document(page_content='Fly me to the moon is one of my favourite songs.', metadata={}),
 Document(page_content='This is just a random text.', metadata={})]

In [18]:
# Reorder the documents with LongContextReorder.
# Less relevant documents are placed in the middle of the list, while more
# relevant documents are placed at the beginning and the end.
reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)

# Confirm that the 4 most relevant documents are at the beginning and end.
reordered_docs


[Document(page_content='The Celtics are my favourite team.', metadata={}),
 Document(page_content='The Boston Celtics won the game by 20 points', metadata={}),
 Document(page_content='Elden Ring is one of the best games in the last 15 years.', metadata={}),
 Document(page_content='I simply love going to the movies', metadata={}),
 Document(page_content='This is just a random text.', metadata={}),
 Document(page_content='Fly me to the moon is one of my favourite songs.', metadata={}),
 Document(page_content='Basquetball is a great sport.', metadata={}),
 Document(page_content='Larry Bird was an iconic NBA player.', metadata={}),
 Document(page_content='L. Kornet is one of the best Celtics players.', metadata={}),
 Document(page_content='This is a document about the Boston Celtics', metadata={})]

In [19]:
# Override the prompts.
# Per-document prompt: renders a document as just its page content.
document_prompt = PromptTemplate(
    template="{page_content}",
    input_variables=["page_content"],
)
# Name of the prompt variable under which the documents are passed in.
document_variable_name = "context"

# Initialize the OpenAI model.
llm = OpenAI()

# Custom prompt asking the model to answer the question based on the
# given text (the context).
stuff_prompt_override = """给定以下文本：
-----
{context}
-----
请回答以下问题：
{query}"""

# Turn the custom prompt text into a prompt template.
prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=stuff_prompt_override,
)

# A simple chain made of just the OpenAI model and the prompt template.
llm_chain = LLMChain(prompt=prompt, llm=llm)

# StuffDocumentsChain uses llm_chain to handle the query and supplies the
# documents as the context.
chain = StuffDocumentsChain(
    document_variable_name=document_variable_name,
    document_prompt=document_prompt,
    llm_chain=llm_chain,
)

# Run the chain with the reordered documents as context, plus the query.
chain.run(input_documents=reordered_docs, query=query)


'\n\nThe Celtics are a professional basketball team based in Boston, Massachusetts. They have won 17 NBA Championships, the most out of any team in the league. They have had many iconic players over the years, such as Larry Bird, and currently have stars such as Kemba Walker and Jayson Tatum. They are currently one of the top teams in the league and are looking to make a deep playoff run this season.'

## 文本嵌入模型
嵌入类（Embeddings）是用于与文本嵌入模型交互的类。嵌入模型提供商有很多（OpenAI、Cohere、Hugging Face 等），这个类旨在为所有这些提供商提供统一的标准接口。

嵌入创建一段文本的矢量表示形式。这很有用，因为这意味着我们可以考虑向量空间中的文本，并执行诸如语义搜索之类的操作，在其中我们寻找向量空间中最相似的文本片段。

LangChain 中的基本嵌入类公开了两种方法：一种用于嵌入文档，另一种用于嵌入查询。前者将多个文本作为输入，而后者则采用单个文本。将它们作为两个独立方法的原因是，某些嵌入提供程序对文档（要搜索）和查询（搜索查询本身）具有不同的嵌入方法。

In [20]:
from langchain.embeddings import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()

# Texts to embed as documents (one call, several inputs).
sample_texts = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]
embeddings = embeddings_model.embed_documents(sample_texts)
# Show how many vectors came back and the length of the first one.
len(embeddings), len(embeddings[0])

(5, 1536)

In [21]:
# Embed a single query string and display the first five components of the
# resulting vector.
embedded_query = embeddings_model.embed_query("What was the name mentioned in the conversation?")
embedded_query[:5]

[0.005387211957276042,
 -0.0005941777859814659,
 0.03892524773846194,
 -0.00297914132073842,
 -0.008912666382268376]

In [1]:
# Embeddings can be stored or temporarily cached to avoid recomputing them.
# OpenAIEmbeddings is imported once here; the original cell imported it twice
# through two different module paths.
from langchain.document_loaders import TextLoader
from langchain.embeddings import CacheBackedEmbeddings, OpenAIEmbeddings
from langchain.storage import InMemoryStore, LocalFileStore
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

underlying_embeddings = OpenAIEmbeddings()
# Store rooted at the local ./cache/ directory.
fs = LocalFileStore("./cache/")

# Wrap the underlying embedder with the store; the embedding model's name is
# passed as the cache namespace.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, fs, namespace=underlying_embeddings.model
)

In [2]:
# The cache is empty before anything has been embedded.
list(fs.yield_keys())

[]

In [4]:
# Load the document, split it into chunks, embed each chunk and load the
# result into the vector store.
# The encoding is passed explicitly so loading does not depend on the
# platform's default locale encoding (assumes the file is stored as UTF-8).
raw_documents = TextLoader(
    "documentstore/state_of_the_union.txt", encoding="utf-8"
).load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
# Create the vector store, embedding through the cache-backed embedder.
db = FAISS.from_documents(documents, cached_embedder)


In [5]:
# Creating the vector store a second time should be much faster, because
# the embeddings do not need to be recomputed.
db2 = FAISS.from_documents(documents, cached_embedder)

In [6]:
# The cache now has content; display at most the first five keys.
list(fs.yield_keys())[:5]

['text-embedding-ada-002948b99a3-567e-5f0f-9ccb-d68ed5b060d9']

In [7]:
# In-memory variant: the same cache-backed wrapper, using an InMemoryStore
# instead of the file store.
underlying_embeddings = OpenAIEmbeddings()
store = InMemoryStore()
embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings,
    store,
    namespace=underlying_embeddings.model,
)
# First embedding pass for these two texts.
embeddings = embedder.embed_documents(["hello", "goodbye"])

In [8]:
# Embedding the same texts a second time is answered from the cache; the
# original author measured about 2 ms for this call.
embeddings_from_cache = embedder.embed_documents(["hello", "goodbye"])

In [9]:
# Check that the vectors from the second call equal those from the first.
embeddings == embeddings_from_cache

True