In [None]:
# Step 1: import the dependency.
from langchain_text_splitters import CharacterTextSplitter

# Step 2: the sample text that will be split.
text = """
LangChain 是一个用于开发由语言模型驱动的应用程序的框架的。它提供了一套工具和抽象，使开发者
能够更容易地构建复杂的应用程序。
"""

# Step 3: configure the character splitter.
splitter = CharacterTextSplitter(
    separator="",      # an empty string disables separator-first splitting
    chunk_overlap=10,  # number of characters repeated between neighbouring chunks
    chunk_size=50,     # size of each chunk
)

# Step 4: split the text.
texts = splitter.split_text(text)

# Step 5: print every chunk with its 1-based position and its length,
# followed by a 50-dash rule.
for idx, piece in enumerate(texts):
    header = f"块 {idx + 1}:长度：{len(piece)}"
    print(header, piece, "-" * 50, sep="\n")


举例2：体会separator

In [None]:
# Step 1: import the dependency.
from langchain_text_splitters import CharacterTextSplitter

# Step 2: the sample text; every sentence ends with the Chinese full stop.
text = "这是一个示例文本啊。我们将使用CharacterTextSplitter将其分割成小块。分割基于字符数。"

# Step 3: configure the character splitter; this example is about `separator`.
splitter = CharacterTextSplitter(
    separator="。",   # split on the Chinese full stop
    chunk_overlap=0,  # no characters repeated between neighbouring chunks
    chunk_size=42,    # size of each chunk
)

# Step 4: split the text.
texts = splitter.split_text(text)

# Step 5: print every chunk with its 1-based position and its length,
# followed by a 50-dash rule.
for idx, piece in enumerate(texts):
    header = f"块 {idx + 1}:长度：{len(piece)}"
    print(header, piece, "-" * 50, sep="\n")


举例3：指定分隔符

In [None]:
# Step 1: import the dependency.
from langchain_text_splitters import CharacterTextSplitter

# Step 2: the text to split.
text = "这是第一段文本。这是第二段内容。最后一段结束。"

# Step 3: configure the character splitter.
text_splitter = CharacterTextSplitter(
    chunk_size=20,
    chunk_overlap=0,
    separator="。",
    keep_separator=True,  # whether the separator is kept inside the chunks
)

# Step 4: split the text.
chunks = text_splitter.split_text(text)

# Step 5: print every chunk with its 1-based position and its length,
# followed by a 50-dash rule.
for idx, piece in enumerate(chunks):
    header = f"块 {idx + 1}:长度：{len(piece)}"
    print(header, piece, "-" * 50, sep="\n")


RecursiveCharacterTextSplitter：最常用

举例1：使用split_text()方法演示


In [None]:
# Step 1: import the dependency.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 2: the content to split (blank line, line breaks and a full stop inside).
text = "LangChain框架特性\n\n多模型集成(GPT/Claude)\n记忆管理功能\n链式调用设计。文档分析场景示例：需要处理PDF/Word等格式。"

# Step 3: build the RecursiveCharacterTextSplitter.
# NOTE(review): add_start_index is understood to matter only when Document
# objects are produced (create_documents); split_text() returns plain strings.
# Confirm against the LangChain API reference.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=0,
    chunk_size=10,
)

# Step 4: split and print each piece followed by a short rule.
paragraphs = text_splitter.split_text(text)
for segment in paragraphs:
    print(segment, '-------', sep="\n")


举例2：使用create_documents()方法演示

In [None]:
# Build the RecursiveCharacterTextSplitter.
# add_start_index=True asks the splitter to record each chunk's start offset
# in the resulting Document's metadata (see the LangChain API reference).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10,
    chunk_overlap=0,
    add_start_index=True,
)

# The content to split. The variable is deliberately NOT called `list`:
# rebinding that name would shadow the builtin for every later cell that runs
# in the same kernel.
source_texts = [
    "LangChain框架特性\n\n多模型集成(GPT/Claude)\n记忆管理功能\n链式调用设计。文档分析场景示例：需要处理PDF/Word等格式。"]

# create_documents() takes a list of strings and returns a list of Document
# objects.
paragraphs = text_splitter.create_documents(source_texts)
for para in paragraphs:
    print(para)
    # Printed per document, matching the per-chunk rule of the split_text() example.
    print('-------')

# SemanticChunker：语义分块

In [None]:
import numpy as np
from typing import List, Optional
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)`` as a plain Python float. When either
        vector has zero norm the similarity is undefined; ``0.0`` is returned
        instead of dividing by zero (which would yield ``nan`` and a
        RuntimeWarning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)


class SemanticChunker:
    """Group consecutive sentences into chunks by embedding similarity.

    The text is cut into sentences, every sentence is embedded, and sentences
    are appended to the current chunk until the next sentence's cosine
    similarity to the chunk's mean embedding falls below ``threshold`` while
    the chunk already holds at least ``min_chunk_size`` characters.
    """

    def __init__(
            self,
            embeddings: Embeddings,
            threshold: float = 0.8,  # a new chunk starts when similarity drops below this
            min_chunk_size: int = 100,  # chunks shorter than this are never closed
    ):
        """Store the embedding model and the two splitting parameters.

        Args:
            embeddings: Object whose ``embed_documents(list_of_str)`` returns
                one vector per input string.
            threshold: Similarity below which a new chunk may be started.
            min_chunk_size: Minimum number of characters the current chunk
                must have before it may be closed.
        """
        self.embeddings = embeddings
        self.threshold = threshold
        self.min_chunk_size = min_chunk_size

    @staticmethod
    def _split_sentences(text: str) -> List[str]:
        """Cut ``text`` into sentences (simple rule-based splitter)."""
        import re
        # CJK terminators end a sentence even with no whitespace after them,
        # which is how Chinese text is normally written. ASCII terminators
        # still require following whitespace so "3.14" or "a.b" stay intact.
        parts = re.split(r"(?<=[。！？])\s*|(?<=[.!?])\s+", text.strip())
        return [part for part in parts if part]

    @staticmethod
    def _join_sentences(sentences: List[str]) -> str:
        """Concatenate sentences, restoring the space after ASCII terminators."""
        pieces = [sentences[0]]
        for previous, current in zip(sentences, sentences[1:]):
            # A sentence ending in . ! ? was only split off because whitespace
            # followed it, so put a single space back; CJK sentences need none.
            if previous[-1] in ".!?":
                pieces.append(" ")
            pieces.append(current)
        return "".join(pieces)

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into semantically coherent chunks.

        Args:
            text: The raw text to split.

        Returns:
            The chunk strings in document order; an empty list when ``text``
            contains no sentences.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return []

        # Embed all sentences in a single call.
        vectors = [
            np.asarray(vector, dtype=float)
            for vector in self.embeddings.embed_documents(sentences)
        ]

        chunks: List[str] = []
        current_chunk = [sentences[0]]
        # Running sum of the current chunk's sentence embeddings. Cosine
        # similarity ignores vector length, so comparing against the sum is the
        # same as comparing against the true mean of the chunk, with every
        # sentence weighted equally.
        current_sum = vectors[0]

        for i in range(1, len(sentences)):
            sim = cosine_similarity(current_sum, vectors[i])
            chunk_text = self._join_sentences(current_chunk)

            # Close the chunk only when the next sentence is dissimilar AND the
            # chunk is already long enough; otherwise keep growing it.
            if sim < self.threshold and len(chunk_text) >= self.min_chunk_size:
                chunks.append(chunk_text)
                current_chunk = [sentences[i]]
                current_sum = vectors[i]
            else:
                current_chunk.append(sentences[i])
                # Out-of-place add so the per-sentence vectors are not mutated.
                current_sum = current_sum + vectors[i]

        chunks.append(self._join_sentences(current_chunk))
        return chunks

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split every document's ``page_content`` into chunk Documents.

        Args:
            documents: Documents to split.

        Returns:
            One Document per chunk, in input order. Each gets its own shallow
            copy of the source document's metadata, so editing one chunk's
            metadata does not affect its siblings or the source document.
        """
        chunks = []
        for doc in documents:
            for piece in self.split_text(doc.page_content):
                chunks.append(
                    Document(page_content=piece, metadata=dict(doc.metadata))
                )
        return chunks

In [None]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load environment variables from a local .env file.
# NOTE(review): presumably this supplies the OpenAI credentials — confirm.
load_dotenv()
embeddings = OpenAIEmbeddings()

# The sample text below is only about 60 characters long. With the class
# default min_chunk_size=100 no chunk could ever be closed and the demo would
# always print a single chunk, so a small minimum is passed explicitly.
chunker = SemanticChunker(embeddings, threshold=0.75, min_chunk_size=10)

text = "人工智能是计算机科学的分支。它致力于让机器具备智能。另一方面，量子计算利用量子力学原理进行计算。这两者可能在未来结合。"
chunks = chunker.split_text(text)

# Print each chunk followed by a 40-dash rule.
for c in chunks:
    print(c)
    print("-" * 40)