# 作业一: 探索 LlamaIndex 中的句子切片检索及其参数影响分析

## 文本数据集

* [doc1.txt](./docs/txt/doc1.txt)
* [doc2.txt](./docs/txt/doc2.txt)
* [doc3.txt](./docs/txt/doc3.txt)

## 配置 LlamaIndex

使用 Qwen 系列模型：大模型通过兼容 OpenAI 的接口（`OpenAILike`）调用，嵌入模型通过 DashScope 接口（`DashScopeEmbedding`）调用

In [2]:
import os
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.embeddings.dashscope import DashScopeEmbedding
from llama_index.llms.openai_like import OpenAILike

# Load variables from a .env file (if one exists) so that DASHSCOPE_API_KEY
# can be read from the process environment below.
_ = load_dotenv()

# Chat LLM: a Qwen model reached through DashScope's OpenAI-compatible endpoint.
Settings.llm = OpenAILike(
    is_chat_model=True,
    api_key=os.environ.get("DASHSCOPE_API_KEY"),
    api_base="https://dashscope.aliyuncs.com/compatible-mode/v1",
    model="qwen-plus-latest",
)

# Embedding model used when building and querying the vector indexes.
Settings.embed_model = DashScopeEmbedding(
    embed_input_length=8192,
    embed_batch_size=6,
    model_name="text-embedding-v4",
)

## 参数对比实验

比较不同参数组合对**检索相关性**和**生成回答质量**的影响
- 检索到的上下文是否包含答案
- LLM 生成的回答是否准确完整
- 上下文冗余程度（主观评分 1–5）
- 制作对比表格或图表展示结果。

### 定义 evaluate_splitter 函数

In [3]:
from typing import Sequence
from llama_index.core import Document
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.schema import TransformComponent


def evaluate_splitter(
    splitter: TransformComponent,
    documents: Sequence[Document],
    query: str,
    splitter_description: str,
) -> None:
    """Index ``documents`` with ``splitter``, run ``query`` and print the answer.

    A new ``VectorStoreIndex`` is built on every call with ``splitter`` as the
    only transformation, so each call reflects that splitter alone.

    Sentence-window splitters are queried with ``similarity_top_k=5``,
    streaming enabled and a ``MetadataReplacementPostProcessor`` targeting the
    window metadata key. Every other splitter uses the default query engine.

    Args:
        splitter: Node parser / transformation used to chunk the documents.
        documents: Documents to index.
        query: Question sent to the query engine.
        splitter_description: Label printed in the result header. The exact
            label ``"Sentence Window"`` also selects the sentence-window
            pipeline (kept for backward compatibility).
    """
    # Imported here so the function does not depend on which notebook cell
    # imported the node parsers first.
    from llama_index.core.node_parser import SentenceWindowNodeParser

    # Decide by the splitter's type, not only by its display label: a
    # SentenceWindowNodeParser with any other label would otherwise be queried
    # without the window replacement step.
    use_window_pipeline = (
        isinstance(splitter, SentenceWindowNodeParser)
        or splitter_description == "Sentence Window"
    )

    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[splitter],
    )

    if use_window_pipeline:
        # Use the key the parser was configured with; fall back to "window"
        # when the pipeline was selected by label alone.
        window_key = getattr(splitter, "window_metadata_key", "window")
        query_engine = index.as_query_engine(
            similarity_top_k=5,
            streaming=True,
            node_postprocessors=[
                MetadataReplacementPostProcessor(target_metadata_key=window_key)
            ],
        )
    else:
        query_engine = index.as_query_engine()

    response = query_engine.query(query)

    print(f"\n=================分块器: {splitter_description}=================")
    print(f"查询: {query}\n回答: {response}\n")

### 比较不同参数组合对**检索相关性**和**生成回答质量**的影响

1. 使用不同 chunk_size 参数的 SentenceSplitter 实例

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.readers import SimpleDirectoryReader

# Load the document
documents = SimpleDirectoryReader(input_files=["./docs/txt/doc1.txt"]).load_data()

# Splitters that differ only in chunk_size; chunk_overlap is held at 24
sentence_splitter1 = SentenceSplitter(chunk_size=64, chunk_overlap=24)
sentence_splitter2 = SentenceSplitter(chunk_size=128, chunk_overlap=24)
sentence_splitter3 = SentenceSplitter(chunk_size=512, chunk_overlap=24)

query = "为什么说算法偏见问题同样不容忽视？"

# Build each label from the splitter's own settings so the printed header
# always names the instance that was actually evaluated (the hand-written
# labels previously reported splitter 2 as "1" and splitter 3 as "2").
for name, splitter in (
    ("sentence_splitter1", sentence_splitter1),
    ("sentence_splitter2", sentence_splitter2),
    ("sentence_splitter3", sentence_splitter3),
):
    evaluate_splitter(
        splitter,
        documents,
        query,
        f"{name}(chunk_size={splitter.chunk_size}, "
        f"chunk_overlap={splitter.chunk_overlap})",
    )

2. 使用不同 chunk_overlap 参数的 SentenceSplitter 实例

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.readers import SimpleDirectoryReader

# Load the document
documents = SimpleDirectoryReader(input_files=["./docs/txt/doc2.txt"]).load_data()

# Splitters that differ only in chunk_overlap; chunk_size is held at 128
sentence_splitter4 = SentenceSplitter(chunk_size=128, chunk_overlap=0)
sentence_splitter5 = SentenceSplitter(chunk_size=128, chunk_overlap=16)
sentence_splitter6 = SentenceSplitter(chunk_size=128, chunk_overlap=32)
sentence_splitter7 = SentenceSplitter(chunk_size=128, chunk_overlap=64)

query = "量子计算的前景光明，表现在哪些方面？"

# Build each label from the splitter's own settings so the printed header
# always names the instance that was actually evaluated (the hand-written
# labels previously reported splitters 4-7 as "1", "2", "2", "2").
for name, splitter in (
    ("sentence_splitter4", sentence_splitter4),
    ("sentence_splitter5", sentence_splitter5),
    ("sentence_splitter6", sentence_splitter6),
    ("sentence_splitter7", sentence_splitter7),
):
    evaluate_splitter(
        splitter,
        documents,
        query,
        f"{name}(chunk_size={splitter.chunk_size}, "
        f"chunk_overlap={splitter.chunk_overlap})",
    )

3. 使用不同 splitter 实例

In [None]:
from llama_index.core.node_parser import (
    SentenceSplitter,
    SentenceWindowNodeParser,
    TokenTextSplitter,
)
from llama_index.core.readers import SimpleDirectoryReader

# Load the document
documents = SimpleDirectoryReader(input_files=["./docs/txt/doc3.txt"]).load_data()

# Sentence-based splitting
sentence_splitter = SentenceSplitter(chunk_overlap=32, chunk_size=128)
# Token-based splitting, with the Chinese full stop as the separator
token_splitter = TokenTextSplitter(separator="。", chunk_overlap=32, chunk_size=128)
# Sentence-window splitting
sentence_window_splitter = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

query = "为什么说《巴黎协定》代表了全球气候治理的重要里程碑？"

# Evaluate the three splitters in turn against the same documents and query.
for description, splitter in (
    ("Sentence", sentence_splitter),
    ("Token", token_splitter),
    ("Sentence Window", sentence_window_splitter),
):
    evaluate_splitter(splitter, documents, query, description)
