# HanLP语义切段示例

本笔记基于 [HanLP](https://github.com/hankcs/HanLP) 的本地API，展示如何对中文长文本进行分句、分词、词性标注，并结合TF-IDF语义相似度完成按语义切段。所有代码都配有中文注释，便于了解每一步的处理逻辑。

## 1. 安装依赖（首次运行时请取消下面 pip 命令的注释）

In [None]:

# !pip install -q hanlp scikit-learn pandas


## 2. 导入依赖并定位项目路径

In [None]:

from pathlib import Path
import statistics
from typing import Iterable, List, Tuple

import hanlp
from hanlp.utils.rules import split_sentence
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:

# Locate the project root so the notebook works whether it is launched from
# the project root or from the notebooks/ directory: the first candidate that
# contains a `data` directory wins.
PROJECT_ROOT_CANDIDATES: Iterable[Path] = [Path.cwd(), Path.cwd().parent]
for candidate in PROJECT_ROOT_CANDIDATES:
    data_dir = candidate / "data"
    # is_dir() rather than exists(): a regular file that happens to be named
    # "data" must not be mistaken for the data directory.
    if data_dir.is_dir():
        PROJECT_ROOT = candidate
        DATA_DIR = data_dir
        break
else:
    # for/else: this branch runs only when the loop finished without `break`,
    # i.e. no candidate contained a data directory.
    raise FileNotFoundError("未找到 data 目录，请检查项目结构或手动修改路径。")

print(f"项目根目录: {PROJECT_ROOT}")
print(f"可用数据文件: {[p.name for p in DATA_DIR.glob('*')]}")


## 3. 读取并清洗原始文本

In [None]:

PRIMARY_FILE = DATA_DIR / "data.txt"
# Encodings are tried in order until one decodes the file.
# "utf-8-sig" comes first: it decodes plain UTF-8 too and strips a leading
# BOM, whereas "utf-8" would succeed on the same bytes but leave U+FEFF at
# the start of the text. "iso-8859-1" maps every byte to a character, so it
# never raises and serves as the last resort.
BACKUP_ENCODINGS = ("utf-8-sig", "utf-8", "gb18030", "iso-8859-1")

last_error = None
for enc in BACKUP_ENCODINGS:
    try:
        raw_text = PRIMARY_FILE.read_text(encoding=enc)
        print(f"使用编码 {enc} 成功读取文本，共 {len(raw_text)} 个字符")
        break
    except UnicodeDecodeError as err:
        # Remember the failure and fall through to the next encoding.
        last_error = err
else:
    # Every encoding failed: re-raise the most recent decode error. The
    # fallback exception is only built when BACKUP_ENCODINGS is empty; its
    # second argument must be bytes, otherwise the constructor itself raises
    # TypeError.
    raise last_error or UnicodeDecodeError("读取失败", b"", 0, 0, "无法解码文件")

# Normalise whitespace while keeping the paragraph structure.
# NOTE(review): the character replaced on the next line is assumed to be a
# carriage return ('\r') - confirm against the intended behaviour.
clean_text = raw_text.replace('\r', ' ')
# Trim leading/trailing whitespace on every line.
clean_text = '\n'.join(line.strip() for line in clean_text.splitlines())
# Paragraphs are separated by blank lines. Drop empty blocks and strip each
# remaining one so that an odd-length run of blank lines does not leave a
# stray newline at the start of a paragraph.
clean_text = '\n\n'.join(
    block.strip() for block in clean_text.split('\n\n') if block.strip()
)
print(f"清洗后文本长度: {len(clean_text)}")


## 4. 使用 HanLP 进行分句与语法分析

In [None]:

# Split the cleaned text into sentences with HanLP's rule-based splitter
# first, to lighten the load on the model that runs afterwards.
# Whitespace-only pieces are discarded.
stripped_pieces = (piece.strip() for piece in split_sentence(clean_text))
sentences: List[str] = [piece for piece in stripped_pieces if piece]

print(f"分句数量: {len(sentences)}")
print("示例句子:")
# Preview at most the first three sentences.
for sample in sentences[:3]:
    print("-", sample)


In [None]:

# Load HanLP's pretrained multi-task model, which yields tokenisation,
# part-of-speech, dependency and other annotations from a single call.
pretrained_id = hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH
mtl_model = hanlp.load(pretrained_id)

# Annotate all sentences in one call.
analysis = mtl_model(sentences)

print("HanLP返回的键:", list(analysis.keys()))


## 5. 组织分词与词性结果

In [None]:

# Arrange HanLP's output as one table row per sentence so that later feature
# computation can work on a DataFrame. Each row carries the sentence index,
# the original sentence, and the tokens / POS tags / named entities that
# `analysis` holds at the same index.
records = [
    {
        "句子编号": position,
        "原句": text,
        "分词": analysis['tok/fine'][position],
        "词性": analysis['pos/ctb'][position],
        "命名实体": analysis['ner/ontonotes'][position],
    }
    for position, text in enumerate(sentences)
]

sentence_df = pd.DataFrame(records)
# Last expression of the cell: the notebook displays the first rows.
sentence_df.head()


## 6. 利用分词结果计算句子语义向量

In [None]:

# Keep only tokens whose POS tag marks them as nouns, verbs or adjectives
# (matched by tag prefix), since these carry most of the meaning.
SEMANTIC_POS_PREFIXES = ("N", "V", "JJ", "ADJ")


def _passthrough(document):
    """Return the already-tokenised document unchanged."""
    return document


filtered_tokens: List[List[str]] = []
for _row_label, row in sentence_df.iterrows():
    # str.startswith accepts a tuple, so one call tests every prefix.
    content_words = [
        word
        for word, tag in zip(row["分词"], row["词性"])
        if tag.startswith(SEMANTIC_POS_PREFIXES)
    ]
    # A sentence with no content words falls back to all of its tokens.
    filtered_tokens.append(content_words or row["分词"])

# Feed HanLP's tokens straight into TF-IDF instead of tokenising again:
# tokenizer and preprocessor are identity functions, and token_pattern is
# disabled because a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(
    tokenizer=_passthrough,
    preprocessor=_passthrough,
    token_pattern=None,
    max_features=1024,
)
tfidf_matrix = vectorizer.fit_transform(filtered_tokens)
print(f"TF-IDF矩阵维度: {tfidf_matrix.shape}")


## 7. 依据相邻句子的语义相似度生成语义段落

In [None]:

# Segment the sentence sequence into semantic chunks: a new chunk starts
# wherever the cosine similarity between two neighbouring sentences drops
# below SIM_THRESHOLD, provided the current chunk already holds at least
# MIN_SENTENCE_PER_CHUNK sentences. Spans are half-open (start, end) pairs
# of sentence indices.
SIM_THRESHOLD = 0.25
MIN_SENTENCE_PER_CHUNK = 1

similarities = cosine_similarity(tfidf_matrix)
chunk_spans: List[Tuple[int, int]] = []
start = 0
sentence_total = len(sentences)

for idx in range(1, sentence_total):
    # Only the entry comparing sentence idx-1 with sentence idx is read.
    neighbour_similarity = similarities[idx - 1, idx]
    topic_shift = neighbour_similarity < SIM_THRESHOLD
    chunk_long_enough = (idx - start) >= MIN_SENTENCE_PER_CHUNK
    if topic_shift and chunk_long_enough:
        chunk_spans.append((start, idx))
        start = idx

# Close the trailing chunk. It becomes its own span when it is long enough
# or when no span exists yet; otherwise it is merged into the previous span.
tail_long_enough = (sentence_total - start) >= MIN_SENTENCE_PER_CHUNK
if tail_long_enough or not chunk_spans:
    chunk_spans.append((start, sentence_total))
else:
    chunk_spans[-1] = (chunk_spans[-1][0], sentence_total)

print(f"共划分语义段落: {len(chunk_spans)}")
# Last expression of the cell: the notebook displays the first five spans.
chunk_spans[:5]


## 8. 汇总每个语义段落的关键信息

In [None]:

from collections import Counter

# Build one summary row per semantic chunk: its sentence range, sentence
# count, up to five most frequent noun-like tokens, and the concatenated text.
chunk_records = []
for chunk_id, (start, end) in enumerate(chunk_spans, start=1):
    chunk_rows = sentence_df.iloc[start:end]

    # Count tokens whose POS tag begins with "N"; that prefix also covers
    # the proper-noun (NR) and temporal-noun (NT) tags.
    noun_counts = Counter()
    for words, tags in zip(chunk_rows["分词"], chunk_rows["词性"]):
        noun_counts.update(
            word for word, tag in zip(words, tags) if tag.startswith("N")
        )
    top_terms = [term for term, _count in noun_counts.most_common(5)]

    chunk_records.append({
        "语义段编号": chunk_id,
        "起始句": start,
        # `end` is exclusive, so the last sentence of the chunk is end - 1.
        "结束句": end - 1,
        "段落句数": end - start,
        "代表关键词": top_terms,
        "段落文本": "".join(chunk_rows["原句"]),
    })

chunk_df = pd.DataFrame(chunk_records)
# Last expression of the cell: the notebook displays the first rows.
chunk_df.head()


## 9. 保存语义段结果以便后续复用

In [None]:

# Persist the chunk table under <project root>/data/processed so later runs
# can reuse it; the directory is created on demand.
output_dir = PROJECT_ROOT.joinpath("data", "processed")
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV.
chunk_df.to_csv(output_dir.joinpath("semantic_chunks.csv"), index=False)
print(f"语义切段结果已保存至 {output_dir / 'semantic_chunks.csv'}")


## 10. 进一步扩展的方向

- 调整 `SIM_THRESHOLD` 与特征词性列表，探索不同语义聚类颗粒度。
- 利用 HanLP 的依存句法或语义角色标注结果，为段落生成摘要或结构化标签。
- 替换 TF-IDF 为 HanLP 的语义相似度模型（如 STS 预训练模型），获得更精细的语义分组。
- 将分段结果与上游 Docling 解析结合，构建多通道语义索引。