# 传统NLP数据处理示例

本笔记展示如何使用传统NLP工具对`data/data.txt`中的文本依次进行清洗、分词与词性标注、TF-IDF特征提取、关键词分析和基本统计，并将结果导出为CSV文件。所有代码均附带注释，便于团队成员快速理解。

## 1. 安装依赖（如已安装可跳过）

In [None]:

# !pip install -q jieba scikit-learn pandas


## 2. 导入库与路径设置

In [None]:

from pathlib import Path
import re
import statistics
from typing import Iterable

import jieba
import jieba.posseg as pseg
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Locate the project root by looking for a "data" directory, first in the
# current working directory and then in its parent, so the notebook works
# whether it is launched from the project root or from a sub-folder.
PROJECT_ROOT_CANDIDATES: Iterable[Path] = [Path.cwd(), Path.cwd().parent]
for candidate in PROJECT_ROOT_CANDIDATES:
    data_dir = candidate / "data"
    # is_dir() rather than exists(): a regular file that happens to be named
    # "data" must not be mistaken for the data directory.
    if data_dir.is_dir():
        PROJECT_ROOT = candidate
        DATA_DIR = data_dir
        break
else:
    # Reached only when the loop finished without hitting `break`,
    # i.e. no candidate contained a "data" directory.
    raise FileNotFoundError("未找到 data 目录，请确认项目结构是否正确。")

print(f"项目根目录: {PROJECT_ROOT}")
# Sorted so the listing is deterministic; glob() order is OS-dependent.
print(f"数据文件: {sorted(p.name for p in DATA_DIR.glob('*'))}")


## 3. 加载与清洗原始文本

In [None]:

PRIMARY_FILE = DATA_DIR / "data.txt"
# Encodings are tried in order. "utf-8-sig" comes first because plain "utf-8"
# also decodes a BOM-prefixed file successfully but leaves "\ufeff" at the
# start of the text. "iso-8859-1" accepts any byte sequence, so it acts as a
# last resort that always succeeds (possibly producing mojibake).
BACKUP_ENCODINGS = ("utf-8-sig", "utf-8", "gb18030", "iso-8859-1")

# Try each encoding in turn so an unexpected file encoding does not abort
# the notebook.
last_error = None
for enc in BACKUP_ENCODINGS:
    try:
        raw_text = PRIMARY_FILE.read_text(encoding=enc)
    except UnicodeDecodeError as err:
        last_error = err
    else:
        print(f"使用编码 {enc} 成功读取文本，共 {len(raw_text)} 个字符")
        break
else:
    # Every encoding failed. Re-raise the last decode error; the fallback
    # instance only matters if BACKUP_ENCODINGS is empty. UnicodeDecodeError
    # requires (encoding: str, object: bytes, start, end, reason).
    raise last_error or UnicodeDecodeError(
        "unknown", b"", 0, 0, f"加载失败: 无法解码 {PRIMARY_FILE.name}"
    )

# Normalise whitespace while keeping paragraph breaks for the later
# paragraph-level analysis:
#   1. runs of tabs become a single space;
#   2. trailing whitespace before a line break is removed. The class
#      [^\S\n] means "whitespace except newline"; a plain \s+ would also
#      swallow blank lines and merge all paragraphs into one;
#   3. two or more consecutive line breaks collapse to exactly one blank line.
clean_text = re.sub(r"\t+", " ", raw_text)
clean_text = re.sub(r"[^\S\n]+\n", "\n", clean_text)
clean_text = re.sub(r"\n{2,}", "\n\n", clean_text).strip()
print(f"清洗后文本长度: {len(clean_text)}")


## 4. 中文分词与词性标注

In [None]:

# Stop words to exclude so that very common function words do not distort
# the analysis.
STOPWORDS = {
    "的", "了", "和", "是", "在", "我们", "以及", "一个", "如果", "可以", "需要"
}

# Segment the cleaned text with jieba's part-of-speech tagger; each item
# unpacks into a (word, flag) pair.
words = []
for raw_word, pos_flag in pseg.cut(clean_text):
    token = raw_word.strip()
    # Skip whitespace-only tokens and stop words.
    if not token or token in STOPWORDS:
        continue
    words.append({"词语": token, "词性": pos_flag})

# Explicit columns keep the frame's schema stable even when no token
# survives filtering; otherwise the empty frame would have no "词语" column.
words_df = pd.DataFrame(words, columns=["词语", "词性"])
print(f"有效分词数量: {len(words_df)}")
words_df.head(10)


## 5. 构建TF-IDF特征

In [None]:

# Tokenizer based on jieba, passed to the TF-IDF vectorizer as its
# `tokenizer` callable.

def jieba_tokenizer(text: str):
    """Split *text* with ``jieba.cut`` and return the kept tokens as a list.

    Tokens that are empty after stripping whitespace, or that appear in
    ``STOPWORDS``, are dropped. Kept tokens are returned exactly as jieba
    produced them (they are not stripped).
    """
    kept = []
    for piece in jieba.cut(text):
        if not piece.strip() or piece in STOPWORDS:
            continue
        kept.append(piece)
    return kept

# The text may be long, so TF-IDF is computed per paragraph: split on blank
# lines and drop paragraphs that are empty after stripping.
stripped_paragraphs = (para.strip() for para in clean_text.split("\n\n"))
paragraphs = [para for para in stripped_paragraphs if para]

vectorizer = TfidfVectorizer(
    tokenizer=jieba_tokenizer,
    # The default token_pattern is ignored when a tokenizer is supplied;
    # setting it to None avoids scikit-learn's "will not be used" warning.
    token_pattern=None,
    max_features=2000,
)
tfidf_matrix = vectorizer.fit_transform(paragraphs)

feature_names = vectorizer.get_feature_names_out()

print(f"段落数量: {len(paragraphs)}，特征维度: {len(feature_names)}")


## 6. 抽取权重最高的关键词

In [None]:

# Rank terms by their mean TF-IDF weight across all paragraphs to find the
# keywords that matter most overall.

# mean(axis=0) averages over paragraphs (rows); .A1 flattens the resulting
# 1 x n_features matrix into a 1-D array aligned with feature_names.
tfidf_mean = tfidf_matrix.mean(axis=0).A1
keyword_scores = sorted(
    zip(feature_names, tfidf_mean),
    key=lambda pair: pair[1],
    reverse=True,
)

TOP_K = 20
print(f"前 {TOP_K} 个关键词: \n")
for word, score in keyword_scores[:TOP_K]:
    # Term left-aligned in 20 columns, then a tab, then the 4-decimal score.
    print(f"{word:<20}\t{score:.4f}")


## 7. 计算基本文本统计指标

In [None]:

# Summarise word lengths, vocabulary size and paragraph lengths to give a
# quick picture of the text's structure.

# An empty token table may lack the "词语" column entirely, so check first.
has_words = "词语" in words_df.columns
word_lengths = [len(w) for w in words_df["词语"]] if has_words else []
paragraph_lengths = [len(p) for p in paragraphs]

stats = {
    "分词总数": len(words_df),
    "唯一词语数量": words_df["词语"].nunique() if has_words else 0,
    # statistics.mean/median raise StatisticsError on empty data, so every
    # aggregate falls back to 0 when there is nothing to summarise.
    "平均词长": statistics.mean(word_lengths) if word_lengths else 0,
    "中位词长": statistics.median(word_lengths) if word_lengths else 0,
    "段落数量": len(paragraphs),
    "平均段落长度": statistics.mean(paragraph_lengths) if paragraph_lengths else 0,
}

stats


## 8. 保存处理结果供后续使用

In [None]:

# Export the token table and the keyword ranking as CSV files under
# data/processed so that other workflows can reuse them.
output_dir = PROJECT_ROOT / "data" / "processed"
output_dir.mkdir(parents=True, exist_ok=True)

words_csv = output_dir / "data_words.csv"
keywords_csv = output_dir / "keywords.csv"

words_df.to_csv(words_csv, index=False)

keywords_df = pd.DataFrame(keyword_scores, columns=["词语", "平均权重"])
keywords_df.to_csv(keywords_csv, index=False)

print(f"已保存分词结果与关键词至 {output_dir}")


## 9. 后续扩展建议

- 更换停用词表以适配不同领域文本。
- 引入自定义词典或行业术语，提升分词准确率。
- 结合`n-gram`特征或`TextRank`算法挖掘更丰富的语义信息。
- 将生成的CSV文件接入下游建模或知识库流程。