# 标签编码 (Label Encoding)

使用 `LabelEncoder` 将五个 CSV 文件中的 `label` 列（文本标签）转换为数字类别，并将结果写回各文件的 `label_encoded` 列。随后合并五个文件，按 8:2 分层划分训练集与测试集，并提取 TF-IDF 特征。

In [1]:
import os

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Directory holding the per-cancer CSV files. Set the CANCER_DATA_DIR
# environment variable to point the notebook at another location without
# editing code; the default is the previously hard-coded absolute path.
# os.path.join(..., "") guarantees a trailing separator, because the other
# cells build file paths by plain string concatenation (BASE_PATH + filename).
BASE_PATH = os.path.join(
    os.environ.get("CANCER_DATA_DIR", "/Users/kezhuli/Desktop/删除病名和同义词/"),
    "",
)

# One CSV per cancer type; the other cells iterate over this list in order.
FILES = [
    "Colon_Cancer.csv",
    "Liver_Cancer.csv",
    "Lung_Cancer.csv",
    "Stomach_Cancer.csv",
    "Thyroid_Cancer.csv",
]

In [2]:
# Gather the label values of every file first and fit a single LabelEncoder on
# their union, so a given text label maps to the same integer in all files.
all_labels = []
for csv_name in FILES:
    frame = pd.read_csv(f"{BASE_PATH}{csv_name}", encoding="utf-8")
    # Labels are cast to str before collecting the distinct values per file.
    all_labels += frame["label"].astype(str).unique().tolist()

le = LabelEncoder()
le.fit(sorted(set(all_labels)))

# Show the fitted mapping: encode all known classes in one call and pair each
# class name with its integer code.
print("标签 → 数字 映射:")
for label, code in zip(le.classes_, le.transform(le.classes_)):
    print(f"  {label} → {code}")

标签 → 数字 映射:
  Colon_Cancer → 0
  Liver_Cancer → 1
  Lung_Cancer → 2
  Stomach_Cancer → 3
  Thyroid_Cancer → 4


In [3]:
# Encode each file's text labels with the shared encoder, store the result in a
# label_encoded column, and overwrite the CSV at the same path.
for csv_name in FILES:
    csv_path = f"{BASE_PATH}{csv_name}"
    frame = pd.read_csv(csv_path, encoding="utf-8")
    text_labels = frame["label"].astype(str)
    frame["label_encoded"] = le.transform(text_labels)
    # index=False keeps the DataFrame index out of the written file.
    frame.to_csv(csv_path, index=False, encoding="utf-8")
    row_count = len(frame)
    print(f"{csv_name}: 已写入 label_encoded 列，共 {row_count} 行")

Colon_Cancer.csv: 已写入 label_encoded 列，共 200 行
Liver_Cancer.csv: 已写入 label_encoded 列，共 200 行
Lung_Cancer.csv: 已写入 label_encoded 列，共 200 行
Stomach_Cancer.csv: 已写入 label_encoded 列，共 200 行
Thyroid_Cancer.csv: 已写入 label_encoded 列，共 200 行


In [4]:
# Read every CSV listed in FILES and stack them into one dataset.
all_data = [
    pd.read_csv(f"{BASE_PATH}{csv_name}", encoding="utf-8")
    for csv_name in FILES
]

# ignore_index=True gives the combined frame a fresh 0..N-1 index instead of
# repeating each file's own row numbers.
data = pd.concat(all_data, ignore_index=True)

total_rows = len(data)
label_distribution = data["label"].value_counts().sort_index()
print(f"合并后总数据: {total_rows} 行")
print(f"各类别分布:\n{label_distribution}")

合并后总数据: 1000 行
各类别分布:
label
Colon_Cancer      200
Liver_Cancer      200
Lung_Cancer       200
Stomach_Cancer    200
Thyroid_Cancer    200
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

# Features (X) come from the cleaned_text column; targets (y) are the
# integer-encoded labels.
X = data["cleaned_text"]
y = data["label_encoded"]

# 80% train / 20% test split.
#   stratify=y      -> keep the class proportions balanced across both subsets
#   random_state=42 -> make the split reproducible
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_distribution = y_train.value_counts().sort_index()
test_distribution = y_test.value_counts().sort_index()

print(f"训练集大小: {len(X_train)} 行")
print(f"测试集大小: {len(X_test)} 行")
print(f"\n训练集类别分布:\n{train_distribution}")
print(f"\n测试集类别分布:\n{test_distribution}")

训练集大小: 800 行
测试集大小: 200 行

训练集类别分布:
label_encoded
0    160
1    160
2    160
3    160
4    160
Name: count, dtype: int64

测试集类别分布:
label_encoded
0    40
1    40
2    40
3    40
4    40
Name: count, dtype: int64


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configuration:
#   ngram_range=(1, 2) -> extract single words (unigrams) and word pairs (bigrams)
#   max_features=2000  -> cap the vocabulary at 2000 terms
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)

# Fit on the training split only, then apply the fitted vectorizer to both
# splits so nothing is learned from the test texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Look the vocabulary up once and reuse it for both summary lines.
feature_names = tfidf.get_feature_names_out()

print("TF-IDF 特征矩阵:")
print(f"  训练集: {X_train_tfidf.shape} (样本数 × 特征数)")
print(f"  测试集: {X_test_tfidf.shape}")
print(f"\n提取的特征数量: {len(feature_names)}")
print(f"特征示例 (前 20 个): {feature_names[:20].tolist()}")

TF-IDF 特征矩阵:
  训练集: (800, 2000) (样本数 × 特征数)
  测试集: (200, 2000)

提取的特征数量: 2000
特征示例 (前 20 个): ['abdominal', 'ability', 'ablation', 'able', 'abstract', 'abstract jats', 'abstracttext', 'abstracttext abstracttext', 'abstracttext label', 'ac', 'accepted', 'according', 'accounting', 'accounts', 'accumulating', 'accurate', 'ace', 'achieved', 'acid', 'acids']


In [7]:
# ========== Output verification ==========
# Prints three sanity reports: the shapes of the split arrays, the
# label-name -> integer-code mapping, and the per-class sample counts of the
# train and test splits.

rule = "=" * 60

print(rule)
print("数据集形状验证")
print(rule)

print("\n原始文本:")
print(f"  X_train shape: {X_train.shape}")
print(f"  X_test shape: {X_test.shape}")

print("\nTF-IDF 特征矩阵:")
print(f"  X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"  X_test_tfidf shape: {X_test_tfidf.shape}")

print("\n标签:")
print(f"  y_train shape: {y_train.shape}")
print(f"  y_test shape: {y_test.shape}")

# Encode every known class once; the pairs are reused by both reports below.
class_codes = le.transform(le.classes_)

print("\n" + rule)
print("标签编码对应关系")
print(rule)
print(f"\n{'标签名称':<20} {'编码值'}")
print("-" * 60)
for label, encoded in zip(le.classes_, class_codes):
    print(f"{label:<20} {encoded}")

print("\n" + rule)
print("数据集类别分布验证")
print(rule)
# Iterate over the encoder's own classes instead of a hard-coded range(5), so
# the report stays correct if the number of classes changes. One loop serves
# both splits rather than two copy-pasted loops.
for split_name, y_split in (("训练集", y_train), ("测试集", y_test)):
    print(f"\n{split_name} (共 {len(y_split)} 样本):")
    for code, label_name in zip(class_codes, le.classes_):
        count = (y_split == code).sum()
        percentage = count / len(y_split) * 100
        print(f"  {label_name:<20} {count:>3} 样本 ({percentage:.1f}%)")

print("\n" + rule)

数据集形状验证

原始文本:
  X_train shape: (800,)
  X_test shape: (200,)

TF-IDF 特征矩阵:
  X_train_tfidf shape: (800, 2000)
  X_test_tfidf shape: (200, 2000)

标签:
  y_train shape: (800,)
  y_test shape: (200,)

标签编码对应关系

标签名称                 编码值
------------------------------------------------------------
Colon_Cancer         0
Liver_Cancer         1
Lung_Cancer          2
Stomach_Cancer       3
Thyroid_Cancer       4

数据集类别分布验证

训练集 (共 800 样本):
  Colon_Cancer         160 样本 (20.0%)
  Liver_Cancer         160 样本 (20.0%)
  Lung_Cancer          160 样本 (20.0%)
  Stomach_Cancer       160 样本 (20.0%)
  Thyroid_Cancer       160 样本 (20.0%)

测试集 (共 200 样本):
  Colon_Cancer          40 样本 (20.0%)
  Liver_Cancer          40 样本 (20.0%)
  Lung_Cancer           40 样本 (20.0%)
  Stomach_Cancer        40 样本 (20.0%)
  Thyroid_Cancer        40 样本 (20.0%)

