# 标签编码 (Label Encoding)

使用 `LabelEncoder` 将 label 列（文本标签）转换为数字类别。

In [3]:
# 若报错 No module named 'sklearn'，先运行本单元格安装 scikit-learn
!python3 -m pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 4.5 MB/s eta 0:00:01
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.6.0
  Downloading scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl (30.3 MB)
[K     |████████████████████████████████| 30.3 MB 33.4 MB/s eta 0:00:01
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.6.1 scipy-1.13.1 threadpoolctl-3.6.0
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the per-cancer-type CSV files and merge them into a single DataFrame
# (swap in other CSV files, or one already-merged file, as needed).
files = ['Colon_Cancer.csv', 'Liver_Cancer.csv', 'Lung_Cancer.csv', 'Stomach_Cancer.csv', 'Thyroid_Cancer.csv']
dfs = []
for f in files:
    try:
        # NOTE: on_bad_lines='skip' drops malformed rows without raising.
        part = pd.read_csv(f, encoding='utf-8', on_bad_lines='skip')
    except FileNotFoundError:
        # Best effort: a missing file is reported and skipped.
        print(f'未找到 {f}，跳过')
    else:
        dfs.append(part)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the files that were looked for instead.
if not dfs:
    raise FileNotFoundError(f'未找到任何数据文件: {files}')

df = pd.concat(dfs, ignore_index=True)
print(f'共 {len(df)} 行')
print('列名:', list(df.columns))
print('\nlabel 列取值:', df['label'].unique())

共 1000 行
列名: ['序号', 'label', 'cleaned_text']

label 列取值: ['Colon_Cancer' 'Liver_Cancer' 'Lung_Cancer' 'Stomach_Cancer'
 'Thyroid_Cancer']


In [5]:
# Fit a LabelEncoder on the text labels and store the integer codes in a new column
le = LabelEncoder()
label_codes = le.fit_transform(df['label'])
df['label_encoded'] = label_codes

# Show which integer code each label maps to
print('标签编码映射:')
for code in range(len(le.classes_)):
    print(f'  {le.classes_[code]} -> {code}')

# Preview the original label next to its encoded value
print('\n前 5 行:')
preview = df[['label', 'label_encoded']].head(5)
print(preview)

标签编码映射:
  Colon_Cancer -> 0
  Liver_Cancer -> 1
  Lung_Cancer -> 2
  Stomach_Cancer -> 3
  Thyroid_Cancer -> 4

前 5 行:
          label  label_encoded
0  Colon_Cancer              0
1  Colon_Cancer              0
2  Colon_Cancer              0
3  Colon_Cancer              0
4  Colon_Cancer              0


In [None]:
# Optional: write the DataFrame, including the encoded label column, to a CSV file
output_csv = 'data_with_label_encoded.csv'
df.to_csv(output_csv, encoding='utf-8', index=False)
print(f'已保存为 {output_csv}')

已保存为 data_with_label_encoded.csv


In [10]:
# Split the data: 80% training set, 20% test set
from sklearn.model_selection import train_test_split

X, y = df['cleaned_text'], df['label_encoded']

# Stratify on the encoded label; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Sample count of each split and its share of the full DataFrame
for split_title, split_X in (('训练集样本数', X_train), ('测试集样本数', X_test)):
    split_pct = len(split_X) / len(df) * 100
    print(f'{split_title}: {len(split_X)} ({split_pct:.1f}%)')

# Per-class counts in each split, ordered by encoded label
for dist_title, split_y in (('\n训练集各类别分布:', y_train), ('\n测试集各类别分布:', y_test)):
    print(dist_title)
    print(pd.Series(split_y).value_counts().sort_index())

# Output check: split shapes and the label-to-code mapping
print('\n--- 输出验证 ---')
print(f'X_train.shape: {X_train.shape}')
print(f'X_test.shape:  {X_test.shape}')
print('\n标签编码对应关系:')
for code in range(len(le.classes_)):
    print(f'  {le.classes_[code]} -> {code}')

训练集样本数: 800 (80.0%)
测试集样本数: 200 (20.0%)

训练集各类别分布:
label_encoded
0    160
1    160
2    160
3    160
4    160
Name: count, dtype: int64

测试集各类别分布:
label_encoded
0    40
1    40
2    40
3    40
4    40
Name: count, dtype: int64

--- 输出验证 ---
X_train.shape: (800,)
X_test.shape:  (200,)

标签编码对应关系:
  Colon_Cancer -> 0
  Liver_Cancer -> 1
  Lung_Cancer -> 2
  Stomach_Cancer -> 3
  Thyroid_Cancer -> 4


In [None]:
# Feature engineering: TF-IDF over 1-grams and 2-grams
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)

# Fit on the training split only; the test split is transformed with the fitted vectorizer
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Report the shape of each TF-IDF matrix and the number of feature columns
for split_name, matrix in (('训练集', X_train_tfidf), ('测试集', X_test_tfidf)):
    print(f'{split_name} TF-IDF 矩阵: {matrix.shape}')
n_features = X_train_tfidf.shape[1]
print(f'特征数（词汇表大小）: {n_features}')

训练集 TF-IDF 矩阵: (800, 2000)
测试集 TF-IDF 矩阵: (200, 2000)
特征数（词汇表大小）: 2000
