In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


# 1政策文本嵌入

In [None]:
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

def generate_embeddings(input_file_path, output_file_path):
    """
    Load policy records, drop entries without a usable description, embed the
    remaining texts with Sentence-BERT and save the enriched records as JSON.

    The input file is read and filtered *before* the embedding model is loaded,
    so a missing/invalid input or an empty selection fails fast without paying
    for the model load.

    Args:
        input_file_path (str): Path of the input JSON file. It must contain a
            JSON list of policy objects (dicts).
        output_file_path (str): Path of the JSON file to write.

    Returns:
        None. Progress and errors are reported with ``print``; on any input
        error, or when no valid policy remains, the function returns early
        without writing the output file.
    """
    # 1. Read the raw JSON data.
    try:
        with open(input_file_path, 'r', encoding='utf-8') as f:
            all_policies = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {input_file_path}")
        return

    # Iterating a dict would yield its keys (strings), which have no .get().
    if not isinstance(all_policies, list):
        print(f"Error: Expected a JSON list of policy records in {input_file_path}")
        return
    print(f"Successfully loaded {len(all_policies)} total records from {input_file_path}")

    # 2. Filter the records and build the text to embed.
    valid_policies = []
    texts_to_embed = []
    print("Filtering policies and preparing text for embedding...")
    for policy in all_policies:
        if not isinstance(policy, dict):
            # Skip malformed entries instead of crashing on .get().
            continue
        short_desc = policy.get('ShortDescription')
        # Keep only records whose ShortDescription is non-empty and not just whitespace.
        if short_desc and str(short_desc).strip():
            # Combine the original-language name and the short description;
            # a missing/None name is replaced by an empty string.
            name_orig = policy.get('NameOriginalLanguage', '') or ''
            texts_to_embed.append(f"{name_orig}. {short_desc}")
            valid_policies.append(policy)

    print(f"Found {len(valid_policies)} valid policies to process.")
    if not valid_policies:
        print("No valid policies to process. Exiting.")
        return

    # 3. Load the pre-trained model only now that there is work to do.
    print("Loading Sentence-BERT model: all-MiniLM-L6-v2...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embedding_dim = model.get_sentence_embedding_dimension()
    print(f"Model loaded successfully. Embedding dimension: {embedding_dim}")

    # 4. Generate the embeddings (the library shows its own progress bar).
    print("Generating embeddings for all valid policies. This may take a while...")
    embeddings = model.encode(
        texts_to_embed,
        show_progress_bar=True,
        batch_size=32  # tune to the available hardware
    )

    # 5. Attach each vector to its record. JSON cannot serialise NumPy arrays,
    #    so every vector is converted to a plain Python list.
    for policy, vector in zip(valid_policies, embeddings):
        policy['embed'] = vector.tolist()

    # 6. Save the records together with their embeddings.
    print(f"\nSaving data with embeddings to {output_file_path}...")
    try:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(valid_policies, f, ensure_ascii=False, indent=4)
        print("✅ Success!")
        print(f"Processed {len(valid_policies)} policies and saved to {output_file_path}")
    except Exception as e:
        # Best-effort: report the failure rather than aborting the notebook.
        print(f"Error saving file: {e}")

# --- Script entry point ---
if __name__ == '__main__':
    # Replace these paths with your own file locations.
    # NOTE(review): these are /mnt/c/... local paths, while the first cell of the
    # notebook changes into a Colab Drive directory — confirm they exist in the
    # runtime that is actually used.
    INPUT_JSON_PATH = '/mnt/c/Users/20452/OneDrive/桌面/待办文件夹/【代码】/验证3+5/数据/全量标注数据/gemini_merged_policy_data_with_labels_v2.json'
    OUTPUT_JSON_PATH = '/mnt/c/Users/20452/OneDrive/桌面/待办文件夹/【代码】/验证3+5/1政策文本嵌入/policies_with_embeddings.json'

    generate_embeddings(INPUT_JSON_PATH, OUTPUT_JSON_PATH)

# 2无监督分类

## ctm模型

In [None]:
import json
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
import re

# CTM的核心模型
from contextualized_topic_models.models.ctm import CombinedTM

# 使用 SKLEARN 来进行稳定可靠的文本预处理
from sklearn.feature_extraction.text import CountVectorizer

# 使用 NLTK 来获取基础停用词列表
import nltk
from nltk.corpus import stopwords as stop_words

# Language detection (installed on demand when missing).
try:
    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException
except ImportError:
    print("Warning: langdetect not installed. Installing now...")
    import subprocess
    import sys
    # Run pip through the interpreter that executes this notebook so the package
    # lands in the kernel's environment; a bare `pip` found on PATH may belong
    # to a different Python installation.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'langdetect'])
    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException

# Common non-English words (taken from the original custom stop-word list).
# Built once at import time instead of on every call. Entries containing
# non-ASCII characters can never be reached, because such words are already
# rejected by the letters-only check; they are kept for reference.
_NON_ENGLISH_BLACKLIST = frozenset({
    'российской', 'visokošolskih', 'ministerijos', 'respublikos', 'technologijų',
    'trkiye', 'zukunft', 'innovacin', 'nacional', 'bilim', 'und', 'der', 'dla',
    'ciencia', 'tecnologa', 'tbtak', 'tubitak', 'destekleme', 'arge', 'destek',
    'fondo', 'fonds', 'inteligencia', 'digitale', 'sanayi', 'teknoloji',
    'operacyjny', 'pianoo', 'cooperao', 'entre', 'του', 'desenvolvimento',
    'pesquisa', 'cientfica', 'tecnolgico', 'vlaanderen', 'vlaams', 'voor',
    'piano', 'nazionale', 'burs', 'doktora', 'yurt', 'rencileri', 'salud',
    'investigacin', 'ayudas', 'contratos', 'formacin', 'doctorado', 'programa',
    'nivel', 'alto', 'datos', 'empresarial', 'centro', 'industria', 'industrie',
    'estratégia', 'tecnologia', 'inovação', 'proyectos', 'ciência', 'innovación',
    'tecnología', 'lietuvos', 'на', 'förderung', 'zur', 'für', 'du', 'des',
    'την', 'και', 'για', 'türkiye', 'tübi', 'kosgeb',
})


def is_english_word(word):
    """
    Decide whether ``word`` should be treated as an English word.

    Checks, in order:
    1. Character check - the word must consist solely of ASCII letters a-z/A-Z.
    2. Words of one or two letters are accepted without language detection.
    3. Blacklist - known non-English words are rejected (case-insensitive).
    4. Language detection with ``langdetect``.

    Args:
        word (str): The token to classify.

    Returns:
        bool: True if the word is considered English, False otherwise.
    """
    # fullmatch (rather than match with '$') also rejects a trailing newline,
    # which '$' would silently accept.
    if not re.fullmatch(r'[a-zA-Z]+', word):
        return False

    # Very short words are not sent to the detector (treated as English).
    if len(word) <= 2:
        return True

    if word.lower() in _NON_ENGLISH_BLACKLIST:
        return False

    try:
        return detect(word) == 'en'
    except Exception:
        # Detection failed. The word is known to contain only ASCII letters at
        # this point, so the original "all characters are ASCII" fallback
        # always evaluated to True.
        return True

class CTMDataset(Dataset):
    """Dataset pairing bag-of-words rows with contextual embedding rows.

    Each item is a dict with the keys ``'X_bow'`` and ``'X_contextual'``.
    """

    def __init__(self, X_bow, X_contextual):
        # Both inputs are stored as float tensors.
        self.X_bow = torch.FloatTensor(X_bow)
        self.X_contextual = torch.FloatTensor(X_contextual)

    def __len__(self):
        # The number of samples is the length of the bag-of-words tensor.
        n_samples = len(self.X_bow)
        return n_samples

    def __getitem__(self, index):
        bow_row = self.X_bow[index]
        contextual_row = self.X_contextual[index]
        return dict(X_bow=bow_row, X_contextual=contextual_row)

def run_ctm_with_sklearn(input_file_path, output_file_path, num_topics=20,
                         min_df=5, max_df=0.9, num_epochs=50):
    """
    Train a CombinedTM topic model on pre-computed embeddings and save the
    top keywords of every topic as JSON.

    Text preprocessing (tokenisation, stop-word removal, document-frequency
    filtering) is done with scikit-learn's ``CountVectorizer``.

    Args:
        input_file_path (str): JSON file with the columns
            ``NameOriginalLanguage``, ``ShortDescription`` and ``embed``.
        output_file_path (str): Path of the JSON file to write.
        num_topics (int): Number of topics to learn.
        min_df (int | float): A term must occur in at least this many
            documents to be kept (passed to ``CountVectorizer``).
        max_df (int | float): Terms occurring in more than this share of the
            documents are dropped (passed to ``CountVectorizer``). The default
            0.9 is the value the vectorizer was actually called with before
            this was made a parameter.
        num_epochs (int): Number of training epochs for CombinedTM.

    Returns:
        None. Progress and errors are reported with ``print``; if the input
        cannot be loaded the function returns early.
    """
    # --- 1. Load the data ---
    print(f"Loading data and pre-computed embeddings from {input_file_path}...")
    try:
        df = pd.read_json(input_file_path)
        print(f"Loaded {len(df)} documents")
        documents = (df['NameOriginalLanguage'].fillna('') + ". " + df['ShortDescription'].fillna('')).astype(str).tolist()
        sbert_embeddings = np.array(df['embed'].tolist())  # straight to a NumPy array
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    # --- 2. Text preprocessing with scikit-learn ---
    print("Preprocessing text using Scikit-learn's CountVectorizer...")
    # Download the NLTK stop-word corpus on first use.
    try:
        stop_words.words("english")
    except LookupError:
        nltk.download('stopwords')

    base_stopwords = list(stop_words.words("english"))
    domain_specific_stopwords = [
        # Original custom stop-word list
        'para', 'use', 'based', 'provide', 'new', 'including', 'public', 'sector', 'countries', 'value',
        'mainstreaming', 'access', 'activities', 'capacities', 'change', 'opportunities', 'quality', 'level',
        'smes', 'sme', 'ist', 'national', 'international', 'federal', 'regional', 'european', 'российской', 'visokošolskih',
        'canada', 'canadian', 'china', 'ge', 'ar', 'tak', 'ministerijos', 'nr', 'respublikos', 'technologijų',
        'turkish', 'polish', 'russian', 'ukraine', 'korean', 'kazakhstan', 'bulgaria', 'greece', 'cyprus',  'ist', 'bas',
        'hellenic', 'portugal', 'turkey', 'trkiye', 'austria', 'austrian', 'thailand', 'thai', 'german', '78', '65', 'sure',
        'germany', 'malta', 'flanders', 'flemish', 'romanian', 'japanese', 'japan', 'india', 'zealand', 'zukunft',
        'brasil', 'la', 'innovacin', 'nacional', 've', 'bilim', 'fr', 'und', 'der', 'en', 'na', 'dla', 'vi',
        'ciencia', 'tecnologa', 'tbtak', 'tubitak', 'destekleme', 'arge', 'destek', 'fondo', 'fonds', 'og',
        'inteligencia', 'digitale', 'sanayi', 'teknoloji', 'operacyjny', 'pianoo', 'cooperao', 'entre', 'του',
        'em', 'desenvolvimento', 'pesquisa', 'cientfica', 'tecnolgico', 'vlaanderen', 'vlaams', 'voor', 'costa',
        'piano', 'nazionale', 'burs', 'doktora', 'yk', 'yurt', 'rencileri', 'salud', 'investigacin', 'ayudas',
        'contratos', 'formacin', 'doctorado', 'programa', 'nivel', 'alto', 'datos', 'empresarial', 'centro',
        'industria', 'industrie', 'ministry', 'agency', 'council', '2016', '2020',
        '3a', 'also', 'czech', 'de', 'el', 'da', 'del', 'czech', 'estratégia', 'tecnologia', 'inovação',
        'proyectos', 'ciência', 'innovación', 'tecnología', 'ir', 'di', 'lietuvos', '2010', 'three', 'на',
        '000', 'förderung', 'zur', 'für', '19', 'le', 'du', 'des', 'et', 'br', 'την', 'και', 'για', 'türkiye',
        'tübi', '2019', '2021', '2017', 'bas', '78', '65', 'beis', 'ups', 'ju', 'rica', 'es', 'kosgeb',
    ]

    # Keep only the entries that is_english_word() accepts.
    # NOTE(review): this removes the foreign-language and numeric entries from
    # the stop-word list, so those tokens are NOT filtered from the vocabulary
    # - confirm that this is the intended effect.
    print("Filtering non-English words from stopwords...")
    english_domain_stopwords = []
    filtered_count = 0

    for word in domain_specific_stopwords:
        if is_english_word(word):
            english_domain_stopwords.append(word)
        else:
            filtered_count += 1
            print(f"  Filtered out non-English word: '{word}'")

    print(f"Filtered out {filtered_count} non-English words from domain stopwords")
    print(f"Remaining English domain stopwords: {len(english_domain_stopwords)}")

    # Merge the generic and the domain-specific English stop words.
    final_stopwords = base_stopwords + english_domain_stopwords

    # The vectorizer is the core of the preprocessing. min_df / max_df now
    # come from the function parameters, so the documented thresholds and the
    # thresholds actually applied can no longer drift apart.
    vectorizer = CountVectorizer(stop_words=final_stopwords, min_df=min_df, max_df=max_df)

    # Build the bag-of-words matrix in one step.
    bow_matrix = vectorizer.fit_transform(documents)

    # Vocabulary, indexed consistently with the BoW columns.
    vocab = vectorizer.get_feature_names_out()

    # --- 3. Build the CTM-compatible dataset ---
    print("Creating CTM-compatible dataset...")

    # CTMDataset stores dense float tensors, so densify the sparse matrix.
    bow_dense = bow_matrix.toarray()

    training_dataset = CTMDataset(bow_dense, sbert_embeddings)
    # Attach the vocabulary manually: get_topic_lists() looks up an attribute
    # named idx2token on the training dataset to map indices to words.
    training_dataset.idx2token = vocab

    # --- 4. Initialise and train the model ---
    print(f"Initializing and training CombinedTM with {num_topics} topics...")
    ctm = CombinedTM(
        bow_size=len(vocab),
        contextual_size=sbert_embeddings.shape[1],
        n_components=num_topics,
        num_epochs=num_epochs
    )

    ctm.fit(training_dataset)
    print("Model training complete.")

    # --- 5. Extract and save the results ---
    print("Extracting and formatting topics...")
    # Top 15 keywords per topic.
    topic_keywords = ctm.get_topic_lists(15)

    output_data = [
        {"topic_id": i, "keywords": keywords}
        for i, keywords in enumerate(topic_keywords)
    ]

    print(f"Saving results to {output_file_path}...")
    try:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False, indent=4)
        print("✅ Success!")
        print(f"Contextualized Topic Model results saved to {output_file_path}")
    except Exception as e:
        # Best-effort: report the failure rather than aborting the notebook.
        print(f"Error saving JSON file: {e}")

# --- Script entry point ---
if __name__ == '__main__':
    # Input: the JSON produced by the embedding step; output: topic keywords.
    # NOTE(review): /mnt/c/... local paths — confirm they exist in the runtime
    # that is actually used.
    INPUT_JSON_PATH = '/mnt/c/Users/20452/OneDrive/桌面/待办文件夹/【代码】/验证3+5/1政策文本嵌入/policies_with_embeddings.json'
    OUTPUT_JSON_PATH = '/mnt/c/Users/20452/OneDrive/桌面/待办文件夹/【代码】/验证3+5/2无监督分类/ctm_results_sklearn.json'
    NUMBER_OF_TOPICS = 20

    run_ctm_with_sklearn(INPUT_JSON_PATH, OUTPUT_JSON_PATH, num_topics=NUMBER_OF_TOPICS)

# 3有监督分类

## 下载模型

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- Model selection ---
# Hub identifier of the model to fetch and the directory the local copy is
# written to.
MODEL_NAME = 'microsoft/deberta-v3-base'
SAVE_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/deberta-v3-base-local'
# --- End of model selection ---

print(f"Downloading model and tokenizer for '{MODEL_NAME}'...")
print("This is a large model and may take several minutes...")

# Download and save the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(SAVE_DIRECTORY)

# Download and save the model (loaded with a sequence-classification head)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.save_pretrained(SAVE_DIRECTORY)

print(f"\nModel and tokenizer saved successfully to '{SAVE_DIRECTORY}'")

## 两个模型的训练

In [None]:
import os
# Set before the training libraries are imported below.
os.environ["WANDB_DISABLED"] = "true"
# ================== Very top of the script (corrected version) ==================
import warnings

# Ignore every FutureWarning raised from scikit-learn.
# The built-in FutureWarning class is used directly; no import is needed.
warnings.filterwarnings("ignore", category=FutureWarning, module='sklearn')

print("========= DEBUG: SCRIPT VERSION V5 - CORRECTED WARNING FILTER =========")

import json
import numpy as np
import pandas as pd
import torch
import lightgbm as lgb
import matplotlib
matplotlib.use('Agg') # Set the backend before importing pyplot
import matplotlib.pyplot as plt
import seaborn as sns
# At the top of your file
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DebertaV2Tokenizer
from datasets import Dataset
# ================== Print the version of the library actually imported ==================
import transformers
print(f"========= DEBUG: IMPORTED TRANSFORMERS VERSION: {transformers.__version__} =========")
# =======================================================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight # Added for CustomTrainer
from torch import nn # Added for CustomTrainer

# NOTE(review): this repeats most of the `from transformers import ...` line
# above; only DataCollatorWithPadding and EarlyStoppingCallback are new here.
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer,
    TrainingArguments, DebertaV2Tokenizer, DataCollatorWithPadding,
    EarlyStoppingCallback  # <-- 1. EarlyStoppingCallback import added here
)
# ==============================================================================
# 0. 配置区域
# ==============================================================================
class Config:
    """Central settings for the supervised classification pipeline."""

    # Input data and split settings.
    INPUT_JSON_PATH = '/content/drive/MyDrive/Colab Notebooks/policies_with_embeddings.json'
    RANDOM_STATE = 42
    TEST_SIZE = 0.1
    VALIDATION_SIZE = 0.1

    # Base LightGBM hyper-parameters; the training function copies and
    # adjusts them per task.
    LGBM_PARAMS = dict(
        objective='multiclass',
        metric=['multi_logloss'],  # Note: Early stopping will use this.
        n_estimators=5000,
        learning_rate=0.01,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        num_leaves=21,
        reg_alpha=0.1,
        reg_lambda=0.1,
        bagging_freq=5,
        verbose=-1,
        n_jobs=-1,
        seed=RANDOM_STATE,
        boosting_type='gbdt',
        # is_unbalance=True,  # Alternative to class_weight, try if 'balanced' isn't enough
    )

    # Local directory holding the pre-downloaded transformer checkpoint.
    TRANSFORMER_MODEL = '/content/drive/MyDrive/Colab Notebooks/deberta-v3-base-local'

    # Keyword arguments forwarded to transformers.TrainingArguments.
    TRAINING_ARGS = dict(
        output_dir='/content/drive/MyDrive/Colab Notebooks/results',
        num_train_epochs=25,
        learning_rate=1e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        warmup_ratio=0.1,
        weight_decay=0.05,
        logging_dir='./logs',
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        greater_is_better=True,
        report_to="none",
    )

# ==============================================================================
# Custom Trainer for Weighted Loss (MODIFIED FOR COMPATIBILITY)
# ==============================================================================
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross entropy.

    The weights are sklearn's ``'balanced'`` class weights computed from the
    ``'label'`` column of the training dataset. They are computed once and
    cached, instead of being recomputed from the whole training set on every
    call of ``compute_loss`` (i.e. on every training/evaluation step).
    """

    def _balanced_class_weights(self, num_classes):
        """Return cached balanced class weights as a CPU float tensor.

        Returns None (plain, unweighted loss) when the training dataset has no
        usable non-negative integer ``'label'`` column.
        """
        cached = getattr(self, "_class_weight_cache", None)
        if cached is not None and cached[0] == num_classes:
            return cached[1]

        weights = None
        dataset = self.train_dataset
        if dataset and hasattr(dataset, 'features') and 'label' in dataset.features:
            train_labels = np.array(dataset['label'])
            labels_usable = (
                len(train_labels) > 0
                and np.issubdtype(train_labels.dtype, np.integer)
                and np.all(train_labels >= 0)
            )
            if labels_usable:
                class_weights_array = compute_class_weight(
                    class_weight='balanced',
                    classes=np.arange(num_classes),
                    y=train_labels
                )
                weights = torch.tensor(class_weights_array, dtype=torch.float)

        # The class count is cached alongside the weights so that a stale
        # entry is never reused for a different number of classes.
        self._class_weight_cache = (num_classes, weights)
        return weights

    # **kwargs absorbs extra arguments that newer versions of the base Trainer
    # pass to compute_loss.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs (dict): Batch produced by the data collator. The
                ``"labels"`` entry is removed before the forward pass.
            return_outputs (bool): If True, return ``(loss, outputs)``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The class count and device are taken from the logits themselves, so
        # the loss does not depend on model.config / model.device being
        # available on the object passed in.
        num_labels = logits.shape[-1]
        weights = self._balanced_class_weights(num_labels)
        if weights is not None:
            weights = weights.to(logits.device)

        # weight=None gives the ordinary, unweighted cross entropy.
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


# ==============================================================================
# 1. 数据加载与准备 (No changes)
# ==============================================================================
def load_and_prepare_data(config):
    """Load the policy data, derive labels and split it into train/val/test.

    Adds the columns ``text`` (name + ". " + short description),
    ``primary_label_str`` ('Macro'/'Micro'), ``primary_label`` and
    ``secondary_label`` (label-encoded). Both splits are stratified on
    ``secondary_label``.

    Args:
        config: Object providing INPUT_JSON_PATH, TEST_SIZE, VALIDATION_SIZE
            and RANDOM_STATE.

    Returns:
        tuple: (full frame, train frame, validation frame, test frame,
        primary label encoder, secondary label encoder).
    """
    print("--- Step 0: Loading and Preparing Data ---")
    df = pd.read_json(config.INPUT_JSON_PATH)

    name_part = df['NameOriginalLanguage'].fillna('')
    description_part = df['ShortDescription'].fillna('')
    df['text'] = name_part + ". " + description_part

    # Fine-grained labels that belong to the coarse 'Macro' group; every other
    # label is mapped to 'Micro'.
    macro_cats = ['Guideline_Strategy', 'Planning_Layout', 'Institutional_Arrangements']

    def _coarse_group(label):
        return 'Macro' if label in macro_cats else 'Micro'

    df['primary_label_str'] = df['ClassificationLabel'].apply(_coarse_group)

    le_primary = LabelEncoder()
    df['primary_label'] = le_primary.fit_transform(df['primary_label_str'])
    le_secondary = LabelEncoder()
    df['secondary_label'] = le_secondary.fit_transform(df['ClassificationLabel'])

    # First carve off the test set ...
    train_val_df, test_df = train_test_split(
        df,
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
        stratify=df['secondary_label'],
    )
    # ... then split the rest; the validation share is rescaled so that it is
    # VALIDATION_SIZE of the full data set.
    relative_val_size = config.VALIDATION_SIZE / (1 - config.TEST_SIZE)
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=relative_val_size,
        random_state=config.RANDOM_STATE,
        stratify=train_val_df['secondary_label'],
    )

    print(f"Data Split: Train={len(train_df)}, Validation={len(val_df)}, Test={len(test_df)}")
    return df, train_df, val_df, test_df, le_primary, le_secondary

# ==============================================================================
# 2. 辅助函数（评估与可视化）(No changes)
# ==============================================================================
def evaluate_model(y_true, y_pred, labels, title):
    """Print evaluation metrics and save a confusion-matrix heat map.

    Args:
        y_true: True class indices. Every caller in this notebook passes
            LabelEncoder-encoded integers in ``0 .. len(labels) - 1``.
        y_pred: Predicted class indices, encoded the same way.
        labels: Class names; ``labels[i]`` is the name of class index ``i``.
        title (str): Used in the printed headings, the plot title and the
            file name of the saved figure.

    Returns:
        dict: ``{'accuracy': ..., 'f1': ...}`` where ``f1`` is the macro F1.
    """
    # Explicit class indices keep the report rows and the confusion-matrix
    # rows/columns aligned with `labels` even when a class is absent from both
    # y_true and y_pred. Without them the matrix shrinks and the tick labels
    # (and target_names) no longer line up.
    label_ids = np.arange(len(labels))

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    print(f"\n--- Evaluation Results for: {title} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Macro F1-Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, labels=label_ids, target_names=labels, zero_division=0))

    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    # The figure grows with the number of classes, with a minimum size.
    plt.figure(figsize=(max(8, len(labels)*0.9), max(6, len(labels)*0.7)))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(f'Confusion Matrix - {title}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    # File name: the title with spaces replaced by underscores and parentheses removed.
    filename = f"confusion_matrix_{title.replace(' ', '_').replace('(', '').replace(')', '')}.png"
    plt.savefig(filename, bbox_inches='tight')
    plt.close()
    print(f"Confusion matrix saved to {filename}")
    return {'accuracy': accuracy, 'f1': f1}

# ==============================================================================
# 3. LightGBM 训练器 (No changes)
# ==============================================================================
def train_evaluate_lgbm(train_df, val_df, test_df, label_col, le, title, apply_class_weighting=False):
    """Train a LightGBM classifier on the stored embeddings and evaluate it.

    Args:
        train_df, val_df, test_df: Frames with an ``embed`` column and the
            label column ``label_col``.
        label_col (str): Name of the encoded label column.
        le: Fitted label encoder; ``le.classes_`` provides the class names and
            their count.
        title (str): Name used in log output and in the evaluation report.
        apply_class_weighting (bool): If True, train with
            ``class_weight='balanced'``.

    Returns:
        tuple: (fitted LGBMClassifier, metrics dict from ``evaluate_model``).
    """
    print(f"\n--- Training LightGBM for: {title} ---")

    def _features_and_target(frame):
        # Stack the per-row embedding lists into a 2-D array.
        return np.array(frame['embed'].tolist()), frame[label_col]

    X_train, y_train = _features_and_target(train_df)
    X_val, y_val = _features_and_target(val_df)
    X_test, y_test = _features_and_target(test_df)

    params = Config.LGBM_PARAMS.copy()
    num_classes = len(le.classes_)
    is_binary = num_classes == 2

    # Pick the objective/metric matching the number of classes.
    if is_binary:
        params['objective'] = 'binary'
        params['metric'] = ['binary_logloss']
        params.pop('num_class', None)
    else:
        params['objective'] = 'multiclass'
        params['metric'] = ['multi_logloss']
        params['num_class'] = num_classes

    if apply_class_weighting:
        params['class_weight'] = 'balanced'
        if is_binary:
            print(f"Applied class_weight='balanced' to LGBM (binary) for {title}.")
        else:
            print(f"Applied class_weight='balanced' to LGBM (multiclass) for {title}.")

    # Early stopping monitors the validation set (100 rounds of patience).
    stopper = lgb.early_stopping(100, verbose=False)
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[stopper])

    y_pred = model.predict(X_test)
    metrics = evaluate_model(y_test, y_pred, le.classes_, title)
    return model, metrics

# ==============================================================================
# 4. Transformer (DeBERTa) 训练器 (最终报错修正版)
# ==============================================================================
def train_evaluate_transformer(train_df, val_df, test_df, label_col, le, title, use_custom_trainer=False):
    """Fine-tune the configured transformer for one task and evaluate it.

    Args:
        train_df, val_df, test_df: Frames with a ``text`` column and the label
            column ``label_col``.
        label_col (str): Name of the encoded label column (exposed to the
            datasets as ``label``).
        le: Fitted label encoder; ``le.classes_`` gives the class names and
            their count.
        title (str): Name used in log output, the evaluation report and the
            checkpoint directory.
        use_custom_trainer (bool): If True, train with ``CustomTrainer``
            (class-weighted loss) instead of the stock ``Trainer``.

    Returns:
        tuple: (trained trainer with ``.tokenizer`` attached, metrics dict
        from ``evaluate_model`` on the test set).
    """
    print(f"\n--- Training Transformer for: {title} ---")
    train_dataset = Dataset.from_pandas(train_df[['text', label_col]].rename(columns={label_col: 'label'}))
    val_dataset = Dataset.from_pandas(val_df[['text', label_col]].rename(columns={label_col: 'label'}))
    test_dataset = Dataset.from_pandas(test_df[['text', label_col]].rename(columns={label_col: 'label'}))

    tokenizer = DebertaV2Tokenizer.from_pretrained(Config.TRANSFORMER_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.TRANSFORMER_MODEL, num_labels=len(le.classes_), ignore_mismatched_sizes=True
    )

    def tokenize_function(examples):
        # Truncate only. Padding is left to DataCollatorWithPadding below, which
        # pads each batch to its own longest sequence; padding every example to
        # 512 tokens here would make that dynamic padding pointless and waste
        # compute on padding tokens.
        return tokenizer(examples['text'], truncation=True, max_length=512)

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    def compute_metrics(eval_pred):
        # Accuracy and macro F1; 'f1' is the metric named in Config.TRAINING_ARGS
        # for selecting the best checkpoint.
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {
            'accuracy': accuracy_score(labels, predictions),
            'f1': f1_score(labels, predictions, average='macro', zero_division=0),
        }

    training_args_dict = Config.TRAINING_ARGS.copy()
    # early_stopping_patience is not a TrainingArguments field; drop it if it
    # was ever added to the config (patience is set on the callback instead).
    if 'early_stopping_patience' in training_args_dict:
        del training_args_dict['early_stopping_patience']

    # One checkpoint directory per task so the three models do not overwrite
    # each other. This replaces the configured 'output_dir'.
    unique_output_dir = f"./results/{title.replace(' ', '_')}"
    training_args_dict['output_dir'] = unique_output_dir
    training_args = TrainingArguments(**training_args_dict)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

    TrainerClass = CustomTrainer if use_custom_trainer else Trainer

    trainer = TrainerClass(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[early_stopping_callback]
    )

    # Attach the tokenizer to the trainer so evaluate_hierarchical_pipeline can
    # reach it as `<trainer>.tokenizer`.
    trainer.tokenizer = tokenizer

    trainer.train()

    print(f"\nEvaluating Transformer on test set for: {title}")
    test_predictions = trainer.predict(test_dataset)
    y_pred = np.argmax(test_predictions.predictions, axis=-1)
    y_true = test_dataset['label']
    metrics = evaluate_model(y_true, y_pred, le.classes_, title)
    return trainer, metrics

# ==============================================================================
# 5. 分层管道评估 (No changes)
# ==============================================================================
def evaluate_hierarchical_pipeline(dispatcher, macro_specialist, micro_specialist, test_df, le_primary, le_secondary, model_type):
    """Evaluate the two-stage (dispatcher -> specialist) pipeline end to end.

    For every test row the dispatcher predicts 'Macro' or 'Micro'; the matching
    specialist then predicts the fine-grained label, which is mapped back into
    the global label space of ``le_secondary`` before scoring.

    Args:
        dispatcher: Stage-1 model. For ``model_type == 'lightgbm'`` an object
            with ``.predict``; otherwise an object with ``.model`` and
            ``.tokenizer``.
        macro_specialist, micro_specialist: Stage-2 models of the same kind,
            each carrying its own label encoder in ``.le``.
        test_df: Frame with ``secondary_label`` plus ``embed`` (LightGBM) or
            ``text`` (transformer).
        le_primary: Encoder of the Macro/Micro labels.
        le_secondary: Encoder of the global fine-grained labels.
        model_type (str): ``'lightgbm'`` selects the embedding path; any other
            value selects the transformer path.
    """
    print(f"\n--- Evaluating End-to-End Hierarchical Pipeline for: {model_type.upper()} ---")
    uses_lgbm = model_type == 'lightgbm'
    device = torch.device("cuda" if torch.cuda.is_available() and not uses_lgbm else "cpu")
    if not uses_lgbm:
        for stage in (dispatcher, macro_specialist, micro_specialist):
            stage.model.to(device)

    def _predict_encoded(stage, features):
        # Return the encoded class predicted by `stage` for a single sample.
        if uses_lgbm:
            return stage.predict(features)[0]
        inputs = stage.tokenizer(features, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            logits = stage.model(**inputs).logits
        return torch.argmax(logits, dim=1).item()

    y_true = test_df['secondary_label']
    y_pred_final = []

    for _, row in test_df.iterrows():
        # LightGBM consumes the embedding as a (1, n_features) array, the
        # transformer consumes the raw text.
        features = np.array(row['embed']).reshape(1, -1) if uses_lgbm else row['text']

        # Stage 1: dispatcher prediction
        primary_pred_encoded = _predict_encoded(dispatcher, features)
        primary_pred_label = le_primary.inverse_transform([primary_pred_encoded])[0]

        # Stage 2: specialist prediction
        specialist_trainer = macro_specialist if primary_pred_label == 'Macro' else micro_specialist
        current_specialist_le = specialist_trainer.le
        secondary_pred_encoded_specialist = _predict_encoded(specialist_trainer, features)

        # Specialist-local index -> label string -> global index.
        final_pred_label_str = current_specialist_le.inverse_transform([secondary_pred_encoded_specialist])[0]
        y_pred_final.append(le_secondary.transform([final_pred_label_str])[0])

    evaluate_model(y_true, y_pred_final, le_secondary.classes_, f"End-to-End Pipeline ({model_type.upper()})")

# ==============================================================================
# 6. 主执行流程 (No changes)
# ==============================================================================
def main():
    """Run the LightGBM and (if a GPU is present) the DeBERTa pipelines.

    For each model family a Macro/Micro dispatcher and two specialists are
    trained, then the hierarchical pipeline is evaluated end to end on the
    global test split.

    The specialist train/validation/test sets are the Macro (resp. Micro) rows
    of the *existing* global splits. Re-splitting the full data set for the
    specialists would place rows of the global test set into the specialists'
    training data and inflate the end-to-end score (test leakage).
    """
    print("Starting Supervised Classification Pipeline...")
    config = Config()
    full_df, train_df, val_df, test_df, le_primary, le_secondary = load_and_prepare_data(config)

    print("\n\n" + "="*50)
    print("RUNNING LIGHTGBM PIPELINE")
    print("="*50)

    dispatcher_lgbm, _ = train_evaluate_lgbm(
        train_df, val_df, test_df,
        'primary_label', le_primary, 'LGBM Dispatcher (Macro-Micro)',
        apply_class_weighting=False
    )

    # Specialist label encoders are fitted on all labels of their group.
    macro_df = full_df[full_df['primary_label_str'] == 'Macro']
    micro_df = full_df[full_df['primary_label_str'] == 'Micro']
    le_macro = LabelEncoder().fit(macro_df['ClassificationLabel'])
    le_micro = LabelEncoder().fit(micro_df['ClassificationLabel'])

    def _specialist_subset(split_df, group, encoder):
        # Rows of one global split belonging to `group`, with specialist-local labels.
        subset = split_df[split_df['primary_label_str'] == group].copy()
        subset['specialist_label'] = encoder.transform(subset['ClassificationLabel'])
        return subset

    # The global splits are stratified on the fine-grained label, so each
    # subset keeps the class proportions of its group.
    macro_train_df, macro_val_df, macro_test_df = (
        _specialist_subset(part, 'Macro', le_macro) for part in (train_df, val_df, test_df)
    )
    micro_train_df, micro_val_df, micro_test_df = (
        _specialist_subset(part, 'Micro', le_micro) for part in (train_df, val_df, test_df)
    )

    macro_specialist_lgbm, _ = train_evaluate_lgbm(
        macro_train_df, macro_val_df, macro_test_df,
        'specialist_label', le_macro, 'LGBM Macro Specialist',
        apply_class_weighting=True
    )
    micro_specialist_lgbm, _ = train_evaluate_lgbm(
        micro_train_df, micro_val_df, micro_test_df,
        'specialist_label', le_micro, 'LGBM Micro Specialist',
        apply_class_weighting=True
    )
    # The pipeline evaluation reads each specialist's encoder from `.le`.
    macro_specialist_lgbm.le = le_macro
    micro_specialist_lgbm.le = le_micro
    evaluate_hierarchical_pipeline(dispatcher_lgbm, macro_specialist_lgbm, micro_specialist_lgbm, test_df, le_primary, le_secondary, 'lightgbm')

    print("\n\n" + "="*50)
    print("RUNNING TRANSFORMER (DEBERTA) PIPELINE")
    print("="*50)
    print(f"Using Transformer Model: {Config.TRANSFORMER_MODEL}")
    print("NOTE: This part is computationally expensive and requires a GPU with significant VRAM.")
    if torch.cuda.is_available():
        dispatcher_deberta, _ = train_evaluate_transformer(
            train_df, val_df, test_df, 'primary_label', le_primary, 'DeBERTa Dispatcher (Macro-Micro)', use_custom_trainer=False
        )
        macro_specialist_deberta, _ = train_evaluate_transformer(
            macro_train_df, macro_val_df, macro_test_df, 'specialist_label', le_macro, 'DeBERTa Macro Specialist', use_custom_trainer=True
        )
        micro_specialist_deberta, _ = train_evaluate_transformer(
            micro_train_df, micro_val_df, micro_test_df, 'specialist_label', le_micro, 'DeBERTa Micro Specialist', use_custom_trainer=True
        )
        macro_specialist_deberta.le = le_macro
        micro_specialist_deberta.le = le_micro
        evaluate_hierarchical_pipeline(dispatcher_deberta, macro_specialist_deberta, micro_specialist_deberta, test_df, le_primary, le_secondary, 'transformer')
    else:
        print("\nSkipping Transformer pipeline as no GPU was detected.")

# Script entry point: run the full supervised classification pipeline.
if __name__ == '__main__':
    main()