In [10]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    get_linear_schedule_with_warmup,
    BertForSequenceClassification
)
from torch.optim import AdamW
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import jieba.posseg as pseg
from collections import Counter

# 设置随机种子以确保结果可复现
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# 数据处理部分
class AmbiguityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        if not isinstance(text, str):
            text = str(text)
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def enrich_with_linguistic_features(text):
    words = pseg.cut(text)
    tokens = []
    pos_tags = []
    for word, flag in words:
        tokens.append(word)
        pos_tags.append(flag)
    token_str = ' '.join(tokens)
    pos_str = ' '.join(pos_tags)
    enriched_text = f"{text} [TOKENS] {token_str} [POS] {pos_str}"
    return enriched_text

def print_distribution_stats(labels, types, dataset_name):
    """打印数据集中各类型的分布统计"""
    print(f"\n=== {dataset_name} 数据分布 ===")
    total_count = len(labels)
    
    # 按歧义/非歧义统计
    ambig_count = sum(labels)
    non_ambig_count = total_count - ambig_count
    print(f"总数量: {total_count}")
    print(f"歧义句: {ambig_count} ({ambig_count/total_count*100:.2f}%)")
    print(f"非歧义句: {non_ambig_count} ({non_ambig_count/total_count*100:.2f}%)")
    
    # 按类型统计
    type_counter = Counter(types)
    print(f"\n各类型分布:")
    for type_name, count in sorted(type_counter.items()):
        percentage = count / total_count * 100
        print(f"  {type_name}: {count} ({percentage:.2f}%)")
    
    return type_counter

def prepare_data_with_requirements(file_path):
    """
    按照实验要求处理数据：去重、删除不需要的行、随机排序、划分数据集
    """
    print("正在加载和处理数据...")
    
    # 读取数据集
    df = pd.read_csv(file_path)
    print(f"原始数据量: {len(df)}")
    
    # 0. 去重和删除标记为删除的行
    # 首先基于歧义句去重
    df = df.drop_duplicates(subset=['歧义句'], keep='first')
    print(f"去重后数据量: {len(df)}")
    
    # 删除【是否删除】列不为空的行
    if '是否删除' in df.columns:
        before_delete = len(df)
        df = df[df['是否删除'].isna()]
        print(f"删除标记为删除的行后数据量: {len(df)} (删除了 {before_delete - len(df)} 行)")
    
    # 检查必要的列
    required_columns = ['歧义句', '歧义句消岐1', '歧义句消岐2', '歧义类型']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"数据集缺少必要的列: {col}")
    
    # 2. 随机排序 (种子为0)
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
    print("数据已随机排序")
    
    # 准备训练数据
    # 歧义句作为正样本
    ambiguous_data = df[df['歧义句'].notna()].copy()
    ambiguous_texts = ambiguous_data['歧义句'].apply(enrich_with_linguistic_features).tolist()
    ambiguous_labels = [1] * len(ambiguous_texts)
    # 歧义句保留原始的歧义类型
    ambiguous_types = ambiguous_data['歧义类型'].fillna('未知').tolist()
    
    # 消歧句作为负样本
    disambig_texts = []
    disambig_labels = []
    disambig_types = []
    
    for _, row in df.iterrows():
        ambig_type = row['歧义类型'] if pd.notna(row['歧义类型']) else '未知'
        for col in ['歧义句消岐1', '歧义句消岐2']:
            if pd.notna(row[col]) and str(row[col]).strip():
                disambig_texts.append(enrich_with_linguistic_features(str(row[col])))
                disambig_labels.append(0)
                # 消歧句使用特殊的类型标记，表明它们是对应某种歧义类型的消歧版本
                disambig_types.append(f"非歧义-{ambig_type}")
    
    # 合并所有数据
    all_texts = ambiguous_texts + disambig_texts
    all_labels = ambiguous_labels + disambig_labels
    all_types = ambiguous_types + disambig_types
    
    print(f"歧义句数量: {len(ambiguous_texts)}")
    print(f"消歧句数量: {len(disambig_texts)}")
    print(f"总数据量: {len(all_texts)}")
    
    # 打印整体数据分布
    print_distribution_stats(all_labels, all_types, "整体数据")
    
    # 3. 划分数据集 (种子为2025)，使用改进的分层策略
    # 对于样本数量太少的类别，我们将它们合并到主要类别中进行分层
    type_counts = Counter(all_types)
    min_samples_for_stratify = 3  # 每个类别至少需要3个样本才能进行分层
    
    # 创建分层标签
    stratify_labels = []
    for label, atype in zip(all_labels, all_types):
        if type_counts[atype] >= min_samples_for_stratify:
            # 样本足够的类别使用完整的分层标签
            stratify_labels.append(f"{label}_{atype}")
        else:
            # 样本不足的类别只按歧义/非歧义分层
            stratify_labels.append(f"{label}_其他")
    
    # 检查分层标签的分布
    stratify_counts = Counter(stratify_labels)
    print(f"\n分层标签分布:")
    for label, count in sorted(stratify_counts.items()):
        print(f"  {label}: {count}")
    
    # 首先划分出训练集 (70%)
    train_texts, temp_texts, train_labels, temp_labels, train_types, temp_types, train_strat, temp_strat = train_test_split(
        all_texts, all_labels, all_types, stratify_labels,
        test_size=0.3, stratify=stratify_labels, random_state=2025
    )
    
    # 计算需要的测试集大小，确保至少100条
    min_test_size = 100
    temp_size = len(temp_texts)
    
    if temp_size < min_test_size:
        raise ValueError(f"剩余数据量 ({temp_size}) 不足以创建100条测试集")
    
    # 计算测试集比例，确保至少100条
    test_ratio = max(0.5, min_test_size / temp_size)  # 在临时数据中的比例
    
    # 划分验证集和测试集
    val_texts, test_texts, val_labels, test_labels, val_types, test_types = train_test_split(
        temp_texts, temp_labels, temp_types,
        test_size=test_ratio, stratify=temp_strat, random_state=2025
    )
    
    print(f"\n=== 数据集划分结果 ===")
    print(f"训练集大小: {len(train_texts)} ({len(train_texts)/len(all_texts)*100:.1f}%)")
    print(f"验证集大小: {len(val_texts)} ({len(val_texts)/len(all_texts)*100:.1f}%)")
    print(f"测试集大小: {len(test_texts)} ({len(test_texts)/len(all_texts)*100:.1f}%)")
    
    # 打印各数据集的详细分布
    train_dist = print_distribution_stats(train_labels, train_types, "训练集")
    val_dist = print_distribution_stats(val_labels, val_types, "验证集")  
    test_dist = print_distribution_stats(test_labels, test_types, "测试集")
    
    # 验证分布一致性
    print(f"\n=== 分布一致性验证 ===")
    all_type_names = set(all_types)
    
    for type_name in sorted(all_type_names):
        train_ratio = train_dist.get(type_name, 0) / len(train_texts) * 100
        val_ratio = val_dist.get(type_name, 0) / len(val_texts) * 100
        test_ratio = test_dist.get(type_name, 0) / len(test_texts) * 100
        
        print(f"{type_name}:")
        print(f"  训练集: {train_ratio:.2f}%, 验证集: {val_ratio:.2f}%, 测试集: {test_ratio:.2f}%")
    
    return {
        'train': (train_texts, train_labels, train_types),
        'val': (val_texts, val_labels, val_types),
        'test': (test_texts, test_labels, test_types)
    }

def create_dataloaders(data_dict, tokenizer, batch_size=16):
    """创建训练、验证和测试集的DataLoader"""
    print("正在创建数据加载器...")
    
    train_dataset = AmbiguityDataset(
        texts=data_dict['train'][0],
        labels=data_dict['train'][1],
        tokenizer=tokenizer
    )
    
    val_dataset = AmbiguityDataset(
        texts=data_dict['val'][0],
        labels=data_dict['val'][1],
        tokenizer=tokenizer
    )
    
    test_dataset = AmbiguityDataset(
        texts=data_dict['test'][0],
        labels=data_dict['test'][1],
        tokenizer=tokenizer
    )
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
    
    return train_dataloader, val_dataloader, test_dataloader

def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, num_epochs, patience=3):
    """训练模型并在每个epoch后评估"""
    best_val_f1 = 0
    counter = 0
    best_model = None
    
    history = {
        'train_loss': [],
        'val_accuracy': [],
        'val_precision': [],
        'val_recall': [],
        'val_f1': []
    }
    
    for epoch in range(num_epochs):
        print(f"开始 Epoch {epoch+1}/{num_epochs}...")
        
        # 训练
        model.train()
        train_loss = 0
        
        for batch in tqdm(train_dataloader, desc="训练中"):
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            train_loss += loss.item()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
        
        avg_train_loss = train_loss / len(train_dataloader)
        history['train_loss'].append(avg_train_loss)
        print(f"Epoch {epoch+1} 平均训练损失: {avg_train_loss:.4f}")
        
        # 验证
        val_metrics = evaluate_model(model, val_dataloader)
        val_accuracy, val_precision, val_recall, val_f1 = val_metrics
        
        history['val_accuracy'].append(val_accuracy)
        history['val_precision'].append(val_precision)
        history['val_recall'].append(val_recall)
        history['val_f1'].append(val_f1)
        
        print(f"Epoch {epoch+1} 验证集评估:")
        print(f"  准确率: {val_accuracy:.4f}")
        print(f"  精确率: {val_precision:.4f}")
        print(f"  召回率: {val_recall:.4f}")
        print(f"  F1分数: {val_f1:.4f}")
        
        # 早停检查
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            counter = 0
            best_model = model.state_dict().copy()
        else:
            counter += 1
            if counter >= patience:
                print(f"早停激活，{patience} 个epoch没有提升")
                break
    
    if best_model is not None:
        model.load_state_dict(best_model)
    
    return model, history

def evaluate_model(model, dataloader):
    """评估模型性能"""
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).cpu().numpy()
            
            predictions.extend(pred)
            true_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='binary', zero_division=0
    )
    
    return accuracy, precision, recall, f1

def predict_ambiguity(text, model, tokenizer):
    """预测句子是否有歧义"""
    model.eval()
    
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    pred_class = np.argmax(probs)
    
    result = "歧义句" if pred_class == 1 else "非歧义句"
    probability = probs[pred_class]
    ambig_prob = probs[1]
    non_ambig_prob = probs[0]
    
    return result, probability, ambig_prob, non_ambig_prob

def predict_on_test_set(model, tokenizer, data_dict):
    """在测试集上进行预测并返回结果"""
    print("正在对测试集进行预测...")
    
    test_texts, test_labels, test_types = data_dict['test']
    predictions = []
    probabilities = []
    ambig_probs = []
    non_ambig_probs = []
    
    for text in tqdm(test_texts, desc="预测中"):
        result, prob, ambig_prob, non_ambig_prob = predict_ambiguity(text, model, tokenizer)
        predictions.append(result)
        probabilities.append(prob)
        ambig_probs.append(ambig_prob)
        non_ambig_probs.append(non_ambig_prob)
    
    # 创建测试集DataFrame
    test_df = pd.DataFrame({
        'text': test_texts,
        'true_label': ['歧义句' if label == 1 else '非歧义句' for label in test_labels],
        'ambiguity_type': test_types,
        'predicted_label': predictions,
        'prediction_probability': probabilities,
        'ambiguity_probability': ambig_probs,
        'non_ambiguity_probability': non_ambig_probs
    })
    
    return test_df

def save_datasets_as_tsv(data_dict, output_dir="./datasets"):
    """将数据集保存为TSV文件"""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    split_mapping = {
        'train': 'train.tsv',
        'val': 'dev.tsv',  # 验证集命名为dev.tsv
        'test': 'test.tsv'
    }
    
    for split_name, (texts, labels, types) in data_dict.items():
        df = pd.DataFrame({
            'text': texts,
            'label': ['歧义句' if label == 1 else '非歧义句' for label in labels],
            'ambiguity_type': types
        })
        
        filename = split_mapping[split_name]
        output_path = os.path.join(output_dir, filename)
        df.to_csv(output_path, sep='\t', index=False, encoding='utf-8')
        print(f"已保存 {split_name} 数据集到: {output_path}")

def save_model(model, tokenizer, output_dir):
    """保存模型和分词器"""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    print(f"保存模型到 {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

def load_model(model_path):
    """加载保存的模型和分词器"""
    print(f"从 {model_path} 加载模型...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    return model, tokenizer

def run_complete_pipeline(data_file, model_name, output_dir, num_epochs=10, batch_size=16, 
                         learning_rate=2e-5, weight_decay=0.01):
    """完整的训练流程"""
    # 加载分词器
    print(f"加载分词器: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # 按要求准备数据
    data_dict = prepare_data_with_requirements(data_file)
    
    # 5. 保存划分后的数据集为TSV文件
    save_datasets_as_tsv(data_dict)
    
    # 创建数据加载器
    train_dataloader, val_dataloader, test_dataloader = create_dataloaders(
        data_dict, tokenizer, batch_size=batch_size
    )
    
    # 加载模型
    print(f"加载模型: {model_name}")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    ).to(device)
    
    # 设置优化器和调度器
    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay
    )
    
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    
    # 训练模型
    print("开始训练模型...")
    model, history = train_model(
        model, train_dataloader, val_dataloader, optimizer, scheduler, num_epochs
    )
    
    # 4. 在测试集上进行预测
    test_predictions = predict_on_test_set(model, tokenizer, data_dict)
    
    # 保存测试集预测结果
    test_predictions.to_csv("test_predictions.csv", index=False, encoding='utf-8')
    print("测试集预测结果已保存到: test_predictions.csv")
    
    # 评估测试集性能
    test_metrics = evaluate_model(model, test_dataloader)
    print("测试集结果:")
    print(f"  准确率: {test_metrics[0]:.4f}")
    print(f"  精确率: {test_metrics[1]:.4f}")
    print(f"  召回率: {test_metrics[2]:.4f}")
    print(f"  F1分数: {test_metrics[3]:.4f}")
    
    # 保存模型
    save_model(model, tokenizer, output_dir)
    
    return model, tokenizer, test_predictions

# 主函数执行
if __name__ == "__main__":
    # 配置参数
    model_name = "hfl/chinese-roberta-wwm-ext"
    data_file = r'D:\浏览器\中文文本歧义收集与标注_数据表_歧义收集与标注 (2).csv'  # 修改为您的文件路径
    output_dir = "./ambiguity_detection_model_FINAL"
    
    # 运行完整流程
    model, tokenizer, test_predictions = run_complete_pipeline(
        data_file, 
        model_name, 
        output_dir,
        num_epochs=10,
        batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
    )
    
    print("完整流程执行完成！")
    print("生成的文件:")
    print("- train.tsv: 训练集")
    print("- dev.tsv: 验证集")  
    print("- test.tsv: 测试集")
    print("- test_predictions.csv: 测试集预测结果")
    print(f"- {output_dir}: 训练好的模型")

使用设备: cuda
加载分词器: hfl/chinese-roberta-wwm-ext
正在加载和处理数据...
原始数据量: 995
去重后数据量: 985
删除标记为删除的行后数据量: 964 (删除了 21 行)
数据已随机排序
歧义句数量: 963
消歧句数量: 1926
总数据量: 2889

=== 整体数据 数据分布 ===
总数量: 2889
歧义句: 963 (33.33%)
非歧义句: 1926 (66.67%)

各类型分布:
  词汇-兼类词: 48 (1.66%)
  词汇-同形词: 29 (1.00%)
  词汇-多义词: 174 (6.02%)
  语法-句法歧义: 270 (9.35%)
  语法-语义歧义: 70 (2.42%)
  语用语境-会话含义: 83 (2.87%)
  语用语境-指示语: 51 (1.77%)
  语用语境-社会文化: 129 (4.47%)
  语用语境-言语行为: 109 (3.77%)
  非歧义-词汇-兼类词: 96 (3.32%)
  非歧义-词汇-同形词: 58 (2.01%)
  非歧义-词汇-多义词: 348 (12.05%)
  非歧义-语法-句法歧义: 540 (18.69%)
  非歧义-语法-语义歧义: 140 (4.85%)
  非歧义-语用语境-会话含义: 166 (5.75%)
  非歧义-语用语境-指示语: 102 (3.53%)
  非歧义-语用语境-社会文化: 258 (8.93%)
  非歧义-语用语境-言语行为: 218 (7.55%)

分层标签分布:
  0_非歧义-词汇-兼类词: 96
  0_非歧义-词汇-同形词: 58
  0_非歧义-词汇-多义词: 348
  0_非歧义-语法-句法歧义: 540
  0_非歧义-语法-语义歧义: 140
  0_非歧义-语用语境-会话含义: 166
  0_非歧义-语用语境-指示语: 102
  0_非歧义-语用语境-社会文化: 258
  0_非歧义-语用语境-言语行为: 218
  1_词汇-兼类词: 48
  1_词汇-同形词: 29
  1_词汇-多义词: 174
  1_语法-句法歧义: 270
  1_语法-语义歧义: 70
  1_语用语境-会话含义: 83
  1_语用语境-指示语: 51
  1_

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


开始训练模型...
开始 Epoch 1/10...


训练中: 100%|██████████| 127/127 [00:38<00:00,  3.32it/s]


Epoch 1 平均训练损失: 0.2592
Epoch 1 验证集评估:
  准确率: 0.9192
  精确率: 0.9044
  召回率: 0.8483
  F1分数: 0.8754
开始 Epoch 2/10...


训练中: 100%|██████████| 127/127 [00:50<00:00,  2.49it/s]


Epoch 2 平均训练损失: 0.1825
Epoch 2 验证集评估:
  准确率: 0.9169
  精确率: 0.9658
  召回率: 0.7793
  F1分数: 0.8626
开始 Epoch 3/10...


训练中: 100%|██████████| 127/127 [00:50<00:00,  2.51it/s]


Epoch 3 平均训练损失: 0.1471
Epoch 3 验证集评估:
  准确率: 0.8961
  精确率: 0.7747
  召回率: 0.9724
  F1分数: 0.8624
开始 Epoch 4/10...


训练中: 100%|██████████| 127/127 [00:53<00:00,  2.37it/s]


Epoch 4 平均训练损失: 0.0800
Epoch 4 验证集评估:
  准确率: 0.9284
  精确率: 0.9385
  召回率: 0.8414
  F1分数: 0.8873
开始 Epoch 5/10...


训练中: 100%|██████████| 127/127 [00:54<00:00,  2.33it/s]


Epoch 5 平均训练损失: 0.0496
Epoch 5 验证集评估:
  准确率: 0.9330
  精确率: 0.9462
  召回率: 0.8483
  F1分数: 0.8945
开始 Epoch 6/10...


训练中: 100%|██████████| 127/127 [00:55<00:00,  2.31it/s]


Epoch 6 平均训练损失: 0.0293
Epoch 6 验证集评估:
  准确率: 0.9376
  精确率: 0.9538
  召回率: 0.8552
  F1分数: 0.9018
开始 Epoch 7/10...


训练中: 100%|██████████| 127/127 [00:55<00:00,  2.29it/s]


Epoch 7 平均训练损失: 0.0137
Epoch 7 验证集评估:
  准确率: 0.9492
  精确率: 0.9624
  召回率: 0.8828
  F1分数: 0.9209
开始 Epoch 8/10...


训练中: 100%|██████████| 127/127 [00:55<00:00,  2.30it/s]


Epoch 8 平均训练损失: 0.0035
Epoch 8 验证集评估:
  准确率: 0.9446
  精确率: 0.9116
  召回率: 0.9241
  F1分数: 0.9178
开始 Epoch 9/10...


训练中: 100%|██████████| 127/127 [00:55<00:00,  2.29it/s]


Epoch 9 平均训练损失: 0.0027
Epoch 9 验证集评估:
  准确率: 0.9515
  精确率: 0.9493
  召回率: 0.9034
  F1分数: 0.9258
开始 Epoch 10/10...


训练中: 100%|██████████| 127/127 [00:55<00:00,  2.28it/s]


Epoch 10 平均训练损失: 0.0016
Epoch 10 验证集评估:
  准确率: 0.9515
  精确率: 0.9493
  召回率: 0.9034
  F1分数: 0.9258
正在对测试集进行预测...


预测中: 100%|██████████| 434/434 [00:05<00:00, 73.44it/s]


测试集预测结果已保存到: test_predictions.csv
测试集结果:
  准确率: 0.9470
  精确率: 0.9416
  召回率: 0.8958
  F1分数: 0.9181
保存模型到 ./ambiguity_detection_model_FINAL
完整流程执行完成！
生成的文件:
- train.tsv: 训练集
- dev.tsv: 验证集
- test.tsv: 测试集
- test_predictions.csv: 测试集预测结果
- ./ambiguity_detection_model_FINAL: 训练好的模型


In [11]:
import pandas as pd

# 1. 读取CSV文件
file_path = r'D:\python\Coding\NLP\Classifier\模型训练\Classifier\test_predictions.csv'  # 请根据实际路径修改
df = pd.read_csv(file_path)

# 2. 统计 predicted_label 列中歧义句和非歧义句的数量和比例
label_counts = df['predicted_label'].value_counts()
label_ratios = df['predicted_label'].value_counts(normalize=True) * 100

# 3. 整合统计结果为DataFrame
label_summary = pd.DataFrame({
    '数量': label_counts,
    '比例（%）': label_ratios.round(2)
})

# 4. 输出结果
print(label_summary)



                  数量  比例（%）
predicted_label            
非歧义句             297  68.43
歧义句              137  31.57
