In [11]:
# 安装必要的依赖库
!pip install -q mlflow rouge_score transformers datasets evaluate
!pip install evaluate


# Import necessary toolkits
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from datasets import Dataset
import pandas as pd
import numpy as np
import torch
from evaluate import load as load_metric  # 使用 evaluate 库来加载指标
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import os

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [12]:
def preprocess_data(csv_file):
    df = pd.read_csv(csv_file)
    data = []

    # 根据实际列名设置输入列和标签列
    if 'text' in df.columns:
        text_column = 'text'
    elif 'sentence' in df.columns:
        text_column = 'sentence'
    elif 'prompt' in df.columns:
        text_column = 'prompt'
    else:
        raise ValueError(f"Column for text input not found in {csv_file}. Available columns are: {df.columns}")

    if 'label' not in df.columns:
        raise ValueError(f"Column 'label' not found in {csv_file}. Available columns are: {df.columns}")

    # 将每一行转换为适合T5模型的输入格式
    for _, row in df.iterrows():
        text = f"text: {row[text_column]}"  # 去掉 </s> 标记
        label = str(row['label'])  # 将标签转换为字符串形式
        data.append({"input_text": text, "target_text": label})

    return Dataset.from_pandas(pd.DataFrame(data))

In [13]:
BATCH_SIZE = 8
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True,  # 仅保留一个 load_best_model_at_end 参数
    fp16=True,
    generation_max_length=64,
    report_to="none",  # 设置早停
    metric_for_best_model="eval_loss",  # 评估指标为eval_loss
    greater_is_better=False,  # eval_loss越低越好
)



In [14]:
# 加载分类评估指标（例如 accuracy）
accuracy_metric = load_metric("accuracy")

# 定义评估函数
def compute_metrics(pred):
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    labels_ids = pred.label_ids
    labels_ids = np.where(labels_ids == -100, tokenizer.pad_token_id, labels_ids)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # 将字符串 'True'/'False' 转换为整数 1/0
    def str_to_int(label):
        if label.lower() == 'true':
            return 1
        elif label.lower() == 'false':
            return 0
        else:
            # 如果不是 'True' 或 'False'，尝试直接转换为整数
            try:
                return int(label)
            except ValueError:
                return -1  # 返回 -1 以便识别出无效值

    # 转换预测和标签为整数形式
    pred_int = [str_to_int(p) for p in pred_str]
    label_int = [str_to_int(l) for l in label_str]

    # 过滤掉无效的值
    valid_indices = [i for i, (p, l) in enumerate(zip(pred_int, label_int)) if p != -1 and l != -1]
    pred_int = [pred_int[i] for i in valid_indices]
    label_int = [label_int[i] for i in valid_indices]

    if not pred_int or not label_int:
        # 如果转换后没有有效的数据，返回空结果
        return {
            "accuracy": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "tp": 0,
            "tn": 0,
            "fp": 0,
            "fn": 0
        }

    # 计算混淆矩阵和分类指标
    tn, fp, fn, tp = confusion_matrix(label_int, pred_int).ravel()
    accuracy = accuracy_score(label_int, pred_int)
    precision = precision_score(label_int, pred_int, zero_division=0)
    recall = recall_score(label_int, pred_int, zero_division=0)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }


In [15]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# 定义CSV文件路径列表
csv_files = [
    'df_toxicgen1_train_clean.csv',
    'ds_ethics_commonsense_small_test_clean.csv',
    'ds_ethics_commonsense_small_train_clean.csv',
    'ds_ethics_commonsense_small_validation_clean.csv',
    'ds_ethics_commonsense_test_clean.csv',
    'ds_ethics_commonsense_train_clean.csv',
    'ds_ethics_commonsense_validation_clean.csv',
    'FinalBalancedDataset_Clean.csv',
    'HateSpeechDetection(1).csv',
    'HateSpeechDetection.csv',
    'HateSpeechDetection_Clean.csv',
    'HateSpeechDetection_simple_test_Clean.csv',
    'HateSpeechDetection_small_test_Clean.csv',
    'HateSpeechDetection_small_train_Clean.csv',
    'HateSpeechDetection_test_Clean.csv',
    'HateSpeechDetection_train_Clean.csv',
    'superset_simple_test.csv',
    'superset_test.csv',
    'superset_train.csv',
    'superset_train_small_test.csv',
    'superset_train_small_train.csv',
    'xstest_test_clean.csv',
    'xstest_train_clean.csv',
    'df_saladbench_attack_clean.csv',
    'df_saladbench_base_clean.csv',
    'df_saladbench_defense_clean.csv',
    'df_saladbench_ground_clean.csv',
    'df_tocxicchat1_simple_test_Clean.csv',
    'df_tocxicchat1_small_test_clean.csv',
    'df_tocxicchat1_small_train_clean.csv',
    'df_tocxicchat1_test_clean.csv',
    'df_tocxicchat1_train_clean.csv',
    'df_toxicgen1_simple_test_Clean.csv',
    'df_toxicgen1_small_test_clean.csv',
    'df_toxicgen1_small_train_clean.csv',
    'df_toxicgen1_test_clean.csv'
]

# 遍历每个CSV文件，分别进行训练和评估
results = []

In [18]:
for file_path in csv_files:
    print(f"Processing file: {file_path}")

    # 数据预处理
    dataset = preprocess_data(file_path)
    train_test_split = dataset.train_test_split(test_size=0.1)
    train_data = train_test_split['train']
    validation_data = train_test_split['test']

    # 加载模型到GPU
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    # 数据集格式转换为模型输入
    def tokenize_data(batch):
        try:
            # 为输入和目标标记化
            tokenized_input = tokenizer(batch['input_text'], padding='max_length', truncation=True, max_length=512)
            tokenized_label = tokenizer(batch['target_text'], padding='max_length', truncation=True, max_length=64)

            # 设置标签
            tokenized_input['labels'] = tokenized_label['input_ids']
            return tokenized_input
        except Exception as e:
            print(f"Error in tokenization: {e}")
            return None

    # 对训练和验证数据进行标记化处理，并过滤掉 None 样本
    train_data = train_data.map(tokenize_data, batched=True, remove_columns=train_data.column_names)
    validation_data = validation_data.map(tokenize_data, batched=True, remove_columns=validation_data.column_names)

    # 过滤掉 None 样本
    train_data = train_data.filter(lambda x: x is not None)
    validation_data = validation_data.filter(lambda x: x is not None)

    # 初始化Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_data,
        eval_dataset=validation_data,
    )

    # 训练模型
    try:
        trainer.train()
    except Exception as e:
        print(f"Error during training for file {file_path}: {e}")
        continue

    # 评估模型
    try:
        eval_results = trainer.evaluate()
    except Exception as e:
        print(f"Error during evaluation for file {file_path}: {e}")
        eval_results = {
            'eval_loss': None,
            'eval_accuracy': None,
            'eval_precision': None,
            'eval_recall': None,
            'eval_tp': None,
            'eval_tn': None,
            'eval_fp': None,
            'eval_fn': None
        }

    # 保存评估结果
    eval_results['file'] = file_path
    results.append(eval_results)

    # 打印结果
    print(f"Results for {file_path}:")
    print(eval_results)

# 记录所有文件的训练和评估结果
print("Final Results for All Files:")
for result in results:
    print(result)


Processing file: df_toxicgen1_train_clean.csv


Map:   0%|          | 0/8064 [00:00<?, ? examples/s]

Map:   0%|          | 0/896 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8064 [00:00<?, ? examples/s]

Filter:   0%|          | 0/896 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0123,0.009338,0.691964,0.711111,0.108475,32,588,13,263
2,0.0104,0.00824,0.722098,0.72549,0.250847,74,573,28,221
3,0.0082,0.007973,0.765625,0.647059,0.633898,187,499,102,108


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_toxicgen1_train_clean.csv:
{'eval_loss': 0.00797282438725233, 'eval_accuracy': 0.765625, 'eval_precision': 0.6470588235294118, 'eval_recall': 0.6338983050847458, 'eval_tp': 187, 'eval_tn': 499, 'eval_fp': 102, 'eval_fn': 108, 'eval_runtime': 23.037, 'eval_samples_per_second': 38.894, 'eval_steps_per_second': 4.862, 'epoch': 3.0, 'file': 'df_toxicgen1_train_clean.csv'}
Processing file: ds_ethics_commonsense_small_test_clean.csv


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,18.778057,1.0,1.0,1.0,1,1,0,0
2,No log,14.644943,1.0,1.0,1.0,1,1,0,0
3,No log,12.551426,1.0,1.0,1.0,1,1,0,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_small_test_clean.csv:
{'eval_loss': 12.55142593383789, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_tp': 1, 'eval_tn': 1, 'eval_fp': 0, 'eval_fn': 0, 'eval_runtime': 0.1197, 'eval_samples_per_second': 16.708, 'eval_steps_per_second': 8.354, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_small_test_clean.csv'}
Processing file: ds_ethics_commonsense_small_train_clean.csv


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,13.1285,8.305262,0.375,0.5,0.6,3,0,3,2
2,6.7724,2.822508,0.5,0.5,1.0,2,0,2,0
3,4.3346,1.373846,0.5,0.5,1.0,2,0,2,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_small_train_clean.csv:
{'eval_loss': 1.3738460540771484, 'eval_accuracy': 0.5, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_tp': 2, 'eval_tn': 0, 'eval_fp': 2, 'eval_fn': 0, 'eval_runtime': 1.8205, 'eval_samples_per_second': 5.493, 'eval_steps_per_second': 1.099, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_small_train_clean.csv'}
Processing file: ds_ethics_commonsense_small_validation_clean.csv


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,20.417028,0.0,0.0,0.0,0,0,1,1
2,No log,18.540455,0.5,0.5,1.0,1,0,1,0
3,No log,15.74999,0.5,0.5,1.0,1,0,1,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_small_validation_clean.csv:
{'eval_loss': 15.74998950958252, 'eval_accuracy': 0.5, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_tp': 1, 'eval_tn': 0, 'eval_fp': 1, 'eval_fn': 0, 'eval_runtime': 0.0969, 'eval_samples_per_second': 20.63, 'eval_steps_per_second': 10.315, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_small_validation_clean.csv'}
Processing file: ds_ethics_commonsense_test_clean.csv


Map:   0%|          | 0/3567 [00:00<?, ? examples/s]

Map:   0%|          | 0/397 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3567 [00:00<?, ? examples/s]

Filter:   0%|          | 0/397 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0174,0.011686,0.523929,0.520588,0.871921,177,31,163,26
2,0.013,0.011128,0.536524,0.528529,0.866995,176,37,157,27
3,0.0116,0.011228,0.536524,0.526171,0.940887,191,22,172,12


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_test_clean.csv:
{'eval_loss': 0.011127719655632973, 'eval_accuracy': 0.5365239294710328, 'eval_precision': 0.5285285285285285, 'eval_recall': 0.8669950738916257, 'eval_tp': 176, 'eval_tn': 37, 'eval_fp': 157, 'eval_fn': 27, 'eval_runtime': 10.2102, 'eval_samples_per_second': 38.883, 'eval_steps_per_second': 4.897, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_test_clean.csv'}
Processing file: ds_ethics_commonsense_train_clean.csv


Map:   0%|          | 0/12519 [00:00<?, ? examples/s]

Map:   0%|          | 0/1391 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12519 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1391 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0119,0.010601,0.581596,0.566836,0.506808,335,474,256,326
2,0.011,0.010093,0.626887,0.63197,0.514372,340,532,198,321
3,0.0104,0.009916,0.633357,0.648915,0.497731,329,552,178,332


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_train_clean.csv:
{'eval_loss': 0.009916276670992374, 'eval_accuracy': 0.6333572969086988, 'eval_precision': 0.6489151873767258, 'eval_recall': 0.4977307110438729, 'eval_tp': 329, 'eval_tn': 552, 'eval_fp': 178, 'eval_fn': 332, 'eval_runtime': 36.1512, 'eval_samples_per_second': 38.477, 'eval_steps_per_second': 4.813, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_train_clean.csv'}
Processing file: ds_ethics_commonsense_validation_clean.csv


Map:   0%|          | 0/3496 [00:00<?, ? examples/s]

Map:   0%|          | 0/389 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3496 [00:00<?, ? examples/s]

Filter:   0%|          | 0/389 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0158,0.011626,0.498715,0.484429,0.752688,140,54,149,46
2,0.0139,0.011042,0.524422,0.501754,0.768817,143,61,142,43
3,0.0121,0.010948,0.524422,0.502075,0.650538,121,83,120,65


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_validation_clean.csv:
{'eval_loss': 0.010948162525892258, 'eval_accuracy': 0.5244215938303342, 'eval_precision': 0.5020746887966805, 'eval_recall': 0.6505376344086021, 'eval_tp': 121, 'eval_tn': 83, 'eval_fp': 120, 'eval_fn': 65, 'eval_runtime': 10.0498, 'eval_samples_per_second': 38.707, 'eval_steps_per_second': 4.876, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_validation_clean.csv'}
Processing file: FinalBalancedDataset_Clean.csv


ValueError: Column for text input not found in FinalBalancedDataset_Clean.csv. Available columns are: Index(['tweet', 'Toxicity'], dtype='object')