In [1]:
!pip install -q mlflow rouge_score transformers datasets evaluate
!pip install evaluate



In [2]:
# Import necessary toolkits
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from datasets import Dataset
import pandas as pd
import numpy as np
import torch
from evaluate import load as load_metric  # 使用 evaluate 库来加载指标
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import os

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [20]:
def preprocess_data(csv_file):
    df = pd.read_csv(csv_file)
    data = []

    # 根据实际列名设置输入列和标签列
    if 'text' in df.columns:
        text_column = 'text'
    elif 'sentence' in df.columns:
        text_column = 'sentence'
    elif 'prompt' in df.columns:
        text_column = 'prompt'
    elif 'baseq' in df.columns:
        text_column = 'baseq'
    elif 'question' in df.columns:
        text_column = 'question'
    else:
        raise ValueError(f"Column for text input not found in {csv_file}. Available columns are: {df.columns}")

    if 'label' not in df.columns:
        raise ValueError(f"Column 'label' not found in {csv_file}. Available columns are: {df.columns}")

    # 将每一行转换为适合T5模型的输入格式
    for _, row in df.iterrows():
        text = row[text_column]
        label = str(row['label'])  # 将标签转换为字符串形式
        data.append({"X": text, "y": label})

    return pd.DataFrame(data)

In [4]:
# 数据集分割函数，返回训练、验证和测试集
def split_data(df, test_size=0.1, val_size=0.2, random_seed=42):
    # 随机化数据集并分割测试集
    df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
    test_split_index = int(len(df) * (1 - test_size))
    train_val_df, test_df = df[:test_split_index], df[test_split_index:]

    # 进一步分割训练集和验证集
    val_split_index = int(len(train_val_df) * (1 - val_size))
    train_df, val_df = train_val_df[:val_split_index], train_val_df[val_split_index:]

    # 分离 X 和 y
    X_train, y_train = train_df['X'].tolist(), train_df['y'].tolist()
    X_val, y_val = val_df['X'].tolist(), val_df['y'].tolist()
    X_test, y_test = test_df['X'].tolist(), test_df['y'].tolist()

    return X_train, X_val, X_test, y_train, y_val, y_test

# 数据集格式转换为 T5 模型输入
def prepare_dataset(X, y):
    data = [{'input_text': f"text: {text}", 'target_text': label} for text, label in zip(X, y)]
    return Dataset.from_pandas(pd.DataFrame(data))

In [5]:
BATCH_SIZE = 8
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True,
    fp16=True,
    generation_max_length=64,
    report_to="none",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)




In [6]:
accuracy_metric = load_metric("accuracy")

# 定义评估函数
def compute_metrics(pred):
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    labels_ids = pred.label_ids
    labels_ids = np.where(labels_ids == -100, tokenizer.pad_token_id, labels_ids)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    def str_to_int(label):
        if label.lower() == 'true':
            return 1
        elif label.lower() == 'false':
            return 0
        else:
            try:
                return int(label)
            except ValueError:
                return -1

    pred_int = [str_to_int(p) for p in pred_str]
    label_int = [str_to_int(l) for l in label_str]
    valid_indices = [i for i, (p, l) in enumerate(zip(pred_int, label_int)) if p != -1 and l != -1]
    pred_int = [pred_int[i] for i in valid_indices]
    label_int = [label_int[i] for i in valid_indices]

    if not pred_int or not label_int:
        return {
            "accuracy": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "tp": 0,
            "tn": 0,
            "fp": 0,
            "fn": 0
        }

    tn, fp, fn, tp = confusion_matrix(label_int, pred_int).ravel()
    accuracy = accuracy_score(label_int, pred_int)
    precision = precision_score(label_int, pred_int, zero_division=0)
    recall = recall_score(label_int, pred_int, zero_division=0)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
csv_files = [
    'df_toxicgen1_train_clean.csv',
    'ds_ethics_commonsense_small_test_clean.csv',
    'ds_ethics_commonsense_small_train_clean.csv',
    'ds_ethics_commonsense_small_validation_clean.csv',
    'ds_ethics_commonsense_test_clean.csv',
    'ds_ethics_commonsense_train_clean.csv',
    'ds_ethics_commonsense_validation_clean.csv',
    'HateSpeechDetection_simple_test_Clean.csv',
    'HateSpeechDetection_small_test_Clean.csv',
    'HateSpeechDetection_small_train_Clean.csv',
    'HateSpeechDetection_test_Clean.csv',
    'HateSpeechDetection_train_Clean.csv',
    'superset_simple_test.csv',
    'superset_test.csv',
    'superset_train.csv',
    'superset_train_small_test.csv',
    'superset_train_small_train.csv',
    'xstest_test_clean.csv',
    'xstest_train_clean.csv',
    'df_saladbench_attack_clean.csv',
    'df_saladbench_base_clean.csv',
    'df_saladbench_defense_clean.csv',
    'df_saladbench_ground_clean.csv',
    'df_tocxicchat1_simple_test_Clean.csv',
    'df_tocxicchat1_small_test_clean.csv',
    'df_tocxicchat1_small_train_clean.csv',
    'df_tocxicchat1_test_clean.csv',
    'df_tocxicchat1_train_clean.csv',
    'df_toxicgen1_simple_test_Clean.csv',
    'df_toxicgen1_small_test_clean.csv',
    'df_toxicgen1_small_train_clean.csv',
    'df_toxicgen1_test_clean.csv'
]

results_file = 'traintestperformances.csv'
results_list = []

In [21]:
for file_path in csv_files:
    print(f"Processing file: {file_path}")

    # 数据预处理并分割数据集
    df = preprocess_data(file_path)
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df)

    # 准备训练、验证和测试集的 Dataset
    train_data = prepare_dataset(X_train, y_train)
    val_data = prepare_dataset(X_val, y_val)
    test_data = prepare_dataset(X_test, y_test)

    # 加载模型到GPU
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    # 模型结构摘要
    print(f"Model Summary for {file_path}:")
    num_params = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters: {num_params}")
    model_summary_info = {
        'file': file_path,
        'model': model_name,
        'num_parameters': num_params
    }

    # 数据集格式转换为模型输入
    def tokenize_data(batch):
        try:
            tokenized_input = tokenizer(batch['input_text'], padding='max_length', truncation=True, max_length=512)
            tokenized_label = tokenizer(batch['target_text'], padding='max_length', truncation=True, max_length=64)
            tokenized_input['labels'] = tokenized_label['input_ids']
            return tokenized_input
        except Exception as e:
            print(f"Error in tokenization: {e}")
            return None

    # 对训练和验证数据进行标记化处理
    train_data = train_data.map(tokenize_data, batched=True, remove_columns=train_data.column_names)
    val_data = val_data.map(tokenize_data, batched=True, remove_columns=val_data.column_names)

    # 初始化Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_data,
        eval_dataset=val_data,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
    )

    # 训练模型
    try:
        trainer.train()
    except Exception as e:
        print(f"Error during training for file {file_path}: {e}")
        continue

    # 评估模型
    try:
        eval_results = trainer.evaluate()
    except Exception as e:
        print(f"Error during evaluation for file {file_path}: {e}")
        eval_results = {
            'eval_loss': None,
            'eval_accuracy': None,
            'eval_precision': None,
            'eval_recall': None,
            'eval_tp': None,
            'eval_tn': None,
            'eval_fp': None,
            'eval_fn': None
        }

    # 保存评估结果
    eval_results['file'] = file_path
    eval_results.update(model_summary_info)
    results_list.append(eval_results)

    # 打印结果
    print(f"Results for {file_path}:")
    print(eval_results)

# 将所有结果保存到CSV文件中
results_df = pd.DataFrame(results_list)
results_df.to_csv(results_file, index=False)

print("Final Results for All Files:")
print(results_df)

Processing file: df_saladbench_attack_clean.csv
Model Summary for df_saladbench_attack_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss




Error during training for file df_saladbench_attack_clean.csv: not enough values to unpack (expected 4, got 1)
Processing file: df_saladbench_base_clean.csv
Model Summary for df_saladbench_base_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/15348 [00:00<?, ? examples/s]

Map:   0%|          | 0/3838 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss




Error during training for file df_saladbench_base_clean.csv: not enough values to unpack (expected 4, got 1)
Processing file: df_saladbench_defense_clean.csv
Model Summary for df_saladbench_defense_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss


Error during training for file df_saladbench_defense_clean.csv: piece id is out of range.
Processing file: df_saladbench_ground_clean.csv
Model Summary for df_saladbench_ground_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/1382 [00:00<?, ? examples/s]

Map:   0%|          | 0/346 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss




Error during training for file df_saladbench_ground_clean.csv: not enough values to unpack (expected 4, got 1)
Processing file: df_tocxicchat1_simple_test_Clean.csv
Model Summary for df_tocxicchat1_simple_test_Clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss


Error during training for file df_tocxicchat1_simple_test_Clean.csv: piece id is out of range.
Processing file: df_tocxicchat1_small_test_clean.csv
Model Summary for df_tocxicchat1_small_test_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,17.904057,0.75,1.0,0.5,1,2,0,1
2,No log,16.746147,0.75,1.0,0.5,1,2,0,1
3,No log,15.768224,0.75,1.0,0.5,1,2,0,1


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_tocxicchat1_small_test_clean.csv:
{'eval_loss': 15.768223762512207, 'eval_accuracy': 0.75, 'eval_precision': 1.0, 'eval_recall': 0.5, 'eval_tp': 1, 'eval_tn': 2, 'eval_fp': 0, 'eval_fn': 1, 'eval_runtime': 0.1498, 'eval_samples_per_second': 26.701, 'eval_steps_per_second': 6.675, 'epoch': 3.0, 'file': 'df_tocxicchat1_small_test_clean.csv', 'model': 't5-small', 'num_parameters': 60506624}
Processing file: df_tocxicchat1_small_train_clean.csv
Model Summary for df_tocxicchat1_small_train_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss


Error during training for file df_tocxicchat1_small_train_clean.csv: piece id is out of range.
Processing file: df_tocxicchat1_test_clean.csv
Model Summary for df_tocxicchat1_test_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/732 [00:00<?, ? examples/s]

Map:   0%|          | 0/183 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.3305,0.174126,0.0,0.0,0.0,0,0,0,0
2,0.1018,0.027759,0.948718,0.0,0.0,0,37,0,2


Error during training for file df_tocxicchat1_test_clean.csv: piece id is out of range.
Processing file: df_tocxicchat1_train_clean.csv
Model Summary for df_tocxicchat1_train_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/2926 [00:00<?, ? examples/s]

Map:   0%|          | 0/732 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0089,0.008396,0.913934,0.0,0.0,0,669,0,63
2,0.0073,0.005103,0.913934,0.0,0.0,0,669,0,63
3,0.0059,0.004679,0.913934,0.0,0.0,0,669,0,63


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_tocxicchat1_train_clean.csv:
{'eval_loss': 0.004678900353610516, 'eval_accuracy': 0.9139344262295082, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 669, 'eval_fp': 0, 'eval_fn': 63, 'eval_runtime': 18.7713, 'eval_samples_per_second': 38.996, 'eval_steps_per_second': 4.901, 'epoch': 3.0, 'file': 'df_tocxicchat1_train_clean.csv', 'model': 't5-small', 'num_parameters': 60506624}
Processing file: df_toxicgen1_simple_test_Clean.csv
Model Summary for df_toxicgen1_simple_test_Clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,11.628146,0.444444,0.363636,0.571429,4,4,7,3
2,12.152500,5.696908,0.444444,0.363636,0.571429,4,4,7,3


Error during training for file df_toxicgen1_simple_test_Clean.csv: piece id is out of range.
Processing file: df_toxicgen1_small_test_clean.csv
Model Summary for df_toxicgen1_small_test_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,21.289383,0.25,0.333333,0.5,1,0,2,1
2,No log,18.237701,0.25,0.333333,0.5,1,0,2,1
3,No log,15.96377,0.25,0.333333,0.5,1,0,2,1


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_toxicgen1_small_test_clean.csv:
{'eval_loss': 15.963769912719727, 'eval_accuracy': 0.25, 'eval_precision': 0.3333333333333333, 'eval_recall': 0.5, 'eval_tp': 1, 'eval_tn': 0, 'eval_fp': 2, 'eval_fn': 1, 'eval_runtime': 0.1733, 'eval_samples_per_second': 23.078, 'eval_steps_per_second': 5.77, 'epoch': 3.0, 'file': 'df_toxicgen1_small_test_clean.csv', 'model': 't5-small', 'num_parameters': 60506624}
Processing file: df_toxicgen1_small_train_clean.csv
Model Summary for df_toxicgen1_small_train_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,16.173279,0.666667,0.666667,0.75,6,4,3,2
2,13.718200,7.176066,0.533333,0.555556,0.625,5,3,4,3
3,8.259100,4.225649,0.6,0.6,0.75,6,3,4,2


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_toxicgen1_small_train_clean.csv:
{'eval_loss': 4.225649356842041, 'eval_accuracy': 0.6, 'eval_precision': 0.6, 'eval_recall': 0.75, 'eval_tp': 6, 'eval_tn': 3, 'eval_fp': 4, 'eval_fn': 2, 'eval_runtime': 0.4196, 'eval_samples_per_second': 35.749, 'eval_steps_per_second': 4.767, 'epoch': 3.0, 'file': 'df_toxicgen1_small_train_clean.csv', 'model': 't5-small', 'num_parameters': 60506624}
Processing file: df_toxicgen1_test_clean.csv
Model Summary for df_toxicgen1_test_clean.csv:
Total number of parameters: 60506624


Map:   0%|          | 0/676 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.3678,0.230089,0.0,0.0,0.0,0,0,0,0
2,0.1842,0.117922,0.0,0.0,0.0,0,0,0,0
3,0.1153,0.062851,0.0,0.0,0.0,0,0,0,1


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_toxicgen1_test_clean.csv:
{'eval_loss': 0.06285139173269272, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 0, 'eval_fp': 0, 'eval_fn': 1, 'eval_runtime': 20.5241, 'eval_samples_per_second': 8.283, 'eval_steps_per_second': 1.072, 'epoch': 3.0, 'file': 'df_toxicgen1_test_clean.csv', 'model': 't5-small', 'num_parameters': 60506624}
Final Results for All Files:
   eval_loss  eval_accuracy  eval_precision  eval_recall  eval_tp  eval_tn  \
0  15.768224       0.750000        1.000000         0.50        1        2   
1   0.004679       0.913934        0.000000         0.00        0      669   
2  15.963770       0.250000        0.333333         0.50        1        0   
3   4.225649       0.600000        0.600000         0.75        6        3   
4   0.062851       0.000000        0.000000         0.00        0        0   

   eval_fp  eval_fn  eval_runtime  eval_samples_per_second  \
0        0        1        0.1498                 