In [1]:
# 安装必要的依赖库
!pip install -q mlflow rouge_score transformers datasets evaluate
!pip install evaluate


# Import necessary toolkits
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from datasets import Dataset
import pandas as pd
import numpy as np
import torch
from evaluate import load as load_metric  # 使用 evaluate 库来加载指标
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import os

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [2]:
def preprocess_data(csv_file):
    df = pd.read_csv(csv_file)
    data = []

    # 根据实际列名设置输入列和标签列
    if 'text' in df.columns:
        text_column = 'text'
    elif 'sentence' in df.columns:
        text_column = 'sentence'
    elif 'prompt' in df.columns:
        text_column = 'prompt'
    elif 'baseq' in df.columns:
        text_column = 'baseq'
    elif 'question' in df.columns:
        text_column = 'question'
    else:
        raise ValueError(f"Column for text input not found in {csv_file}. Available columns are: {df.columns}")

    if 'label' not in df.columns:
        raise ValueError(f"Column 'label' not found in {csv_file}. Available columns are: {df.columns}")

    # 将每一行转换为适合T5模型的输入格式
    for _, row in df.iterrows():
        text = f"text: {row[text_column]}"  # 去掉 </s> 标记
        label = str(row['label'])  # 将标签转换为字符串形式
        data.append({"input_text": text, "target_text": label})

    return Dataset.from_pandas(pd.DataFrame(data))

In [3]:
BATCH_SIZE = 8
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True,  # 仅保留一个 load_best_model_at_end 参数
    fp16=True,
    generation_max_length=64,
    report_to="none",  # 设置早停
    metric_for_best_model="eval_loss",  # 评估指标为eval_loss
    greater_is_better=False,  # eval_loss越低越好
)



In [4]:
# 加载分类评估指标（例如 accuracy）
accuracy_metric = load_metric("accuracy")

# 定义评估函数
def compute_metrics(pred):
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    labels_ids = pred.label_ids
    labels_ids = np.where(labels_ids == -100, tokenizer.pad_token_id, labels_ids)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # 将字符串 'True'/'False' 转换为整数 1/0
    def str_to_int(label):
        if label.lower() == 'true':
            return 1
        elif label.lower() == 'false':
            return 0
        else:
            # 如果不是 'True' 或 'False'，尝试直接转换为整数
            try:
                return int(label)
            except ValueError:
                return -1  # 返回 -1 以便识别出无效值

    # 转换预测和标签为整数形式
    pred_int = [str_to_int(p) for p in pred_str]
    label_int = [str_to_int(l) for l in label_str]

    # 过滤掉无效的值
    valid_indices = [i for i, (p, l) in enumerate(zip(pred_int, label_int)) if p != -1 and l != -1]
    pred_int = [pred_int[i] for i in valid_indices]
    label_int = [label_int[i] for i in valid_indices]

    if not pred_int or not label_int:
        # 如果转换后没有有效的数据，返回空结果
        return {
            "accuracy": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "tp": 0,
            "tn": 0,
            "fp": 0,
            "fn": 0
        }

    # 计算混淆矩阵和分类指标
    tn, fp, fn, tp = confusion_matrix(label_int, pred_int).ravel()
    accuracy = accuracy_score(label_int, pred_int)
    precision = precision_score(label_int, pred_int, zero_division=0)
    recall = recall_score(label_int, pred_int, zero_division=0)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# 定义CSV文件路径列表
csv_files = [
    'df_toxicgen1_train_clean.csv',
    'ds_ethics_commonsense_small_test_clean.csv',
    'ds_ethics_commonsense_small_train_clean.csv',
    'ds_ethics_commonsense_small_validation_clean.csv',
    'ds_ethics_commonsense_test_clean.csv',
    'ds_ethics_commonsense_train_clean.csv',
    'ds_ethics_commonsense_validation_clean.csv',
    'HateSpeechDetection_Clean.csv',
    'HateSpeechDetection_simple_test_Clean.csv',
    'HateSpeechDetection_small_test_Clean.csv',
    'HateSpeechDetection_small_train_Clean.csv',
    'HateSpeechDetection_test_Clean.csv',
    'HateSpeechDetection_train_Clean.csv',
    'superset_simple_test.csv',
    'superset_test.csv',
    'superset_train.csv',
    'superset_train_small_test.csv',
    'superset_train_small_train.csv',
    'xstest_test_clean.csv',
    'xstest_train_clean.csv',
    'df_saladbench_attack_clean.csv',
    'df_saladbench_base_clean.csv',
    'df_saladbench_defense_clean.csv',
    'df_saladbench_ground_clean.csv',
    'df_tocxicchat1_simple_test_Clean.csv',
    'df_tocxicchat1_small_test_clean.csv',
    'df_tocxicchat1_small_train_clean.csv',
    'df_tocxicchat1_test_clean.csv',
    'df_tocxicchat1_train_clean.csv',
    'df_toxicgen1_simple_test_Clean.csv',
    'df_toxicgen1_small_test_clean.csv',
    'df_toxicgen1_small_train_clean.csv',
    'df_toxicgen1_test_clean.csv'
]

# 遍历每个CSV文件，分别进行训练和评估
results = []

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
for file_path in csv_files:
    print(f"Processing file: {file_path}")

    # 数据预处理
    dataset = preprocess_data(file_path)
    train_test_split = dataset.train_test_split(test_size=0.1)
    train_data = train_test_split['train']
    validation_data = train_test_split['test']

    # 加载模型到GPU
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    # 数据集格式转换为模型输入
    def tokenize_data(batch):
        try:
            # 为输入和目标标记化
            tokenized_input = tokenizer(batch['input_text'], padding='max_length', truncation=True, max_length=512)
            tokenized_label = tokenizer(batch['target_text'], padding='max_length', truncation=True, max_length=64)

            # 设置标签
            tokenized_input['labels'] = tokenized_label['input_ids']
            return tokenized_input
        except Exception as e:
            print(f"Error in tokenization: {e}")
            return None

    # 对训练和验证数据进行标记化处理，并过滤掉 None 样本
    train_data = train_data.map(tokenize_data, batched=True, remove_columns=train_data.column_names)
    validation_data = validation_data.map(tokenize_data, batched=True, remove_columns=validation_data.column_names)

    # 过滤掉 None 样本
    train_data = train_data.filter(lambda x: x is not None)
    validation_data = validation_data.filter(lambda x: x is not None)

    # 初始化Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_data,
        eval_dataset=validation_data,
    )

    # 训练模型
    try:
        trainer.train()
    except Exception as e:
        print(f"Error during training for file {file_path}: {e}")
        continue

    # 评估模型
    try:
        eval_results = trainer.evaluate()
    except Exception as e:
        print(f"Error during evaluation for file {file_path}: {e}")
        eval_results = {
            'eval_loss': None,
            'eval_accuracy': None,
            'eval_precision': None,
            'eval_recall': None,
            'eval_tp': None,
            'eval_tn': None,
            'eval_fp': None,
            'eval_fn': None
        }

    # 保存评估结果
    eval_results['file'] = file_path
    results.append(eval_results)

    # 打印结果
    print(f"Results for {file_path}:")
    print(eval_results)

# 记录所有文件的训练和评估结果
print("Final Results for All Files:")
for result in results:
    print(result)


Processing file: df_toxicgen1_train_clean.csv


Map:   0%|          | 0/8064 [00:00<?, ? examples/s]

Map:   0%|          | 0/896 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8064 [00:00<?, ? examples/s]

Filter:   0%|          | 0/896 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0131,0.009304,0.700893,0.6,0.208333,60,568,40,228
2,0.0105,0.008121,0.74442,0.649746,0.444444,128,539,69,160
3,0.0078,0.007822,0.746652,0.621514,0.541667,156,513,95,132


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_toxicgen1_train_clean.csv:
{'eval_loss': 0.007821546867489815, 'eval_accuracy': 0.7466517857142857, 'eval_precision': 0.6215139442231076, 'eval_recall': 0.5416666666666666, 'eval_tp': 156, 'eval_tn': 513, 'eval_fp': 95, 'eval_fn': 132, 'eval_runtime': 23.0916, 'eval_samples_per_second': 38.802, 'eval_steps_per_second': 4.85, 'epoch': 3.0, 'file': 'df_toxicgen1_train_clean.csv'}
Processing file: ds_ethics_commonsense_small_test_clean.csv


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,18.778057,1.0,1.0,1.0,1,1,0,0
2,No log,14.644943,1.0,1.0,1.0,1,1,0,0
3,No log,12.551426,1.0,1.0,1.0,1,1,0,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_small_test_clean.csv:
{'eval_loss': 12.55142593383789, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_tp': 1, 'eval_tn': 1, 'eval_fp': 0, 'eval_fn': 0, 'eval_runtime': 0.1159, 'eval_samples_per_second': 17.26, 'eval_steps_per_second': 8.63, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_small_test_clean.csv'}
Processing file: ds_ethics_commonsense_small_train_clean.csv


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,13.1285,8.305262,0.375,0.5,0.6,3,0,3,2
2,6.7724,2.822508,0.5,0.5,1.0,2,0,2,0
3,4.3346,1.373846,0.5,0.5,1.0,2,0,2,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_small_train_clean.csv:
{'eval_loss': 1.3738460540771484, 'eval_accuracy': 0.5, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_tp': 2, 'eval_tn': 0, 'eval_fp': 2, 'eval_fn': 0, 'eval_runtime': 1.908, 'eval_samples_per_second': 5.241, 'eval_steps_per_second': 1.048, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_small_train_clean.csv'}
Processing file: ds_ethics_commonsense_small_validation_clean.csv


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,20.417028,0.0,0.0,0.0,0,0,1,1
2,No log,18.540455,0.5,0.5,1.0,1,0,1,0
3,No log,15.74999,0.5,0.5,1.0,1,0,1,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_small_validation_clean.csv:
{'eval_loss': 15.74998950958252, 'eval_accuracy': 0.5, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_tp': 1, 'eval_tn': 0, 'eval_fp': 1, 'eval_fn': 0, 'eval_runtime': 0.0977, 'eval_samples_per_second': 20.47, 'eval_steps_per_second': 10.235, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_small_validation_clean.csv'}
Processing file: ds_ethics_commonsense_test_clean.csv


Map:   0%|          | 0/3567 [00:00<?, ? examples/s]

Map:   0%|          | 0/397 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3567 [00:00<?, ? examples/s]

Filter:   0%|          | 0/397 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0174,0.011686,0.523929,0.520588,0.871921,177,31,163,26
2,0.013,0.011128,0.536524,0.528529,0.866995,176,37,157,27
3,0.0116,0.011228,0.536524,0.526171,0.940887,191,22,172,12


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_test_clean.csv:
{'eval_loss': 0.011127719655632973, 'eval_accuracy': 0.5365239294710328, 'eval_precision': 0.5285285285285285, 'eval_recall': 0.8669950738916257, 'eval_tp': 176, 'eval_tn': 37, 'eval_fp': 157, 'eval_fn': 27, 'eval_runtime': 10.2369, 'eval_samples_per_second': 38.781, 'eval_steps_per_second': 4.884, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_test_clean.csv'}
Processing file: ds_ethics_commonsense_train_clean.csv


Map:   0%|          | 0/12519 [00:00<?, ? examples/s]

Map:   0%|          | 0/1391 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12519 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1391 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0119,0.010601,0.581596,0.566836,0.506808,335,474,256,326
2,0.011,0.010093,0.626887,0.63197,0.514372,340,532,198,321
3,0.0104,0.009916,0.633357,0.648915,0.497731,329,552,178,332


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_train_clean.csv:
{'eval_loss': 0.009916276670992374, 'eval_accuracy': 0.6333572969086988, 'eval_precision': 0.6489151873767258, 'eval_recall': 0.4977307110438729, 'eval_tp': 329, 'eval_tn': 552, 'eval_fp': 178, 'eval_fn': 332, 'eval_runtime': 36.2638, 'eval_samples_per_second': 38.358, 'eval_steps_per_second': 4.798, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_train_clean.csv'}
Processing file: ds_ethics_commonsense_validation_clean.csv


Map:   0%|          | 0/3496 [00:00<?, ? examples/s]

Map:   0%|          | 0/389 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3496 [00:00<?, ? examples/s]

Filter:   0%|          | 0/389 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0158,0.011626,0.498715,0.484429,0.752688,140,54,149,46
2,0.0139,0.011042,0.524422,0.501754,0.768817,143,61,142,43
3,0.0121,0.010948,0.524422,0.502075,0.650538,121,83,120,65


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for ds_ethics_commonsense_validation_clean.csv:
{'eval_loss': 0.010948162525892258, 'eval_accuracy': 0.5244215938303342, 'eval_precision': 0.5020746887966805, 'eval_recall': 0.6505376344086021, 'eval_tp': 121, 'eval_tn': 83, 'eval_fp': 120, 'eval_fn': 65, 'eval_runtime': 10.1659, 'eval_samples_per_second': 38.265, 'eval_steps_per_second': 4.82, 'epoch': 3.0, 'file': 'ds_ethics_commonsense_validation_clean.csv'}
Processing file: HateSpeechDetection_Clean.csv


Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2700 [00:00<?, ? examples/s]

Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0307,0.012829,0.76,0.0,0.0,0,228,0,72
2,0.0135,0.00877,0.76,0.0,0.0,0,228,0,72
3,0.01,0.008302,0.76,0.0,0.0,0,228,0,72


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for HateSpeechDetection_Clean.csv:
{'eval_loss': 0.00830227229744196, 'eval_accuracy': 0.76, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 228, 'eval_fp': 0, 'eval_fn': 72, 'eval_runtime': 7.8638, 'eval_samples_per_second': 38.15, 'eval_steps_per_second': 4.832, 'epoch': 3.0, 'file': 'HateSpeechDetection_Clean.csv'}
Processing file: HateSpeechDetection_simple_test_Clean.csv


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,12.3644,9.301262,0.6,0.5,1.0,4,2,4,0


Error during training for file HateSpeechDetection_simple_test_Clean.csv: piece id is out of range.
Processing file: HateSpeechDetection_small_test_Clean.csv


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,17.063335,0.0,0.0,0.0,0,0,1,1
2,No log,13.476431,0.0,0.0,0.0,0,0,1,1
3,No log,11.975052,0.0,0.0,0.0,0,0,1,1


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for HateSpeechDetection_small_test_Clean.csv:
{'eval_loss': 11.975051879882812, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 0, 'eval_fp': 1, 'eval_fn': 1, 'eval_runtime': 0.1216, 'eval_samples_per_second': 16.442, 'eval_steps_per_second': 8.221, 'epoch': 3.0, 'file': 'HateSpeechDetection_small_test_Clean.csv'}
Processing file: HateSpeechDetection_small_train_Clean.csv


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Filter:   0%|          | 0/72 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,13.417977,0.75,0.666667,0.666667,2,4,1,1
2,12.857300,7.063251,0.75,0.6,1.0,3,3,2,0
3,7.142400,4.062462,0.625,0.5,1.0,3,2,3,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for HateSpeechDetection_small_train_Clean.csv:
{'eval_loss': 4.062461853027344, 'eval_accuracy': 0.625, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_tp': 3, 'eval_tn': 2, 'eval_fp': 3, 'eval_fn': 0, 'eval_runtime': 0.2203, 'eval_samples_per_second': 36.316, 'eval_steps_per_second': 4.54, 'epoch': 3.0, 'file': 'HateSpeechDetection_small_train_Clean.csv'}
Processing file: HateSpeechDetection_test_Clean.csv


Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Filter:   0%|          | 0/540 [00:00<?, ? examples/s]

Filter:   0%|          | 0/60 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.6891,0.247775,0.0,0.0,0.0,0,0,0,0
2,0.2363,0.140362,0.0,0.0,0.0,0,0,0,0
3,0.1649,0.097679,0.0,0.0,0.0,0,0,0,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for HateSpeechDetection_test_Clean.csv:
{'eval_loss': 0.09767899662256241, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 0, 'eval_fp': 0, 'eval_fn': 0, 'eval_runtime': 7.7467, 'eval_samples_per_second': 7.745, 'eval_steps_per_second': 1.033, 'epoch': 3.0, 'file': 'HateSpeechDetection_test_Clean.csv'}
Processing file: HateSpeechDetection_train_Clean.csv


Map:   0%|          | 0/2160 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2160 [00:00<?, ? examples/s]

Filter:   0%|          | 0/240 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0251,0.016207,0.775,0.0,0.0,0,186,0,54
2,0.01,0.009257,0.775,0.0,0.0,0,186,0,54
3,0.0127,0.008493,0.775,0.0,0.0,0,186,0,54


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for HateSpeechDetection_train_Clean.csv:
{'eval_loss': 0.008492523804306984, 'eval_accuracy': 0.775, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 186, 'eval_fp': 0, 'eval_fn': 54, 'eval_runtime': 6.2923, 'eval_samples_per_second': 38.142, 'eval_steps_per_second': 4.768, 'epoch': 3.0, 'file': 'HateSpeechDetection_train_Clean.csv'}
Processing file: superset_simple_test.csv


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss


Error during training for file superset_simple_test.csv: piece id is out of range.
Processing file: superset_test.csv


Map:   0%|          | 0/5949 [00:00<?, ? examples/s]

Map:   0%|          | 0/662 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5949 [00:00<?, ? examples/s]

Filter:   0%|          | 0/662 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0114,0.010582,0.587613,0.507317,0.376812,104,285,101,172
2,0.0114,0.0102,0.58006,0.49537,0.387681,107,277,109,169
3,0.0117,0.010082,0.616314,0.538732,0.554348,153,255,131,123


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for superset_test.csv:
{'eval_loss': 0.01008231844753027, 'eval_accuracy': 0.6163141993957704, 'eval_precision': 0.5387323943661971, 'eval_recall': 0.5543478260869565, 'eval_tp': 153, 'eval_tn': 255, 'eval_fp': 131, 'eval_fn': 123, 'eval_runtime': 17.2614, 'eval_samples_per_second': 38.351, 'eval_steps_per_second': 4.808, 'epoch': 3.0, 'file': 'superset_test.csv'}
Processing file: superset_train.csv


Map:   0%|          | 0/26725 [00:00<?, ? examples/s]

Map:   0%|          | 0/2970 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26725 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2970 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0101,0.008708,0.707071,0.605033,0.520716,553,1547,361,509
2,0.0093,0.007931,0.746801,0.617424,0.76742,815,1403,505,247
3,0.0064,0.007572,0.762963,0.647204,0.741055,787,1479,429,275


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for superset_train.csv:
{'eval_loss': 0.007572422735393047, 'eval_accuracy': 0.762962962962963, 'eval_precision': 0.647203947368421, 'eval_recall': 0.7410546139359698, 'eval_tp': 787, 'eval_tn': 1479, 'eval_fp': 429, 'eval_fn': 275, 'eval_runtime': 77.2963, 'eval_samples_per_second': 38.424, 'eval_steps_per_second': 4.813, 'epoch': 3.0, 'file': 'superset_train.csv'}
Processing file: superset_train_small_test.csv


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,22.049891,0.5,1.0,0.5,1,0,0,1
2,No log,19.709787,0.5,1.0,0.5,1,0,0,1
3,No log,16.512384,0.5,1.0,0.5,1,0,0,1


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for superset_train_small_test.csv:
{'eval_loss': 16.51238441467285, 'eval_accuracy': 0.5, 'eval_precision': 1.0, 'eval_recall': 0.5, 'eval_tp': 1, 'eval_tn': 0, 'eval_fp': 0, 'eval_fn': 1, 'eval_runtime': 0.1234, 'eval_samples_per_second': 16.21, 'eval_steps_per_second': 8.105, 'epoch': 3.0, 'file': 'superset_train_small_test.csv'}
Processing file: superset_train_small_train.csv


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Filter:   0%|          | 0/72 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,11.591455,0.375,0.428571,0.75,3,0,4,1
2,12.680300,5.950215,0.333333,0.4,0.666667,2,0,3,1
3,8.215200,3.864752,0.428571,0.5,0.75,3,0,3,1


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for superset_train_small_train.csv:
{'eval_loss': 3.8647515773773193, 'eval_accuracy': 0.42857142857142855, 'eval_precision': 0.5, 'eval_recall': 0.75, 'eval_tp': 3, 'eval_tn': 0, 'eval_fp': 3, 'eval_fn': 1, 'eval_runtime': 1.0269, 'eval_samples_per_second': 7.79, 'eval_steps_per_second': 0.974, 'epoch': 3.0, 'file': 'superset_train_small_train.csv'}
Processing file: xstest_test_clean.csv


Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Filter:   0%|          | 0/81 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,12.8175,9.351216,0.666667,0.666667,0.8,4,2,2,1


Error during training for file xstest_test_clean.csv: piece id is out of range.
Processing file: xstest_train_clean.csv


Map:   0%|          | 0/324 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Filter:   0%|          | 0/324 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,1.5585,0.236752,0.0,0.0,0.0,0,0,0,0
2,0.5368,0.267337,0.0,0.0,0.0,0,0,0,0
3,0.4153,0.251406,0.0,0.0,0.0,0,0,0,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for xstest_train_clean.csv:
{'eval_loss': 0.2367524802684784, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 0, 'eval_fp': 0, 'eval_fn': 0, 'eval_runtime': 4.785, 'eval_samples_per_second': 7.524, 'eval_steps_per_second': 1.045, 'epoch': 3.0, 'file': 'xstest_train_clean.csv'}
Processing file: df_saladbench_attack_clean.csv


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss




Error during training for file df_saladbench_attack_clean.csv: not enough values to unpack (expected 4, got 1)
Processing file: df_saladbench_base_clean.csv


Map:   0%|          | 0/19186 [00:00<?, ? examples/s]

Map:   0%|          | 0/2132 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19186 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2132 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss




Error during training for file df_saladbench_base_clean.csv: not enough values to unpack (expected 4, got 1)
Processing file: df_saladbench_defense_clean.csv


Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Filter:   0%|          | 0/180 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,5.3857,0.419003,0.0,0.0,0.0,0,0,0,0
2,1.5448,0.270498,0.0,0.0,0.0,0,0,0,0
3,0.9958,0.277718,0.0,0.0,0.0,0,0,0,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_saladbench_defense_clean.csv:
{'eval_loss': 0.27049779891967773, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 0, 'eval_fp': 0, 'eval_fn': 0, 'eval_runtime': 2.8926, 'eval_samples_per_second': 6.914, 'eval_steps_per_second': 1.037, 'epoch': 3.0, 'file': 'df_saladbench_defense_clean.csv'}
Processing file: df_saladbench_ground_clean.csv


Map:   0%|          | 0/1728 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1728 [00:00<?, ? examples/s]

Filter:   0%|          | 0/192 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss




Error during training for file df_saladbench_ground_clean.csv: not enough values to unpack (expected 4, got 1)
Processing file: df_tocxicchat1_simple_test_Clean.csv


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,14.3665,9.755693,0.5,0.2,0.5,1,4,4,1
2,7.5564,2.600276,0.5,0.333333,1.0,2,2,4,0
3,4.0669,0.826344,0.0,0.0,0.0,0,0,2,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_tocxicchat1_simple_test_Clean.csv:
{'eval_loss': 0.82634437084198, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 0, 'eval_fp': 2, 'eval_fn': 0, 'eval_runtime': 1.8331, 'eval_samples_per_second': 5.455, 'eval_steps_per_second': 1.091, 'epoch': 3.0, 'file': 'df_tocxicchat1_simple_test_Clean.csv'}
Processing file: df_tocxicchat1_small_test_clean.csv


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,20.452438,0.5,0.5,1.0,1,0,1,0
2,No log,16.813028,0.5,0.5,1.0,1,0,1,0
3,No log,14.865689,0.0,0.0,0.0,0,0,1,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_tocxicchat1_small_test_clean.csv:
{'eval_loss': 14.865689277648926, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 0, 'eval_fp': 1, 'eval_fn': 0, 'eval_runtime': 0.5378, 'eval_samples_per_second': 3.719, 'eval_steps_per_second': 1.859, 'epoch': 3.0, 'file': 'df_tocxicchat1_small_test_clean.csv'}
Processing file: df_tocxicchat1_small_train_clean.csv


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Filter:   0%|          | 0/72 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,13.466908,0.75,0.6,1.0,3,3,2,0
2,13.562200,6.51639,0.714286,0.6,1.0,3,2,2,0
3,7.830600,3.632258,0.571429,0.5,1.0,3,1,3,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_tocxicchat1_small_train_clean.csv:
{'eval_loss': 3.632258415222168, 'eval_accuracy': 0.5714285714285714, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_tp': 3, 'eval_tn': 1, 'eval_fp': 3, 'eval_fn': 0, 'eval_runtime': 1.0224, 'eval_samples_per_second': 7.825, 'eval_steps_per_second': 0.978, 'epoch': 3.0, 'file': 'df_tocxicchat1_small_train_clean.csv'}
Processing file: df_tocxicchat1_test_clean.csv


Map:   0%|          | 0/915 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Filter:   0%|          | 0/915 [00:00<?, ? examples/s]

Filter:   0%|          | 0/102 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.2507,0.141497,0.0,0.0,0.0,0,0,0,0


Error during training for file df_tocxicchat1_test_clean.csv: piece id is out of range.
Processing file: df_tocxicchat1_train_clean.csv


Map:   0%|          | 0/3658 [00:00<?, ? examples/s]

Map:   0%|          | 0/407 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3658 [00:00<?, ? examples/s]

Filter:   0%|          | 0/407 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.0144,0.005851,0.92629,0.0,0.0,0,377,0,30
2,0.0066,0.00415,0.92629,0.0,0.0,0,377,0,30
3,0.0069,0.003886,0.92629,0.0,0.0,0,377,0,30


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_tocxicchat1_train_clean.csv:
{'eval_loss': 0.003885679878294468, 'eval_accuracy': 0.9262899262899262, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 377, 'eval_fp': 0, 'eval_fn': 30, 'eval_runtime': 10.5766, 'eval_samples_per_second': 38.481, 'eval_steps_per_second': 4.822, 'epoch': 3.0, 'file': 'df_tocxicchat1_train_clean.csv'}
Processing file: df_toxicgen1_simple_test_Clean.csv


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,12.92,8.974955,0.2,0.285714,0.4,2,0,5,3
2,6.3451,2.688603,0.4,0.444444,0.8,4,0,5,1
3,3.6022,0.842851,0.333333,0.333333,1.0,1,0,2,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_toxicgen1_simple_test_Clean.csv:
{'eval_loss': 0.8428505659103394, 'eval_accuracy': 0.3333333333333333, 'eval_precision': 0.3333333333333333, 'eval_recall': 1.0, 'eval_tp': 1, 'eval_tn': 0, 'eval_fp': 2, 'eval_fn': 0, 'eval_runtime': 1.857, 'eval_samples_per_second': 5.385, 'eval_steps_per_second': 1.077, 'epoch': 3.0, 'file': 'df_toxicgen1_simple_test_Clean.csv'}
Processing file: df_toxicgen1_small_test_clean.csv


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss




Error during training for file df_toxicgen1_small_test_clean.csv: not enough values to unpack (expected 4, got 1)
Processing file: df_toxicgen1_small_train_clean.csv


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Filter:   0%|          | 0/72 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,No log,11.083766,0.333333,0.333333,1.0,2,0,4,0
2,12.716400,4.244342,0.4,0.4,1.0,2,0,3,0
3,6.606300,2.215675,0.5,0.5,1.0,2,0,2,0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_toxicgen1_small_train_clean.csv:
{'eval_loss': 2.2156753540039062, 'eval_accuracy': 0.5, 'eval_precision': 0.5, 'eval_recall': 1.0, 'eval_tp': 2, 'eval_tn': 0, 'eval_fp': 2, 'eval_fn': 0, 'eval_runtime': 0.9752, 'eval_samples_per_second': 8.203, 'eval_steps_per_second': 1.025, 'epoch': 3.0, 'file': 'df_toxicgen1_small_train_clean.csv'}
Processing file: df_toxicgen1_test_clean.csv


Map:   0%|          | 0/846 [00:00<?, ? examples/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Filter:   0%|          | 0/846 [00:00<?, ? examples/s]

Filter:   0%|          | 0/94 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Tp,Tn,Fp,Fn
1,0.2903,0.187035,0.0,0.0,0.0,0,0,0,0
2,0.1287,0.039506,0.625,0.0,0.0,0,10,0,6
3,0.0564,0.022622,0.638298,0.0,0.0,0,60,0,34


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Results for df_toxicgen1_test_clean.csv:
{'eval_loss': 0.022621579468250275, 'eval_accuracy': 0.6382978723404256, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_tp': 0, 'eval_tn': 60, 'eval_fp': 0, 'eval_fn': 34, 'eval_runtime': 2.5031, 'eval_samples_per_second': 37.553, 'eval_steps_per_second': 4.794, 'epoch': 3.0, 'file': 'df_toxicgen1_test_clean.csv'}
Final Results for All Files:
{'eval_loss': 0.007821546867489815, 'eval_accuracy': 0.7466517857142857, 'eval_precision': 0.6215139442231076, 'eval_recall': 0.5416666666666666, 'eval_tp': 156, 'eval_tn': 513, 'eval_fp': 95, 'eval_fn': 132, 'eval_runtime': 23.0916, 'eval_samples_per_second': 38.802, 'eval_steps_per_second': 4.85, 'epoch': 3.0, 'file': 'df_toxicgen1_train_clean.csv'}
{'eval_loss': 12.55142593383789, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_tp': 1, 'eval_tn': 1, 'eval_fp': 0, 'eval_fn': 0, 'eval_runtime': 0.1159, 'eval_samples_per_second': 17.26, 'eval_steps_per_second': 8.63, 'epoch': 3.0, '