In [1]:
# 安装必要的依赖库
!pip install -q mlflow rouge_score transformers datasets evaluate
!pip install evaluate


# Import necessary toolkits
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
import pandas as pd
import numpy as np
import torch
from evaluate import load as load_metric  # 使用 evaluate 库来加载指标

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.2/128.2 kB[0m [31m10.5 MB/s[0m eta [36m

In [2]:
def preprocess_data(csv_files):
    data = []
    for file in csv_files:
        df = pd.read_csv(file)
        # 根据实际列名设置输入列和标签列
        if 'text' in df.columns:
            text_column = 'text'
        elif 'sentence' in df.columns:
            text_column = 'sentence'
        elif 'prompt' in df.columns:
            text_column = 'prompt'
        else:
            raise ValueError(f"Column for text input not found in {file}. Available columns are: {df.columns}")

        if 'label' not in df.columns:
            raise ValueError(f"Column 'label' not found in {file}. Available columns are: {df.columns}")

        for _, row in df.iterrows():
            text = f"text: {row[text_column]} </s>"
            label = str(row['label'])  # 将标签转换为字符串形式
            data.append({"input_text": text, "target_text": label})
    return data


csv_files = [
    'superset_test.csv',
    'superset_train.csv',
    'superset_train_small_test.csv',
    'superset_train_small_train.csv'
]
all_data = preprocess_data(csv_files)

In [3]:
# Step 3: 创建数据集
all_data = preprocess_data(csv_files)
dataset = Dataset.from_pandas(pd.DataFrame(all_data))
dataset = dataset.train_test_split(test_size=0.1)
train_data = dataset['train']
validation_data = dataset['test']

In [4]:
# Step 4: 加载T5模型和分词器，并将模型加载到GPU上
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)


# 数据集格式转换为模型输入
def tokenize_data(batch):
    tokenized_input = tokenizer(batch['input_text'], padding='max_length', truncation=True, max_length=512)
    tokenized_label = tokenizer(batch['target_text'], padding='max_length', truncation=True, max_length=64)

    tokenized_input['labels'] = tokenized_label['input_ids']
    return tokenized_input

train_data = train_data.map(tokenize_data, batched=True)
validation_data = validation_data.map(tokenize_data, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/32765 [00:00<?, ? examples/s]



Map:   0%|          | 0/3641 [00:00<?, ? examples/s]

In [5]:
BATCH_SIZE = 8
NUM_EPOCHS = 1

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True,
    fp16=True,
    report_to="none",
)



In [6]:
# 加载分类评估指标（例如 accuracy）
accuracy_metric = load_metric("accuracy")

# 定义评估函数
def compute_metrics(pred):
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    labels_ids = pred.label_ids
    labels_ids = np.where(labels_ids == -100, tokenizer.pad_token_id, labels_ids)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # 转换预测和标签为整数形式
    pred_int = [int(p) for p in pred_str]
    label_int = [int(l) for l in label_str]

    # 计算准确率
    accuracy = accuracy_metric.compute(predictions=pred_int, references=label_int)

    return {"accuracy": accuracy["accuracy"]}


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [7]:
# Step 5: 初始化Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=validation_data,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [8]:
# 开始训练
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0102,0.009183,0.669871


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=4096, training_loss=0.07563605193809053, metrics={'train_runtime': 833.7123, 'train_samples_per_second': 39.3, 'train_steps_per_second': 4.913, 'total_flos': 4434474125230080.0, 'train_loss': 0.07563605193809053, 'epoch': 1.0})