# 数据处理

In [None]:
from datasets import load_dataset

# Fine-tune T5 for abstractive summarization on BillSum parquet shards.
# NOTE: only the validation split points at the California (`ca_test`) shard;
# the train and test splits use the generic `train` / `test` shards.
# Replace "your download path" with the directory holding the downloaded files.
data_file = {
    "train": "your download path/data/train-00000-of-00001.parquet",
    "validation": "your download path/data/ca_test-00000-of-00001.parquet",
    "test": "your download path/data/test-00000-of-00001.parquet",
}
# Placeholder: point this at the checkpoint used for both the tokenizer and
# the model in the cells below.
model_tokenizer_path = "your model path"
dataset = load_dataset("parquet", data_files=data_file)
# Bare name as the last expression so the notebook displays the loaded splits.
dataset

In [None]:
# Peek at the first training record: its raw text, followed by the full record.
train_split = dataset["train"]
sample = train_split[0]
(sample["text"], sample)

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer from the same path that is later used for the model.
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_path)
# Bare name as the last expression so the notebook displays the tokenizer.
tokenizer

In [None]:
def process_function(example):
    """Tokenize BillSum records for seq2seq fine-tuning.

    Args:
        example: A record or, when used with ``dataset.map(batched=True)``,
            a batch whose ``"text"`` and ``"summary"`` entries are parallel
            lists of strings.

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` for the (prefixed)
        documents and ``labels`` holding the token ids of the summaries.
    """
    # The inference cells feed the model text that starts with "summarize: ",
    # so the training inputs carry the same task prefix; otherwise the model
    # is fine-tuned on a different input format than it is queried with.
    prefix = "summarize: "
    raw_texts = example["text"]
    if isinstance(raw_texts, str):
        texts = prefix + raw_texts
    else:
        texts = [prefix + text for text in raw_texts]
    summaries = example["summary"]

    # Documents and summaries differ a lot in length, so they are tokenized
    # in separate calls with their own max_length instead of one joint call
    # (unlike the translation-task preprocessing).
    tokenized_text = tokenizer(texts, max_length=1024, truncation=True)
    tokenized_summaries = tokenizer(text_target=summaries, max_length=128, truncation=True)

    return {
        "input_ids": tokenized_text["input_ids"],
        "attention_mask": tokenized_text["attention_mask"],
        "labels": tokenized_summaries["input_ids"],
    }

In [None]:
# Tokenize every split in batches; the original columns of the train split
# are removed so only the fields returned by process_function remain.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    process_function,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_dataset

**解释关于tokenizer中传入一个文本还是两个文本的差别**

In [None]:
# What `text_target` does in the tokenizer call:
# when a source text and `text_target` are passed together, the tokenized
# target is placed under `labels`; calling tokenizer(text_target=x) on its own
# is expected to give the same result as tokenizer(x) (compare tokenized_b and
# tokenized_b_tar in the output below).

# Drawback of passing both texts in a single call: the same settings
# (e.g. truncation and padding) are applied to both texts, which is inflexible.

sample_a = "I am very happy to learn knowledge about LLM"
sample_b = "like to learn LLM"
tokenized_a = tokenizer(sample_a)
tokenized_b = tokenizer(sample_b)
tokenized_b_tar = tokenizer(text_target=sample_b)
tokenized_a_b = tokenizer(sample_a, text_target=sample_b)
# Display all four encodings side by side for comparison.
tokenized_a, tokenized_b, tokenized_b_tar, tokenized_a_b

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model from the same path the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_tokenizer_path)
# Bare name as the last expression so the notebook displays the model.
model

In [None]:
from transformers import DataCollatorForSeq2Seq

# Build the seq2seq batch collator from the tokenizer and model loaded above;
# it is handed to the trainer as `data_collator` further down.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Bare name as the last expression so the notebook displays the collator.
data_collator

# 模型训练

In [None]:
from evaluate import load

# Load the ROUGE metric from a metric script given by a relative path;
# compute_metrics below calls `rouge.compute(...)`.
rouge = load("../../evaluate/rouge.py")
# Bare name as the last expression so the notebook displays the metric.
rouge

In [None]:
import numpy as np

def compute_metrics(pred):
    """Compute ROUGE and the mean generated length for one evaluation pass.

    Args:
        pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        The dict returned by ``rouge.compute`` with an extra ``"gen_len"``
        entry: the mean number of non-padding tokens per prediction.
    """
    predictions, labels = pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks positions to ignore and cannot be decoded, so map it to the
    # pad id first: np.where(condition, value_if_true, value_if_false).
    # Applied to predictions as well, in case they were padded with -100.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_predictions,
        references=decoded_labels,
        use_stemmer=True,
    )
    print(decoded_predictions)

    # Length is measured on the token ids, not on the decoded strings:
    # count the positions in each prediction that are not padding.
    prediction_len = [np.count_nonzero(ids != pad_id) for ids in predictions]
    result["gen_len"] = np.mean(prediction_len)

    return result

In [None]:
# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Checkpointing: save once per epoch, keep at most three checkpoints.
    output_dir="./checkpoint",
    save_strategy="epoch",
    save_total_limit=3,
    # Optimisation.
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    # NOTE(review): fp16 presumably needs a CUDA device — confirm before
    # running this notebook on CPU.
    fp16=True,
    # Logging every 30 steps.
    logging_strategy="steps",
    logging_steps=30,
    # Evaluation.
    # NOTE(review): eval_steps=3 evaluates far more often than the loss is
    # logged (every 30 steps) — confirm this frequency is intended.
    eval_strategy="steps",
    eval_steps=3,
    per_device_eval_batch_size=3,
    # Use generation for evaluation predictions, which compute_metrics decodes.
    predict_with_generate=True,
)

In [None]:
# Train and evaluate on small random subsets (300 / 50 examples) of the
# tokenized train and validation splits.
train_subset = tokenized_dataset["train"].shuffle().select(range(300))
eval_subset = tokenized_dataset["validation"].shuffle().select(range(50))

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the trainer configured above; as the last expression
# of the cell, the call's return value is displayed by the notebook.
trainer.train()

# 评估

In [None]:
# Evaluate on a random 100-example subset of the tokenized test split.
test_subset = tokenized_dataset["test"].shuffle().select(range(100))
trainer.evaluate(test_subset)

# 推断

## 使用pipeline

In [None]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

from transformers import pipeline

# Build the pipeline from the in-memory `model` (the one passed to the
# trainer) instead of `model_tokenizer_path`: loading from the path would
# reload the original checkpoint and ignore the fine-tuning done above.
# A model object needs its tokenizer passed explicitly; `device` keeps the
# pipeline on whatever device the model already lives on.
# NOTE(review): the summarization pipeline may already prepend the task prefix
# configured for T5 checkpoints — confirm whether the literal "summarize: "
# at the start of `text` is still wanted here.
pipe = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    device=model.device,
)

pipe(text)

## 手动进行推断

In [None]:
import torch

# Manual generation with the in-memory model.
# Inputs must sit on the same device as the model's weights, so the device is
# taken from the model itself rather than guessed from CUDA availability.
device = model.device
tokenized_text = tokenizer(text, return_tensors="pt")
tokenized_text = {k: v.to(device) for k, v in tokenized_text.items()}

# Make sure dropout is off so greedy decoding (do_sample=False) is repeatable.
model.eval()
generate_ids = model.generate(**tokenized_text, max_new_tokens=100, do_sample=False)

# Skip special tokens so only the summary text is shown, consistent with the
# decoding done in compute_metrics.
generate_text = tokenizer.decode(generate_ids[0], skip_special_tokens=True)
generate_text