In [1]:
from datasets import load_dataset

# Finetune T5 on the English-French subset of the OPUS Books dataset to translate English text to French
data_file = "your data path"  # placeholder: parquet file that holds the dataset
model_tokenizer_path = "your model path"  # placeholder: checkpoint used for both tokenizer and model

# Fixed seed so the 70/30 split -- and every index-based sample taken from it
# in later cells -- is identical after a kernel restart.
SPLIT_SEED = 42

dataset = load_dataset("parquet", data_files=data_file, split="train")
train_test_dataset = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
train_test_dataset


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 88959
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 38126
    })
})

In [2]:
# Peek at the first training record to see the layout of the `translation` field.
sample = train_test_dataset["train"][0]
sample

{'id': '37001',
 'translation': {'en': 'Next day, when you were at Paris, waiting for your father, and he did not return, a man came to the door and handed in a letter from M. Duval.',
  'fr': '"Le lendemain, pendant que vous étiez à Paris et que vous attendiez votre père qui ne rentrait pas, un homme se présentait chez moi, et me remettait une lettre de M. Duval.'}}

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer from the same path that is later used to load the model.
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_path)
tokenizer

T5TokenizerFast(name_or_path='D:/Desktop/learn/instance/model/T5_small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_

In [None]:
source_lang = "en"
target_lang = "fr"
# Task prefix prepended to every source sentence. The trailing space keeps the
# prefix separated from the sentence, matching the form used for the inference
# text later in this notebook ("translate English to French: I love you").
prefix = "translate English to French: "


def process_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: batch dict whose ``"translation"`` entry is a list of dicts
            keyed by language code (``source_lang`` / ``target_lang``).

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        tokenized targets attached as ``labels``.
    """
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    # Passing text_target makes the tokenizer also produce `labels`.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

    return model_inputs
    

In [8]:
# Compare joint encoding (source and text_target in one call) with encoding
# each side on its own: the joint call's `labels` should equal the target's
# stand-alone `input_ids`.
source_lang = "en"
target_lang = "fr"

union_tokenize = tokenizer(sample["translation"][source_lang], text_target=sample["translation"][target_lang])
tokenizer_source = tokenizer(sample["translation"][source_lang])
tokenizer_target = tokenizer(sample["translation"][target_lang])

union_tokenize_labels = union_tokenize["labels"]
tokenize_alone_labels = tokenizer_target["input_ids"]

# A length difference is itself a mismatch. Checking it explicitly avoids an
# IndexError when the joint labels are longer and catches the case where the
# stand-alone encoding has extra trailing tokens.
if len(union_tokenize_labels) != len(tokenize_alone_labels):
    print("map error")

# Report every position at which the two encodings disagree.
for joint_id, alone_id in zip(union_tokenize_labels, tokenize_alone_labels):
    if joint_id != alone_id:
        print("map error")

In [None]:
# Tokenize both splits in batches. The original columns are removed so that only
# the fields returned by process_function remain.
# Note: the name `tokenized_datast` (sic) is referenced again when the trainer is built.
tokenized_datast = train_test_dataset.map(process_function, batched=True, remove_columns=train_test_dataset["train"].column_names)
tokenized_datast

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model from the same path the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_tokenizer_path)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used to batch the tokenized examples. It receives the model as well as
# the tokenizer (presumably so it can prepare decoder inputs from the labels --
# confirm against the DataCollatorForSeq2Seq documentation).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
data_collator

In [None]:
# Run a single tokenized example through the collator to inspect the batch it builds.
# Note: this rebinds `sample` to the training example at index 2.
sample = train_test_dataset["train"][2]
sample_inputs = tokenizer(sample["translation"]["en"], text_target=sample["translation"]["fr"])
sampe_data_collator = data_collator([sample_inputs])
# Author's note: decoder_input_ids and labels are related through filling with
# pad_token_id (presumably decoder_input_ids is derived from labels by the
# collator -- verify against the displayed output).
sampe_data_collator

In [None]:
from evaluate import load


# Load the sacreBLEU metric from a local script; the path is relative, so it
# depends on the directory the notebook is run from.
sacrebleu = load("../../evaluate/sacrebleu.py")
sacrebleu

In [None]:
import numpy as np


def postprocess_text(preds, labels):
    """Trim surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(predictions, references)`` tuple: predictions is a flat list of
        stripped strings, references is a list of one-element lists, each
        holding the stripped reference for the prediction at the same index.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    # Each reference is wrapped in its own list (one reference per prediction).
    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return stripped_preds, wrapped_refs


def compute_metrics(eval_preds):
    """Score generated translations with sacreBLEU and report their mean length.

    Args:
        eval_preds: ``(predictions, labels)`` pair. ``predictions`` is either
            the array of generated token ids or a tuple whose first item is
            that array. Either array may use -100 as a padding value.

    Returns:
        dict with ``"bleu"`` (the metric's ``score``) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    # Predictions can arrive wrapped in a tuple; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id: decoding it fails, and leaving it in place
    # would also be counted as a real token in gen_len below. Swap it for the
    # pad token in both predictions and labels before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Predictions and references do not need to have the same length
    # (author's note: the score is already averaged).
    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Average length of the generated sentences, ignoring pad tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
import torch

# Demo-sized training configuration: logs and evaluates every 3 steps.
training_args = Seq2SeqTrainingArguments(
    output_dir="./output",
    eval_strategy="steps",
    logging_strategy="steps",
    logging_steps=3,
    eval_steps=3,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    # Evaluate with generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
    # fp16 mixed precision needs a GPU; fall back to full precision on CPU so
    # the notebook still runs there (the manual-inference cell makes the same
    # CUDA-or-CPU choice).
    fp16=torch.cuda.is_available(),
)

In [None]:
# Train and evaluate on tiny subsets (30 / 10 examples) so the demo finishes
# quickly. The shuffles are seeded with the training seed so the same examples
# are selected on every run and the reported metrics are comparable.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datast["train"].shuffle(seed=training_args.seed).select(range(30)),
    eval_dataset=tokenized_datast["test"].shuffle(seed=training_args.seed).select(range(10)),
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

# 推断

## 使用pipeline

In [None]:
from transformers import pipeline

# Build the pipeline from the in-memory model and tokenizer so inference uses
# the weights that were just fine-tuned. Loading from model_tokenizer_path would
# reload the original checkpoint and ignore the training run.
pipe = pipeline(task="translation", model=model, tokenizer=tokenizer)
pipe

In [None]:
# The task prefix is written into the input text by hand.
text = "translate English to French: I love you"

pipe(text)

## 手动推断

In [None]:
import torch

tokenized_text = tokenizer(text, return_tensors="pt")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Put the model on the same device as the inputs; otherwise generate() fails
# with a device mismatch whenever the model is not already on `device`.
model.to(device)
tokenized_text = {k: v.to(device) for k, v in tokenized_text.items()}

# Sampling options are generation settings, so they go to generate().
# Passed to tokenizer.decode() they have no effect on the output.
generate_ids = model.generate(**tokenized_text, do_sample=True, top_k=30, top_p=0.95)[0]

generate_text = tokenizer.decode(generate_ids, skip_special_tokens=True)
generate_text