# Load a Small Dataset
The first 1,000 sentence pairs of the WMT14 English-German (`de-en`) training split.

In [None]:
from datasets import load_dataset
# Only the first 1,000 training pairs are requested, to keep the run small
dataset = load_dataset("wmt14", "de-en", split='train[:1000]')

# Hold out 10% of the examples for validation. The seed is fixed so the
# train/validation split is the same on every Restart & Run All.
train_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_data = train_dataset['train']
valid_data = train_dataset['test']

In [None]:
from transformers import AutoTokenizer

# Fetch the tokenizer that belongs to the "t5-small" checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="t5-small",
)

def tokenize_function(examples, prefix="translate English to German: "):
    """Tokenize a batch of English-German pairs for T5 fine-tuning.

    Args:
        examples: Batch dict whose ``"translation"`` entry is a list of dicts
            with ``"en"`` (source) and ``"de"`` (target) strings.
        prefix: Task prefix prepended to every English source sentence.
            T5 selects its task from such a text prefix.

    Returns:
        The tokenizer output holding the encoded sources plus a ``"labels"``
        entry with the encoded targets, where every padding position has been
        replaced by -100.
    """
    pairs = examples['translation']
    inputs = [prefix + pair['en'] for pair in pairs]
    targets = [pair['de'] for pair in pairs]

    # Truncate and pad to a fixed length. Fixed-length padding is kept because
    # the trainer in this notebook is built without a seq2seq data collator,
    # so label sequences must already share one length when batched.
    model_inputs = tokenizer(inputs, text_target=targets, truncation=True, padding="max_length")

    # Padding inside the labels must not contribute to the loss: -100 is the
    # index the loss ignores. Left as pad ids, the loss is averaged over the
    # padding and is dominated by trivially predictable positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs


# Tokenize both splits in batches: training split first, then validation
tokenized_train_data, tokenized_valid_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, valid_data)
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from transformers import T5ForConditionalGeneration, T5Config

# Instantiate T5-small from its pretrained checkpoint (only the model is
# loaded here; no separate configuration object is created)
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="t5-small",
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Settings for the fine-tuning run, grouped by concern
training_args = Seq2SeqTrainingArguments(
    # Output locations and checkpoint retention
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=3,
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    # Evaluation, run once per epoch
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
)

# Bundle the model, the tokenized splits and the run settings into a trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_valid_data,
    train_dataset=tokenized_train_data,
)

# Run fine-tuning; left as the last expression so the training summary is
# shown as the cell output
trainer.train()




Epoch,Training Loss,Validation Loss
1,No log,0.174107
2,No log,0.149673
3,1.638800,0.1447


TrainOutput(global_step=675, training_loss=1.2672367519802517, metrics={'train_runtime': 272.1203, 'train_samples_per_second': 9.922, 'train_steps_per_second': 2.481, 'total_flos': 365422863974400.0, 'train_loss': 1.2672367519802517, 'epoch': 3.0})

#  Evaluation

In [None]:
# Score the model on the validation split that was handed to the trainer
eval_results = trainer.evaluate()

# Report the metrics dictionary returned by the evaluation
print("Evaluation Results: " + str(eval_results))


Evaluation Results: {'eval_loss': 0.14470025897026062, 'eval_runtime': 2.96, 'eval_samples_per_second': 33.784, 'eval_steps_per_second': 8.446, 'epoch': 3.0}


# Generate Translations

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Translate with the model that was fine-tuned above. Re-loading "t5-small"
# here would replace it with the untouched pretrained checkpoint, so the
# translation would not reflect the training run at all. The tokenizer
# created earlier in the notebook is reused for the same reason: one
# tokenizer for training and inference.
model = trainer.model.to(device)
model.eval()

sample_text = "The house is wonderful."

# T5 picks its task from a text prefix, so state the translation direction
encoded = tokenizer("translate English to German: " + sample_text, return_tensors="pt").to(device)
inputs = encoded.input_ids

# Generate translation (beam search, at most 40 tokens); the attention mask
# is passed explicitly alongside the input ids
outputs = model.generate(
    input_ids=inputs,
    attention_mask=encoded.attention_mask,
    max_length=40,
    num_beams=4,
    early_stopping=True,
)
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Translation: {translation}")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Translation: Das Haus ist wunderbar.
