In [None]:
import transformers
import datasets

In [None]:
BASE_DIR = 'c:/Users/raven/Nextcloud/Documents/Development/translation'
model_name = f"{BASE_DIR}/nllb-200-distilled-600M"

In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_name).to('cuda')

In [None]:
# The dataset is hosted on the Hub; no local files are needed.
# It contains two fields: "la" (latin) and "en" (english).
raw = datasets.load_dataset("grosenthal/latin_english_translation")

# The hub version already provides a train/validation split.
# If you want a test split as well, you can further split the validation set.
raw = raw["train"].train_test_split(test_size=0.1, seed=42)

train_raw = raw["train"].shuffle()
test_raw  = raw["test"]

In [None]:
# Let's look at an example.
train_raw[0]

{'id': 4713,
 'la': 'Tibi adeo permitto, finge quidvis, eminiscere, excogita, quod possit magicum videri: tamen de eo tecum decertarem.',
 'en': 'Indeed, I give you leave to imagine, invent, suppose anything that might seem to be magical, and even so I would dispute what you said about it.',
 'file': 'final_alignments\\Apuleius_Apologia.json'}

In [None]:
# Latin is a new language for the nllb model, let's add a new language token.
tokenizer.add_special_tokens({'additional_special_tokens': ['lat_Latn']}, replace_additional_special_tokens=False)

1

In [None]:
# The token embeddings need to be extended.
model.resize_token_embeddings(len(tokenizer))

M2M100ScaledWordEmbedding(256205, 1024, padding_idx=1)

In [None]:
def build_example(example):
    """ Apply this function to all examples to create training samples.

    Set the special token for the tokenizer first to english and then to latin.
    The tokenize I use here automatically prepends the language token, which we need
    so that the model knows what to translate into what.

    Also set the pad tokens to -100 so that the loss computation is not biased by many padding tokens.
    
    Params:
        example: A dictionary from the datasets object containing the English and Latin sentences.

    Returns:
        A Dictionary per example with the token IDs for the English and Latin sentences and the attention mask
        for the English sentence.
    """
    tokenizer.src_lang = 'eng_Latn'
    x = tokenizer(example['en'], return_tensors='pt', padding="max_length", truncation=True, max_length=256)
    tokenizer.src_lang = 'lat_Latn'
    y = tokenizer(example['la'], return_tensors='pt', padding="max_length", truncation=True, max_length=256)
    y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

    # Return everything the model expects
    return {
        "input_ids": x["input_ids"][0],
        "attention_mask": x["attention_mask"][0],
        "labels": y["input_ids"][0]
    }

# Apply transform on all samples.
train_dataset = train_raw.map(build_example, remove_columns=["en", "la"])
test_dataset  = test_raw.map(build_example,  remove_columns=["en", "la"])

Map:   0%|          | 0/89408 [00:00<?, ? examples/s]

Map:   0%|          | 0/9935 [00:00<?, ? examples/s]

In [None]:
# The data collator. Pretty straightforward.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_name,
    padding=True,            # pad to longest in batch
    label_pad_token_id=-100
)

In [None]:
# Set up the trainer.
# I only train for 8 epochs, because my GPU isn't exactly the best and this will take some time already.
training_args = transformers.Seq2SeqTrainingArguments(
    output_dir=f"{BASE_DIR}/finetuned-nllb",
    overwrite_output_dir=True,
    num_train_epochs=8,               # start small; increase if needed
    per_device_train_batch_size=4,    # fits on a 12 GB GPU; adjust as memory allows
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,    # effective batch size = 32
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,                        # mixed‑precision speeds up training on RTX/AMP GPUs
    push_to_hub=False,
)

trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [None]:
# Train.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.6932,2.513648
2,2.3718,2.242853
3,2.1468,2.112076
4,2.0468,2.041561
5,1.9187,1.998994
6,1.8855,1.966503
7,1.825,1.953404
8,1.8136,1.949745




TrainOutput(global_step=22352, training_loss=2.1760324659139325, metrics={'train_runtime': 47609.9634, 'train_samples_per_second': 15.023, 'train_steps_per_second': 0.469, 'total_flos': 3.875129765312594e+17, 'train_loss': 2.1760324659139325, 'epoch': 8.0})

In [None]:
utils.translate_en_lat(test_raw[1]['en'], tokenizer, model), test_raw[1]['en'], test_raw[1]['la']

('scelus Tarpeiae et turpis sermo tellus et veteris capta domus Iovis.',
 'The crime of Tarpeia and her shameful grave will be my tale, and how the dwelling of ancient Jove was captured.',
 'Tarpeium scelus et Tarpeiae turpe sepulcrum fabor et antiqui limina capta Iovis.')

In [None]:
utils.translate_en_lat(train_raw[1]['en'], tokenizer, model), train_raw[1]['en'], train_raw[1]['la']

('Quibus quoniam propter extrema levitatem, ut supra docuimus, velocitate circumferuntur, facile animos nostros singulis imaginibus quicumque dato exstinguit; est enim ipsa mens tenvia et mirabile motus.',
 'And since these are carried about with velocity because of their extreme lightness, as I explained before, any given one of these fine images easily bestirs our mind by a single impression; for the mind is itself thin and wonderfully easy to move.',
 'quae cum mobiliter summa levitate feruntur, ut prius ostendi, facile uno commovet ictu quaelibet una animum nobis subtilis imago; tenvis enim mens est et mire mobilis ipsa.')