In [None]:
"""
pip install torch==2.2.0 \
            transformers==4.41.0 \
            datasets==2.19.0 \
"""

In [1]:
import transformers
import torch
import datasets

In [2]:
BASE_DIR = 'c:/Users/raven/Nextcloud/Documents/Development/translation'
model_name = f"{BASE_DIR}/gpt2-medium"   

In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.GPT2LMHeadModel.from_pretrained(model_name).to('cuda')

tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({"additional_special_tokens": ['<|stop|>']})
stop_token_id = tokenizer.convert_tokens_to_ids('<|stop|>')

In [4]:
tokenizer.eos_token, tokenizer.eos_token_id, tokenizer.additional_special_tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.additional_special_tokens[0]), stop_token_id

('<|endoftext|>', 50256, '<|stop|>', 50257, 50257)

In [5]:
NUM_BLOCKS = len(model.transformer.h)   # total blocks (12 for base GPT‑2)
freeze_up_to = -1

print(f"GPT2 has {NUM_BLOCKS} transformer blocks; freezing first {freeze_up_to}")

GPT2 has 24 transformer blocks; freezing first -1


In [6]:

def freeze_gpt2_layers(model, n_blocks):
    """
    Freeze the first `n_blocks` transformer blocks + the embedding matrices.
    """
    # Freeze token & position embeddings (optional but common)

    if n_blocks < 0:
        trainable = sum(p.requires_grad for p in model.parameters())
        total = sum(1 for _ in model.parameters())
        print(f"Trainable params: {trainable}/{total}")
        return None
    
    for param in model.transformer.wte.parameters():
        param.requires_grad = False
    for param in model.transformer.wpe.parameters():
        param.requires_grad = False

    # 3b️⃣ Freeze the first `n_blocks` blocks
    for block_idx in range(n_blocks):
        block = model.transformer.h[block_idx]
        for name, param in block.named_parameters():
            param.requires_grad = False
            # Uncomment the next line for a quick sanity print:
            # print(f"Freezing {name} in block {block_idx}")

    # 3c️⃣ Verify which parameters are still trainable
    trainable = sum(p.requires_grad for p in model.parameters())
    total = sum(1 for _ in model.parameters())
    print(f"Trainable params: {trainable}/{total}")

# Apply the freezing
freeze_gpt2_layers(model, freeze_up_to)

Trainable params: 292/292


In [7]:
# The dataset is hosted on the Hub; no local files are needed.
# It contains two fields: "latin" (source) and "english" (target).
raw = datasets.load_dataset("grosenthal/latin_english_translation")

# The hub version already provides a train/validation split.
# If you want a test split as well, you can further split the validation set.
raw = raw["train"].train_test_split(test_size=0.1, seed=42)

train_raw = raw["train"].shuffle()
test_raw  = raw["test"]

In [8]:
train_raw[0]

{'id': 94664,
 'la': 'et corruent singuli super fratres suos quasi bella fugientes nemo vestrum inimicis audebit resistere',
 'en': 'And they shall every one fall upon their brethren as fleeing from wars: none of you shall dare to resist your enemies.',
 'file': 'final_alignments\\Vulgate_Bible.json'}

In [9]:
INSTRUCTION = 'translate English to Latin'
MAX_LEN = 128

In [14]:
def build_example(example):
    # Example prompt – feel free to change the wording or language pair
    prompt = f"{INSTRUCTION}: {example['en']}"
    # The model will see: "<prompt> <example>"
    response = f"{example['la']}"
    # Tokenise the input prompt
    tokenized_prompt = tokenizer(
        prompt,
        truncation=True,
        max_length=MAX_LEN,
        add_special_tokens=True,
        padding="max_length",
    )
    tokenized_response = tokenizer(
        response,
        truncation=True,
        max_length=MAX_LEN,
        add_special_tokens=True,
        padding="max_length",
    )

    tokenized_response_input_ids = tokenized_response['input_ids']  # this is a list

    try:
        first_eos_token = tokenized_response_input_ids.index(tokenizer.eos_token_id)
        tokenized_response_input_ids[first_eos_token] = stop_token_id
        tokenized_response_input_ids = [x if x != tokenizer.eos_token_id else -100 for x in tokenized_response_input_ids]
    except ValueError:
        # There is no first eos_token_id: we ran out of space with our max_length
        # Don't do anything.
        pass

    example['input_ids'] = tokenized_prompt['input_ids']
    example['labels'] = tokenized_response_input_ids
    return example

train_dataset = train_raw.select(range(10000)).map(build_example, remove_columns=["en", "la"])
test_dataset  = test_raw.map(build_example,  remove_columns=["en", "la"])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9935 [00:00<?, ? examples/s]

In [15]:
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,                 # not masked LM – we keep causal LM behavior
)

In [16]:
model.resize_token_embeddings(len(tokenizer))   # in case we added a pad token

Embedding(50258, 1024)

In [17]:
training_args = transformers.TrainingArguments(
    output_dir=f"{BASE_DIR}/finetuned-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=3,               # start small; increase if needed
    per_device_train_batch_size=4,    # fits on a 12 GB GPU; adjust as memory allows
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,    # effective batch size = 32
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,                        # mixed‑precision speeds up training on RTX/AMP GPUs
    push_to_hub=False,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [18]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.2525,3.154493
2,3.0435,3.138791
3,2.8645,3.150563


TrainOutput(global_step=939, training_loss=3.081868936459477, metrics={'train_runtime': 858.5029, 'train_samples_per_second': 34.945, 'train_steps_per_second': 1.094, 'total_flos': 6965255208960000.0, 'train_loss': 3.081868936459477, 'epoch': 3.0})

In [20]:
def translate(sentence: str, max_new_tokens: int = 128):
    prompt = f"{INSTRUCTION}: {sentence}"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # Generate until EOS or max_new_tokens
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,               # you can switch to greedy (do_sample=False)
        temperature=0.7,
    )
    # Decode everything after the prompt
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    # Remove the prompt part
    return output[len(prompt):].strip()

# Example
print(translate("How are you today?"))

Have you been a good boy? No, I have not been one. But I am glad that you are alive, because, if I had not been, I should not have been able to write this. It is true that I am rather alone in this, since you have been my only companion, and it is better than no one. I am not sure how it happened, but I do not believe that you are so wretched that you have been unable to bear a letter. I should like to send you a letter, but I am afraid that I shall have to wait until next summer. Tell me, my dear friend, what


In [21]:
test_raw[0]

{'id': 18337,
 'la': 'Verum ea re tot res sunt, uti bene deicias, et suave est',
 'en': 'but there are so many ingredients in this concoction that it is an excellent purgative, and, besides, it is agreeable.',
 'file': 'final_alignments\\Cato_Agriculture.json'}

In [23]:
print(translate(train_raw[0]['en']))

And the Lord shall take you by the hand, and you shall die. And you shall be dashed with a sword, and shall die. But I will protect you, and will spare you. And your enemies shall be destroyed, and you shall find no place to escape. And I will hear your prayers, and will deliver you out of my hand. And you shall be delivered into my hand. And you shall escape into your own country. And my God will save you out of my hand. And your enemies shall be destroyed, and you shall find no place to escape. And I will bring you out of the midst of your enemies


In [27]:
inputs = tokenizer('The story of the black cat goes so:', return_tensors="pt").to("cuda")
# Generate until EOS or max_new_tokens
generated_ids = model.generate(
    **inputs,
    max_new_tokens=16,
    eos_token_id=stop_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,               # you can switch to greedy (do_sample=False)
    temperature=0.7,
)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
output

'The story of the black cat goes so: One day, a man was sitting by a stream and when he noticed a cat'

In [26]:
generated_ids

tensor([[2061,  318,  366, 9246,    1,  287, 2679,   30,  843,  644,  318,  262,
         3616,  286,  262, 2456, 3797,  290, 3290,  287, 9133,   30,  554, 5486]],
       device='cuda:0')