In [8]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 24 2025

@author: Yaning
"""

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

In [5]:
# Load your custom dataset (assuming it's a plain text file)
def load_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    # We assume each poem is split by the "<TITLE>" and "<POEM>" tags
    titles = []
    poems = []
    current_title = ""
    current_poem = ""
    
    for line in lines:
        if line.startswith("<TITLE>"):
            # Save the current poem and reset
            if current_title and current_poem:
                titles.append(current_title)
                poems.append(current_poem)
            current_title = line.strip().replace("<TITLE>", "").strip()
            current_poem = ""
        elif line.startswith("<POEM>"):
            current_poem = line.strip().replace("<POEM>", "").strip()
    
    # Add last poem if any
    if current_title and current_poem:
        titles.append(current_title)
        poems.append(current_poem)
    
    return Dataset.from_dict({"title": titles, "poem": poems})

# Tokenize the dataset
def tokenize_function(examples):
    # Format as <TITLE>: {title} <POEM>: {poem}
    inputs = [f"<TITLE>: {title} <POEM>: {poem}" for title, poem in zip(examples['title'], examples['poem'])]
    
    # Tokenize the inputs and return input_ids as both 'input_ids' and 'labels' for language modeling
    tokenized = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    tokenized['labels'] = tokenized['input_ids']  # The labels should be the same as input_ids for LM
    return tokenized


In [9]:
# Load the pre-trained tokenizer and model
# model_name = "gpt2"  # You can change this to a different German model, e.g., "bert-base-german-cased"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)

model_name = "dbmdz/german-gpt2"  # You can change this to a different German model, e.g., "bert-base-german-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure the tokenizer knows about the special tokens
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))



# Load and preprocess the dataset
dataset = load_dataset('/home/yahu202d/workspaces/horse/yahu202d-saexy/poems.txt')  # Path to your dataset file
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set up TrainingArguments
training_args = TrainingArguments(
    output_dir="./german_poem_model",  # output directory
    num_train_epochs=3,  # number of epochs
    per_device_train_batch_size=1,  # batch size per device
    save_steps=10_000,  # save checkpoint every 10,000 steps
    save_total_limit=2,  # limit the number of saved models
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)



The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 34.79 examples/s]
  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [10]:
# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_poem_model")

Step,Training Loss


In [11]:
input_title = "Der Mondschein"
input_ids = tokenizer.encode(f"<TITLE>: {input_title} <POEM>:", return_tensors="pt")

output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, top_p=0.9, temperature=0.7)
generated_poem = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_poem)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<TITLE>: Der Mondschein <POEM>: Die Sonne ist aufgegangen.
Der Mond ist aufgeht. <TAT> : Der Himmel ist blau.<POEF>.: Der Regen ist weg.</POET>. </TOT>
<PREAK> <PORT><SETA>> </P></P}: <SESSION>; <SUCHARGE> ====<<[[POP]
