In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 9 2025

@author: Yaning
"""

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the pretrained German GPT-2 tokenizer and causal-LM weights.
# The checkpoint name is kept in one place so tokenizer and model cannot drift apart.
MODEL_NAME = "dbmdz/german-gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()
# Prefer the GPU when one is available; fall back to CPU instead of raising
# on machines without CUDA. Kept as the last expression so the cell still
# displays the module summary.
model.to("cuda" if torch.cuda.is_available() else "cpu")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50265, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50265, bias=False)
)

In [None]:
# Fine-tune the word-embedding matrix (`wte`) of the loaded model on dialect
# sentences, then save the model and tokenizer to disk.

# Freeze every parameter, then re-enable gradients for the word embeddings only.
# NOTE(review): GPT-2 checkpoints commonly tie `lm_head.weight` to `wte.weight`;
# if that holds for this checkpoint, the output projection is updated together
# with the input embeddings -- confirm before relying on "embeddings only".
model.requires_grad_(False)
model.transformer.wte.requires_grad_(True)

# Follow whatever device the model currently lives on instead of hard-coding "cuda".
device = next(model.parameters()).device

# Define optimizer.
# torch.optim.AdamW replaces the deprecated `transformers.AdamW`. `eps` and
# `weight_decay` are set explicitly to the defaults of the deprecated class
# (torch's own defaults are eps=1e-8 and weight_decay=0.01).
optimizer = torch.optim.AdamW(
    model.transformer.wte.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Sample dialect dataset (replace with actual dataset)
dialect_sentences = [
    "Albrechtsburg und Dom zu Meißen Ihr geent mirsch globn"
]
# One tensor of token ids per sentence, placed on the model's device.
dialect_tokens = [
    tokenizer.encode(sent, return_tensors="pt").to(device)
    for sent in dialect_sentences
]

# Training loop (simplified): one optimizer step per sentence, no batching.
model.train()
epochs = 3
for epoch in range(epochs):
    total_loss = 0.0  # sum of per-sentence losses within this epoch
    for tokens in dialect_tokens:
        optimizer.zero_grad()
        # Passing the inputs as labels makes the model return a language-modelling loss.
        outputs = model(tokens, labels=tokens)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Return to inference mode so later cells do not run the model in train mode.
model.eval()

# Save fine-tuned model and its tokenizer side by side in the same directory.
output_dir = "german-gpt-dialect"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Fine-tuning complete! Model saved.")




Epoch 1, Loss: 5.2945
Epoch 2, Loss: 5.2508
Epoch 3, Loss: 5.2068
Fine-tuning complete! Model saved.
