In [2]:
# 🌟 Bonus: Finetuning GPT-2 auf Tiny Shakespeare

from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, Trainer, TrainingArguments
from huggingface_hub import login, create_repo, upload_folder
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = {
    "epochs": 5,
    "batch_size": 2,
    "learning_rate": 5e-5,
    "max_length": 128,
    "model_name": "gpt2",
    "dataset_name": "tiny_shakespeare",
    "device": "cpu",
}

In [4]:
# Anmeldung bei Hugging Face
login("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") 

# Weights & Biases
wandb.init(project="gpt2-finetuned-shakespeare_bonus",name="GPT-2 Finetuning TinyShakespeare", config=config,)


# Laden des Tiny Shakespeare-Datensatzes
dataset = load_dataset(config["dataset_name"], trust_remote_code=True)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrahaf-aswad[0m ([33mrahaf-aswad-hochschule-hannover[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Lade den GPT-2 Tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(config["model_name"])

# GPT-2 hat kein Pad-Token → Setze ihn auf EOS-Token
tokenizer.pad_token = tokenizer.eos_token

In [6]:

# Definieren der Tokenisierungsfunktion
def tokenize_function(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=config["max_length"],
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

In [7]:

# Anwenden des Tokenizers auf die Daten
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Lade das vortrainierte GPT-2 Modell
model = GPT2LMHeadModel.from_pretrained(config["model_name"])

# Aktualisiere die Tokenizer-Größe im Modell
model.resize_token_embeddings(len(tokenizer))
model = model.to(config["device"])

In [9]:
# Trainingseinstellungen 
training_args = TrainingArguments(
    output_dir="gpt2-finetuned-shakespeare_bonus",
    overwrite_output_dir=True,
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    learning_rate=config["learning_rate"],
    logging_dir="logs",
    report_to="wandb",
    logging_steps=1,  
    save_total_limit=2,
    save_steps=5, 
)

# Trainer vorbereiten
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)


trainer.train()

# Beende die Weights & Biases Session
wandb.finish()


  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,4.3148
2,3.6754
3,3.0633
4,2.962
5,2.6924


0,1
train/epoch,▁▃▅▆██
train/global_step,▁▃▅▆██
train/grad_norm,█▇▁▃▁
train/learning_rate,█▆▅▃▁
train/loss,█▅▃▂▁

0,1
total_flos,326615040000.0
train/epoch,5.0
train/global_step,5.0
train/grad_norm,18.73566
train/learning_rate,1e-05
train/loss,2.6924
train_loss,3.34158
train_runtime,12.3139
train_samples_per_second,0.406
train_steps_per_second,0.406


In [10]:
# Modell und Tokenizer lokal speichern
model.save_pretrained("gpt2-finetuned-shakespeare_bonus")
tokenizer.save_pretrained("gpt2-finetuned-shakespeare_bonus")

('gpt2-finetuned-shakespeare_bonus\\tokenizer_config.json',
 'gpt2-finetuned-shakespeare_bonus\\special_tokens_map.json',
 'gpt2-finetuned-shakespeare_bonus\\vocab.json',
 'gpt2-finetuned-shakespeare_bonus\\merges.txt',
 'gpt2-finetuned-shakespeare_bonus\\added_tokens.json',
 'gpt2-finetuned-shakespeare_bonus\\tokenizer.json')

In [15]:

# Erstelle ein neues Repository auf Hugging Face und lade es hoch
repo_name = "gpt2-finetuned-shakespeare_bonu"
create_repo(repo_name, private=False)
upload_folder(folder_path="gpt2-finetuned-shakespeare_bonus", repo_id=f"rahaf-aswad/{repo_name}")


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]
[A

[A[A


[A[A[A



training_args.bin: 100%|██████████| 5.24k/5.24k [00:00<00:00, 29.6kB/s]
scheduler.pt: 100%|██████████| 1.06k/1.06k [00:00<00:00, 5.13kB/s]
rng_state.pth: 100%|██████████| 14.0k/14.0k [00:00<00:00, 48.9kB/s]s]
model.safetensors:   0%|          | 803k/498M [00:00<05:51, 1.41MB/s]
[A
model.safetensors:   0%|          | 950k/498M [00:00<09:34, 865kB/s] 
model.safetensors:   0%|          | 1.16M/498M [00:01<10:01, 826kB/s]
model.safetensors:   0%|          | 1.26M/498M [00:01<09:54, 836kB/s]
model.safetensors:   0%|          | 1.36M/498M [00:01<09:45, 849kB/s]
model.safetensors:   0%|          | 1.56M/498M [00:01<09:40, 854kB/s]
model.safetensors:   0%|          | 1.65M/498M [00:01<09:35, 862kB/s]
model.safetensors:   0%|          | 1.75M/498M [00:01<09:35, 862kB/s]
model.safetensors:   0%|          | 1.85M/498M [00:02<09:34, 864kB/s]
model.safetensors:   0%|          | 1.95M/498M [00:02<09:45, 846kB/s

CommitInfo(commit_url='https://huggingface.co/rahaf-aswad/gpt2-finetuned-shakespeare_bonu/commit/f8826d6a12a0d3808640c5162f2f87a921fdb050', commit_message='Upload folder using huggingface_hub', commit_description='', oid='f8826d6a12a0d3808640c5162f2f87a921fdb050', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rahaf-aswad/gpt2-finetuned-shakespeare_bonu', endpoint='https://huggingface.co', repo_type='model', repo_id='rahaf-aswad/gpt2-finetuned-shakespeare_bonu'), pr_revision=None, pr_num=None)