In [1]:
!pip install torch transformers datasets wandb huggingface_hub



In [2]:
from transformers import GPT2Tokenizer, GPT2Tokenizer,GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, TrainerCallback, DataCollatorForLanguageModeling
import wandb
import os
import torch


In [None]:
# WANDB SETUP
os.environ['WANDB_API_KEY'] = "xxx"
wandb.login()
run = wandb.init(
    project="gpt2-finetuned",
    config={
        "learning_rate": 1e-5,
        "epochs": 5,
        "batch_size": 8
    }
)


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: altkachenko11 (altkachenko11-hochschule-hannover) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


In [4]:
# 2. Tokenizer und Modell laden
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")



In [6]:
# 3. Datensatz vorbereiten (z.B. tiny-shakespeare.txt)
# Diese Funktion lädt den Text und wandelt ihn in Token um
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = load_dataset("C:/Users/hshakademie9/Desktop/Projekt_Hussam/GenerativeAI-Project/data/shakespeare_train.txt", tokenizer)
val_dataset = load_dataset("C:/Users/hshakademie9/Desktop/Projekt_Hussam/GenerativeAI-Project/data/shakespeare_val.txt", tokenizer)


# Trainings- und Validierungsdatensätze laden
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)



In [7]:
# Callback für das Logging von eval_loss
class LogEvalLossCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        eval_loss = metrics.get("eval_loss")
        if eval_loss is not None:
            wandb.log({"eval_loss": eval_loss})  # Loggt eval_loss in Wandb

# Trainingseinstellungen festlegen
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",  # Ausgabeordner
    overwrite_output_dir=True,  # Überschreibt den Ausgabeordner
    num_train_epochs=4,  # Reduzierte Anzahl an Epochen für kleines Dataset
    per_device_train_batch_size=4,  # Batch-Größe für Training
    per_device_eval_batch_size=4,  # Batch-Größe für Evaluation
    logging_steps=10,  # Wie oft Logs geschrieben werden
    save_steps=500,  # Speichert das Modell alle 500 Schritte
    eval_steps=20,  # Evaluation alle 20 Schritte
    logging_dir="./logs",  # Speicherort der Logs
    do_eval=True,  # Evaluation während des Trainings aktivieren
    report_to="wandb",   # Logging an Weights & Biases senden
    learning_rate=1e-5,  
)


In [8]:
# 6. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[LogEvalLossCallback]  # Callbacks zum Loggen von eval_loss
)


  trainer = Trainer(


In [9]:
trainer.train()  # Modell trainieren


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,4.5756
20,4.4233
30,4.3397


TrainOutput(global_step=36, training_loss=4.4164808591206866, metrics={'train_runtime': 135.8349, 'train_samples_per_second': 1.001, 'train_steps_per_second': 0.265, 'total_flos': 8883929088000.0, 'train_loss': 4.4164808591206866, 'epoch': 4.0})

In [10]:
# Auswertung des Modells nach dem Training
eval_results = trainer.evaluate()
wandb.log({"eval_loss": eval_results["eval_loss"]})  # Loggt den eval_loss explizit
print(eval_results)  # Gibt die Evaluierungs-Ergebnisse aus


{'eval_loss': 4.199766159057617, 'eval_runtime': 5.6919, 'eval_samples_per_second': 3.162, 'eval_steps_per_second': 0.878, 'epoch': 4.0}


In [11]:
# 8. Modell und Tokenizer im Hugging Face Hub speichern
trainer.push_to_hub()  # Speichert das Modell im Hugging Face Hub
tokenizer.push_to_hub("gpt2-finetuned")  # Speichert den Tokenizer im Hugging Face Hub


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/altkachenko11/gpt2-finetuned/commit/ba6886d43cfbf00aab8eff4a6a4d715894796614', commit_message='Upload tokenizer', commit_description='', oid='ba6886d43cfbf00aab8eff4a6a4d715894796614', pr_url=None, repo_url=RepoUrl('https://huggingface.co/altkachenko11/gpt2-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='altkachenko11/gpt2-finetuned'), pr_revision=None, pr_num=None)

In [13]:
# Modell vom Hugging Face Hub laden
model_name = "altkachenko11/gpt2-finetuned"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()  # Setzt das Modell in den Evaluierungsmodus


tokenizer_config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/506k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/494 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/918 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [14]:
# Setze den pad_token_id (wird auf eos_token_id gesetzt, da GPT2 standardmäßig kein pad_token hat)
model.config.pad_token_id = model.config.eos_token_id

# Prompt definieren
prompt = "Once upon a time"
input_ids = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).input_ids

# Attention mask erstellen (für korrektes Arbeiten mit Padding)
attention_mask = (input_ids != model.config.pad_token_id).long()  # Konvertiert in Typ long

# Text generieren
with torch.no_grad():  # Keine Gradientenberechnung während der Generierung
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,  # Übergibt die Attention-Maske
        max_length=50,  # Maximale Länge des generierten Texts
        num_return_sequences=1,  # Anzahl der generierten Sequenzen
        do_sample=True,  # Aktiviert Sampling für zufällige Textgenerierung
        top_k=50,  # Beschränkt die Anzahl der Kandidaten pro Schritt auf die Top-k
        top_p=0.95,  # Beschränkt die Auswahl auf die Top-p (nucleus sampling)
        temperature=0.9  # Steuert die Zufälligkeit der Generation (niedrigerer Wert -> deterministischere Ausgabe)
    )

# Ausgabe dekodieren
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)  # Gibt den generierten Text aus

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time the human race grew up.

A hundred years ago, this planet was known only as Atlantis. It was a vast, fertile land that was the center of civilization and trade. A hundred years ago, it would be called
