## GenAI Projekt

In [None]:
# initializierung
!pip install transformers datasets wandb

## Importe + Anmeldung bei wandb.ai

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import wandb
# Log in to Weights & Biases up front so the later wandb.init() calls can start runs.
wandb.login()

## STEP 1: Embedding + Positional Encoding

In [None]:
class TokenAndPositionalEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned absolute position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding width.
        max_len: Number of rows in the position table, i.e. the longest
            sequence this module can embed.
    """

    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)

    def forward(self, x):
        """Embed a batch of token ids.

        Args:
            x: Integer tensor indexed as (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, d_model).

        Raises:
            ValueError: If seq_len exceeds the size of the position table.
        """
        seq_len = x.size(1)
        max_positions = self.pos_embed.num_embeddings
        if seq_len > max_positions:
            # Fail early with a readable message instead of an out-of-range
            # embedding lookup deep inside the forward pass.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_positions}"
            )

        # One position vector per time step; broadcasting adds it to every
        # sequence in the batch, so no per-batch copy of the indices is needed.
        positions = torch.arange(seq_len, device=x.device)
        return self.token_embed(x) + self.pos_embed(positions)

## STEP 2: TransformerDecoderLayer (PyTorch)

In [None]:
from torch.nn import TransformerDecoderLayer

# Width of the model; passed as `d_model` to the decoder layer.
d_model = 256
# Number of attention heads; passed as `nhead`.
n_heads = 4
# Hidden size of the feed-forward sub-layer; passed as `dim_feedforward`.
d_ff = 512
dropout = 0.1

# Stand-alone example layer. NOTE(review): NanoTransformer builds its own
# TransformerDecoderLayer in __init__, so this instance is only consumed by the
# `transformer_decoder` demo in the next cell.
decoder_layer = TransformerDecoderLayer(
    d_model=d_model,
    nhead=n_heads,
    dim_feedforward=d_ff,
    dropout=dropout,
    batch_first=True
)

## STEP 3 — TransformerDecoder (Layers)

In [None]:
from torch.nn import TransformerDecoder

# Stack `num_layers` copies of the example `decoder_layer` defined above.
num_layers = 4  # number of stacked decoder layers
transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_layers)

## STEP 4 - Assembling the NanoTransformer (Decoder-Only)

In [None]:
class NanoTransformer(nn.Module):
    """Small autoregressive language model built on ``nn.TransformerDecoder``.

    The decoder stack is used in a decoder-only fashion: the embedded input is
    passed both as ``tgt`` and as ``memory``. Because the cross-attention
    sub-layer therefore looks at the very same sequence, it must be causally
    masked as well; otherwise each position can read future tokens through
    ``memory`` and next-token prediction becomes trivial.

    Args:
        vocab_size: Size of the token vocabulary (input ids and output logits).
        d_model: Model / embedding width.
        n_heads: Number of attention heads per layer.
        d_ff: Hidden size of the feed-forward sub-layer.
        max_len: Maximum sequence length supported by the position embedding.
        num_layers: Number of stacked decoder layers.
        dropout: Dropout probability used inside each decoder layer.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers, dropout=0.1):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)

        decoder_layer = TransformerDecoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers)

        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: Integer tensor of token ids indexed as (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, vocab_size) with unnormalised logits.
        """
        embedding = self.embed(x)

        # Boolean mask, True above the diagonal = "may not attend".
        # Built directly on the input's device to avoid a host-to-device copy.
        T = x.size(1)
        causal_mask = torch.ones(T, T, device=x.device).triu(diagonal=1).bool()

        # The same mask is applied to self-attention (tgt_mask) and to
        # cross-attention (memory_mask): memory is the input sequence itself,
        # so leaving it unmasked would leak future tokens into every position.
        decoder_output = self.transformer_decoder(
            embedding,
            embedding,
            tgt_mask=causal_mask,
            memory_mask=causal_mask,
        )
        return self.output_proj(decoder_output)

## Step 5 —  DataLoader (HuggingFace - GPT2 Tokenizer)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# padding="max_length" needs a pad token; reuse EOS for it.
tokenizer.pad_token = tokenizer.eos_token

# Wikitext-2 dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

max_len = 64
batch_size = 32

def has_text(example):
    """Return True for rows whose text is not empty or whitespace-only."""
    return bool(example["text"].strip())

def tokenize_function(examples):
    """Tokenize a batch of rows, truncating/padding every row to `max_len` ids."""
    return tokenizer(examples["text"], truncation=True, max_length=max_len, padding="max_length")

# Blank rows would turn into sequences that consist of padding only and carry
# no training signal, so they are dropped before tokenization.
train_data = dataset["train"].filter(has_text).map(tokenize_function, batched=True)
val_data = dataset["validation"].filter(has_text).map(tokenize_function, batched=True)

# Expose only the tensors the training loop consumes.
train_data.set_format(type="torch", columns=["input_ids", "attention_mask"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)

# ✅ 5. Sanity check: number of rows that actually feed the loaders
print(f"Train örnek sayısı: {len(train_data)}")
print(f"Validation örnek sayısı: {len(val_data)}")

## STEP 6 - Model Hyperparameters

In [None]:
import torch.optim as optim

# ✅ Hyperparameters
# NOTE(review): batch_size, max_len, d_model, n_heads, d_ff and num_layers are
# re-declared here with the same values used in earlier cells; keep them in sync.

epochs = 10
batch_size = 32
lr = 1e-4
# Vocabulary size is taken from the tokenizer loaded in the data cell.
vocab_size = tokenizer.vocab_size
d_model = 256
n_heads = 4
d_ff = 512
# Size of the positional table; also the max_length used when tokenizing.
max_len = 64
num_layers = 4

# ✅ Model
# Dropout is not passed, so NanoTransformer's default (0.1) applies.
model = NanoTransformer(
    vocab_size=vocab_size,
    d_model=d_model,
    n_heads=n_heads,
    d_ff=d_ff,
    max_len=max_len,
    num_layers=num_layers
)

## STEP 7 - wandb.ai Initialisierung

In [None]:
import wandb

# Start the tracking run and record the hyperparameters defined in the previous cell.
wandb.init(
    project="nano-transformer-rebuild",  # Project name
    name="nano_transformer_run1",         # Run name
    config={
        "epochs": epochs,
        "batch_size": batch_size,
        "learning_rate": lr,
        "d_model": d_model,
        "n_heads": n_heads,
        "d_ff": d_ff,
        "max_len": max_len,
        "num_layers": num_layers,
        "vocab_size": vocab_size,
        "dataset": "wikitext-2"
    }
)

## STEP 8 - Evaluation und Training Loop + wandb logging

In [None]:
from tqdm.notebook import tqdm
import torch.nn as nn
import torch.optim as optim


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer. Targets set to IGNORE_INDEX are excluded from the loss.
IGNORE_INDEX = -100
criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
optimizer = optim.Adam(model.parameters(), lr=lr)


def next_token_loss(batch):
    """Cross-entropy of predicting token t+1 from the tokens up to t.

    Args:
        batch: Mapping with "input_ids" and "attention_mask" tensors, both
            indexed as (batch, seq_len), as produced by the data loaders.

    Returns:
        Scalar loss tensor, or None when the batch holds no real
        (non-padding) target token.
    """
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    # Shift by one: the logits at position t are scored against token t+1.
    # Without the shift the model is only asked to copy its own input.
    targets = input_ids[:, 1:].clone()
    # Padding positions must not contribute to the loss.
    targets[attention_mask[:, 1:] == 0] = IGNORE_INDEX
    if not (targets != IGNORE_INDEX).any():
        # A mean over zero targets would be NaN; let the caller skip the batch.
        return None

    logits = model(input_ids)[:, :-1, :]  # [batch, seq_len - 1, vocab_size]
    # reshape (not view): the sliced tensors are no longer contiguous.
    return criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))


# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    train_batches = 0

    train_loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]", leave=True)

    # Iterate over the tqdm wrapper so the progress bar actually advances.
    for batch in train_loop:
        loss = next_token_loss(batch)
        if loss is None:
            continue

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_batches += 1

    avg_train_loss = total_loss / max(train_batches, 1)

    # 🧠 Validation
    model.eval()
    val_loss = 0.0
    val_batches = 0
    with torch.no_grad():
        for batch in val_loader:
            loss = next_token_loss(batch)
            if loss is None:
                continue
            val_loss += loss.item()
            val_batches += 1

    avg_val_loss = val_loss / max(val_batches, 1)

    # 🖥️ Print and WandB log
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    wandb.log({
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "epoch": epoch + 1
    })

# finish wandb
wandb.finish()

## STEP 9 — Text Generation

In [None]:
def generate(model, start_token, max_len=50, temperature=0.7, top_k=50, device=device):
    """Autoregressively sample tokens from `model`.

    Args:
        model: Callable module mapping token ids (batch, seq) to logits
            (batch, seq, vocab).
        start_token: Integer tensor of prompt ids indexed as (batch, seq).
        max_len: Number of new tokens to sample.
        temperature: Divisor applied to the logits before sampling.
        top_k: If not None, sample only among the `top_k` highest-scoring
            tokens (clamped to the vocabulary size); if None, sample from the
            full distribution.
        device: Device the prompt is moved to.

    Returns:
        The prompt followed by the sampled tokens, squeezed and converted to
        a (nested) Python list.
    """
    model.eval()
    input_ids = start_token.to(device)

    # If the model exposes a learned position table (as NanoTransformer does
    # via `embed.pos_embed`), never feed more tokens than it has positions.
    pos_embed = getattr(getattr(model, "embed", None), "pos_embed", None)
    context_len = getattr(pos_embed, "num_embeddings", None)

    # Sampling needs no gradients; without no_grad every step would keep its
    # autograd graph alive and memory would grow with the sequence.
    with torch.no_grad():
        for _ in range(max_len):
            context = input_ids if context_len is None else input_ids[:, -context_len:]
            logits = model(context)
            next_token_logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # torch.topk raises if k exceeds the vocabulary size.
                k = min(top_k, next_token_logits.size(-1))
                values, indices = torch.topk(next_token_logits, k)  # top-k candidates
                probs = torch.softmax(values, dim=-1)                # renormalise over them
                # Map the sampled rank back to the real vocabulary id.
                next_token = indices.gather(-1, torch.multinomial(probs, num_samples=1))
            else:
                probs = torch.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_token], dim=1)

    return input_ids.squeeze().tolist()

In [None]:
# Prompt used to seed generation.
start_text = "The meaning of life is"
# Encode the prompt to a tensor of token ids and move it to the training device.
start_token = tokenizer.encode(start_text, return_tensors="pt").to(device)

# Sample 50 new tokens, restricted to the 30 most likely candidates at each step.
generated_tokens = generate(model, start_token, max_len=50, temperature=0.7, top_k=30, device=device)

# Turn the ids back into text, dropping special tokens.
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(generated_text)

In [None]:
!pip install -q huggingface_hub


In [None]:
import os
import torch

save_path = "./nano-transformer"

# Make sure the target directory exists before anything is written into it.
os.makedirs(save_path, exist_ok=True)

# Persist the trained weights (state dict only, not the module object).
torch.save(
    model.state_dict(),
    os.path.join(save_path, "pytorch_model.bin"),
)

# Hyperparameters required to rebuild the architecture when loading the weights.
config = dict(
    vocab_size=vocab_size,
    d_model=d_model,
    n_heads=n_heads,
    d_ff=d_ff,
    max_len=max_len,
    num_layers=num_layers,
)

import json
with open(os.path.join(save_path, "config.json"), "w") as f:
    f.write(json.dumps(config))

# Store the tokenizer files next to the weights.
tokenizer.save_pretrained(save_path)

print(f"✅ Modellgewichte und Tokenizer wurden erfolgreich im Ordner {save_path} gespeichert.")

## Vordefiniertes Modell: Fine-Tuning von tiny-gpt2

In [None]:
# 📦 Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import wandb  

# ✅ 1. Load the tiny model and its tokenizer
# NOTE(review): this rebinds `tokenizer`, `model` and (below) `dataset`, which
# earlier cells use for the NanoTransformer; re-running those cells afterwards
# will pick up these objects instead.
tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

# ✅ 2. Prepare the dataset (training split only)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Join all rows into one long string
text = "\n\n".join(dataset["text"])

# Tokenize the whole corpus in one pass (no truncation) into a 1-D tensor of ids
tokens = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(tokens)

# Dataset Klasse
class TextDataset(Dataset):
    """Sliding-window view over a flat token sequence.

    Every item is a window of ``seq_len + 1`` consecutive tokens, so a consumer
    can derive inputs and next-token targets from the same window.

    Args:
        tokens: 1-D indexable sequence of token ids (e.g. a tensor).
        seq_len: Number of input positions per window; windows hold
            ``seq_len + 1`` tokens.
        stride: Distance between the starts of consecutive windows. The
            default of 1 yields every possible window.

    Raises:
        ValueError: If ``stride`` is smaller than 1.
    """

    def __init__(self, tokens, seq_len, stride=1):
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        self.tokens = tokens
        self.seq_len = seq_len
        self.stride = stride

    def __len__(self):
        """Number of complete windows; 0 when the sequence is too short."""
        last_start = len(self.tokens) - self.seq_len - 1
        if last_start < 0:
            # A negative length would make len() itself raise.
            return 0
        return last_start // self.stride + 1

    def __getitem__(self, idx):
        """Return window number `idx`.

        Raises:
            IndexError: If `idx` is out of range. This also lets plain
                iteration over the dataset terminate instead of yielding
                truncated or empty windows forever.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} out of range for dataset of size {size}")
        start = idx * self.stride
        return self.tokens[start:start + self.seq_len + 1]

# Parameters
seq_len = 64
batch_size = 32  # the tiny model leaves room for a larger batch size

# Training loader over the training-split tokens
train_dataset = TextDataset(tokens, seq_len)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation must use held-out text: building it from the training tokens
# would only re-measure the training loss. Tokenize the validation split the
# same way the training split was tokenized.
eval_split = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")
eval_tokens = torch.tensor(tokenizer.encode("\n\n".join(eval_split["text"]), truncation=False))

eval_dataset = TextDataset(eval_tokens, seq_len)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("✅ DataLoader bereit!")

# ✅ 3. Evaluation Funktion
def evaluate(model, dataloader, device):
    """Average language-modelling loss of `model` over `dataloader`.

    Args:
        model: Causal LM that accepts ``input_ids`` and ``labels`` keyword
            arguments and returns an object with a ``loss`` attribute.
        dataloader: Sized iterable of token-id tensors indexed as
            (batch, window).
        device: Device the batches are moved to.

    Returns:
        Mean of the per-batch losses as a Python float.
    """
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)

            # Hugging Face causal-LM heads shift the labels internally, so the
            # labels must be the unshifted input ids. Passing pre-shifted
            # targets would score each position against the token two steps
            # ahead instead of the next token.
            outputs = model(input_ids=batch, labels=batch)
            total_loss += outputs.loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

# ✅ 4. Training Funktion
def train(model, train_dataloader, eval_dataloader, device, epochs=5, lr=1e-3):
    """Fine-tune a causal LM and log per-epoch losses to Weights & Biases.

    Starts a wandb run, trains with AdamW for `epochs` epochs, evaluates after
    every epoch via `evaluate`, and finally writes the state dict to
    "fine_tuned_tiny_gpt2_wikitext.pth". The wandb run is left open; the
    caller is expected to call ``wandb.finish()``.

    Args:
        model: Causal LM that accepts ``input_ids`` and ``labels`` keyword
            arguments, returns an object with a ``loss`` attribute and exposes
            ``config.vocab_size``.
        train_dataloader: Loader of token-id windows indexed as
            (batch, window); its dataset must expose ``seq_len``.
        eval_dataloader: Loader passed to `evaluate` after each epoch.
        device: Device used for the model and the batches.
        epochs: Number of passes over `train_dataloader`.
        lr: AdamW learning rate.
    """
    wandb.init(mode="online", project="tiny-gpt2-finetune-wikitext", config={
        "epochs": epochs,
        "lr": lr,
        "batch_size": train_dataloader.batch_size,
        "seq_len": train_dataloader.dataset.seq_len,
        "vocab_size": model.config.vocab_size,
        "dataset": "wikitext-2-raw-v1",
        "model_name": "sshleifer/tiny-gpt2"
    })

    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
            batch = batch.to(device)

            # Hugging Face causal-LM heads shift the labels internally, so the
            # labels must be the unshifted input ids. Passing pre-shifted
            # targets would train the model to predict the token two steps
            # ahead instead of the next token.
            outputs = model(input_ids=batch, labels=batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}: 🏋️‍♂️ Train Loss = {avg_train_loss:.4f}")

        avg_eval_loss = evaluate(model, eval_dataloader, device)
        print(f"Epoch {epoch+1}: 🧠 Eval Loss = {avg_eval_loss:.4f}")

        wandb.log({
            "epoch": epoch+1,
            "train_loss": avg_train_loss,
            "eval_loss": avg_eval_loss
        })

    torch.save(model.state_dict(), "fine_tuned_tiny_gpt2_wikitext.pth")
    print("✅ Modell gespeichert!")

# ✅ 5. Start training
# Overrides train()'s default of 5 epochs with 3; lr is passed explicitly as 1e-3.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, device, epochs=3, lr=1e-3)

# Close the wandb run that train() opened
wandb.finish()


# Bonus Aufgabe - Text Generation with GPT-2

## Step 1: Hugging Face und Weights & Biases

## Step 3: Dataset Herunterladen und Tokenisation

## Step 4: Trainer und TrainingArguments