## Generative AI / Transformer Projekt

In [None]:
# initializierung
!pip install transformers datasets wandb

## Importe + wandb.ai-Anmeldung

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import wandb
wandb.login()

## STEP 1: Embedding + Positional Encoding


In [None]:
class TokenAndPositionalEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of both embedding tables.
        max_len: Number of rows in the position table, i.e. the longest
            sequence this module can embed.
    """

    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)

    def forward(self, x):
        """Embed token ids ``x`` whose axis 1 is the sequence axis.

        Returns:
            ``token_embed(x)`` plus the position embedding of indices
            ``0 .. x.size(1) - 1``, broadcast over axis 0.

        Raises:
            IndexError: If ``x.size(1)`` exceeds the position table size.
        """
        seq_len = x.size(1)
        table_size = self.pos_embed.num_embeddings
        if seq_len > table_size:
            # Fail early with a readable message; an out-of-range lookup on a
            # CUDA device would otherwise surface as a device-side assert.
            raise IndexError(
                f"sequence length {seq_len} exceeds max_len {table_size} "
                "of the positional embedding"
            )

        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        return self.token_embed(x) + self.pos_embed(positions)

## STEP 2: TransformerDecoderLayer (PyTorch)

In [None]:
from torch.nn import TransformerDecoderLayer

# Hyperparameters of a single decoder block.
d_model, n_heads = 128, 4  # model width / number of attention heads
d_ff = 512                 # hidden size of the feed-forward sub-layer
dropout = 0.1

# batch_first=True: tensors are laid out as (batch, seq, feature).
decoder_layer = TransformerDecoderLayer(
    d_model,
    n_heads,
    dim_feedforward=d_ff,
    dropout=dropout,
    batch_first=True,
)


## STEP 3 — TransformerDecoder (Layers)

In [None]:
from torch.nn import TransformerDecoder

# Depth of the stack: how many copies of the decoder layer are chained.
num_layers = 2
transformer_decoder = TransformerDecoder(
    decoder_layer=decoder_layer,
    num_layers=num_layers,
)


## STEP 4 - Assembling the NanoTransformer (Decoder-Only)

In [None]:
# Final Model

class NanoTransformer(nn.Module):
    """Small decoder-only language model built from PyTorch decoder layers.

    The input embedding is used both as the decoder target and as its
    ``memory``. Both attention paths are restricted by the same causal mask,
    so the output at position ``i`` depends only on positions ``<= i``.

    Args:
        vocab_size: Size of the token vocabulary (input and output).
        d_model: Model width.
        n_heads: Number of attention heads per layer.
        d_ff: Hidden size of the feed-forward sub-layers.
        max_len: Longest sequence the positional embedding supports.
        num_layers: Number of stacked decoder layers.
        dropout: Dropout probability inside the decoder layers.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers, dropout=0.1):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)

        decoder_layer = TransformerDecoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers)

        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return next-token logits for token ids ``x`` (sequence on axis 1).

        The result has one logit vector of size ``vocab_size`` per input
        position.
        """
        embedding = self.embed(x)

        # True above the diagonal = "position i may not look at position j > i".
        seq_len = x.size(1)
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )

        # The memory is the input sequence itself, so cross-attention needs the
        # same causal mask as self-attention; without memory_mask every
        # position could read the tokens that follow it.
        decoder_output = self.transformer_decoder(
            embedding,
            embedding,
            tgt_mask=causal_mask,
            memory_mask=causal_mask,
        )
        return self.output_proj(decoder_output)


## STEP 5 — DataLoader (HuggingFace - GPT2 Tokenizer)



In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader


tokenizer = AutoTokenizer.from_pretrained("gpt2")
# No padding token is set on this tokenizer, so reuse the end-of-text token.
tokenizer.pad_token = tokenizer.eos_token

# Tiny Shakespeare Dataset
dataset = load_dataset("tiny_shakespeare")

max_len = 64
batch_size = 32

def tokenize_function(examples):
    """Tokenize a batch of texts and cut the token stream into blocks.

    NOTE(review): per the dataset card, each split of tiny_shakespeare is a
    single long text row. Truncating every row to ``max_len`` would therefore
    keep only one 64-token example per split, so the whole token stream is
    chunked into consecutive ``max_len``-sized blocks instead. The tokenizer
    may warn that the text is longer than the pretrained model's maximum
    length; that is expected because the text is chunked afterwards.

    Returns:
        Dict with ``input_ids`` and ``attention_mask``; every entry has
        exactly ``max_len`` elements. A shorter final block is right-padded
        with the pad token and marked 0 in the attention mask.
    """
    encoded = tokenizer(examples["text"])["input_ids"]
    stream = [token_id for ids in encoded for token_id in ids]

    input_ids, attention_mask = [], []
    for start in range(0, len(stream), max_len):
        block = stream[start:start + max_len]
        n_real = len(block)
        if n_real < 2:
            # One token cannot form an (input, next-token) training pair.
            continue
        n_pad = max_len - n_real
        input_ids.append(block + [tokenizer.pad_token_id] * n_pad)
        attention_mask.append([1] * n_real + [0] * n_pad)

    return {"input_ids": input_ids, "attention_mask": attention_mask}

# The mapped function changes the number of rows, so the original columns
# must be dropped (their length would no longer match).
train_data = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
val_data = dataset["validation"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

train_data.set_format(type="torch", columns=["input_ids", "attention_mask"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)


## STEP 6 - Model Hyperparameters

In [None]:
# ✅ Hyperparameters

# --- optimisation ---
epochs = 50
batch_size = 32
lr = 1e-4

# --- architecture ---
vocab_size = tokenizer.vocab_size  # vocabulary size reported by the tokenizer
d_model = 128     # width of the embeddings and attention layers
n_heads = 4       # number of attention heads
d_ff = 512        # hidden size of the feed-forward layers
max_len = 64      # input sequence length
num_layers = 2    # number of transformer blocks

# ✅ Model
model = NanoTransformer(
    vocab_size=vocab_size, d_model=d_model, n_heads=n_heads,
    d_ff=d_ff, max_len=max_len, num_layers=num_layers,
)


## STEP 7 - wandb.ai Initialisierung

In [None]:
import wandb

# Start a Weights & Biases run and store the hyperparameters with it.
wandb.init(
    project="nano-transformer-rebuild",
    name="nano_transformer_run1",
    config=dict(
        epochs=epochs,
        batch_size=batch_size,
        learning_rate=lr,
        d_model=d_model,
        n_heads=n_heads,
        d_ff=d_ff,
        max_len=max_len,
        num_layers=num_layers,
        vocab_size=vocab_size,
        dataset="tiny_shakespeare",  # or wikitext-2
    ),
)


## STEP 8 - Evaluation und Training Loop + wandb logging


In [None]:
import torch.nn as nn
import torch.optim as optim


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Label value skipped by CrossEntropyLoss; used to blank out padding targets.
IGNORE_INDEX = -100

# Loss and optimizer. The loss is summed (not averaged) so that it can be
# normalised by the number of real target tokens.
criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX, reduction="sum")
optimizer = optim.Adam(model.parameters(), lr=lr)


def next_token_loss(batch):
    """Summed next-token cross-entropy for one batch.

    The logits at position ``t`` are scored against the token at ``t + 1``;
    scoring them against the token at ``t`` would only teach the model to
    copy its input. Targets whose ``attention_mask`` entry is 0 (padding) are
    excluded from the loss.

    Returns:
        ``(loss_sum, n_tokens)``: the summed loss as a tensor and the number
        of target tokens that contributed to it.
    """
    input_ids = batch["input_ids"].to(device)

    logits = model(input_ids)[:, :-1, :]   # drop last step: it has no target
    targets = input_ids[:, 1:].clone()     # shift left by one position

    attention_mask = batch.get("attention_mask")
    if attention_mask is not None:
        targets[attention_mask[:, 1:].to(device) == 0] = IGNORE_INDEX

    # reshape (not view): the sliced tensors are not contiguous.
    loss_sum = criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
    n_tokens = int((targets != IGNORE_INDEX).sum())
    return loss_sum, n_tokens


# train loop
for epoch in range(epochs):
    model.train()
    train_loss_sum, train_tokens = 0.0, 0

    for batch in train_loader:
        loss_sum, n_tokens = next_token_loss(batch)
        if n_tokens == 0:
            # Nothing to learn from a batch that consists only of padding.
            continue
        loss = loss_sum / n_tokens

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss_sum += loss_sum.item()
        train_tokens += n_tokens

    # Average per real target token over the whole epoch.
    avg_train_loss = train_loss_sum / max(train_tokens, 1)

    # 🧠 Validation
    model.eval()
    val_loss_sum, val_tokens = 0.0, 0
    with torch.no_grad():
        for batch in val_loader:
            loss_sum, n_tokens = next_token_loss(batch)
            val_loss_sum += loss_sum.item()
            val_tokens += n_tokens

    avg_val_loss = val_loss_sum / max(val_tokens, 1)

    # 🖥️ Print and WandB log
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    wandb.log({
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "epoch": epoch + 1
    })

# finish wandb
wandb.finish()


## STEP 9 — Text Generation


In [None]:
def generate(model, start_token, max_len=50, temperature=0.7, top_k=50, device="cpu", context_len=None):
    """Autoregressively sample ``max_len`` new tokens after ``start_token``.

    Args:
        model: Callable module mapping token ids to logits; the logits of the
            last position (``logits[:, -1, :]``) are used for sampling.
        start_token: Prompt token ids with the sequence on axis 1.
        max_len: Number of tokens to append to the prompt.
        temperature: Divisor applied to the logits before sampling.
        top_k: If not None, sample only among the ``top_k`` highest-scoring
            tokens; values larger than the vocabulary are clamped to it.
        device: Device the prompt is moved to.
        context_len: If given, only the last ``context_len`` tokens are fed
            to the model at each step (useful when the model has a fixed
            maximum input length). ``None`` feeds the full sequence.

    Returns:
        ``input_ids.squeeze().tolist()`` of the prompt plus sampled tokens
        (a flat list of ints for a single prompt).
    """
    model.eval()
    input_ids = start_token.to(device)

    # Sampling needs no gradients; without no_grad each step would keep an
    # autograd graph alive.
    with torch.no_grad():
        for _ in range(max_len):
            context = input_ids if context_len is None else input_ids[:, -context_len:]
            logits = model(context)
            next_token_logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # torch.topk raises if k exceeds the size of the last axis.
                k = min(top_k, next_token_logits.size(-1))
                values, indices = torch.topk(next_token_logits, k)
                probs = torch.softmax(values, dim=-1)
                # multinomial picks a slot among the k candidates; gather maps
                # that slot back to the vocabulary id.
                next_token = indices.gather(-1, torch.multinomial(probs, num_samples=1))
            else:
                probs = torch.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_token], dim=1)

    return input_ids.squeeze().tolist()


In [None]:
# Encode the prompt, sample a continuation, then decode it back to text.
start_text = "My love for thee"
start_token = tokenizer(start_text, return_tensors="pt")["input_ids"].to(device)

generated_tokens = generate(
    model,
    start_token,
    max_len=50,
    temperature=0.7,
    top_k=30,
    device=device,
)

generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
print(generated_text)


## Hugging Face Speichern

In [None]:
import os
import torch

import json

# Everything for this export is written into one local directory.
save_path = "./nano-transformer"
os.makedirs(save_path, exist_ok=True)  # no-op if the folder already exists

weights_file = os.path.join(save_path, "pytorch_model.bin")
config_file = os.path.join(save_path, "config.json")

# Model weights (state dict only)
torch.save(model.state_dict(), weights_file)

# Hyperparameters, stored by hand next to the weights (optional)
config = {
    "vocab_size": vocab_size,
    "d_model": d_model,
    "n_heads": n_heads,
    "d_ff": d_ff,
    "max_len": max_len,
    "num_layers": num_layers,
}
with open(config_file, "w") as f:
    json.dump(config, f)

# Tokenizer files go into the same folder
tokenizer.save_pretrained(save_path)

print(f"✅ Modellgewichte und Tokenizer wurden erfolgreich im Ordner {save_path} gespeichert.")


## README.md push