# GenerativeAI "Sprachmodell" Projekt

## Verbindung mit wandb

In [1]:
import wandb
# Authenticate this session with Weights & Biases before any run is started.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33madel-haj-jumah[0m ([33madel-haj-jumah-hochschule-hannover[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

1. Importieren der notwendigen Bibliotheken

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from pathlib import Path
import random

2. Token And Positional Embedding

In [3]:

class TokenAndPositionalEmbedding(nn.Module):
    """Token embedding plus a learned absolute positional embedding.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Width of the embedding vectors.
        max_len: Longest sequence the module accepts; it sizes the
            positional table.

    Raises:
        ValueError: In ``forward`` when the input sequence is longer than
            ``max_len``.
    """

    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        # Maps each token id to a d_model-dimensional vector.
        self.token_embed = nn.Embedding(vocab_size, d_model)
        # Learnable table with one vector per absolute position. The leading
        # dimension of 1 broadcasts over the batch. The table is sized by
        # max_len rather than a hard-coded constant, so the argument is honoured.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, d_model))

    def forward(self, x):
        seq_len = x.size(1)  # dim 0 is the batch, dim 1 the sequence length
        max_len = self.pos_embedding.size(1)
        if seq_len > max_len:
            # Without this check the slice below would silently be too short
            # and the addition would fail with an opaque broadcasting error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        token_emb = self.token_embed(x)
        pos_emb = self.pos_embedding[:, :seq_len, :]
        return token_emb + pos_emb

3. Masked Multi-Head Self-Attention


In [4]:

class MaskedSelfAttention(nn.Module):
    """Masked (causal) multi-head self-attention.

    Each position may attend only to itself and to earlier positions.
    Implemented as a thin wrapper around ``nn.MultiheadAttention``.

    Args:
        d_model: Embedding width of the input and output.
        n_heads: Number of attention heads.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            batch_first=True,  # inputs are laid out as (B, T, C)
        )

    def forward(self, x):
        T = x.size(1)  # sequence length

        # Causal mask: True strictly above the diagonal marks the future
        # positions that must not be attended to. It is allocated directly on
        # the input's device to avoid a CPU allocation plus a host-to-device
        # copy on every forward pass.
        mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()

        # The attention weights are discarded, so skip computing them.
        out, _ = self.attn(x, x, x, attn_mask=mask, need_weights=False)
        return out


4. Add & Norm

In [5]:
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization.

    ``forward(x, sublayer_output)`` returns ``LayerNorm(x + sublayer_output)``.
    """

    def __init__(self, d_model):
        super().__init__()
        self.norm = nn.LayerNorm(normalized_shape=d_model)

    def forward(self, x, sublayer_output):
        # Add the sub-layer result back onto its input, then normalize.
        residual_sum = torch.add(x, sublayer_output)
        return self.norm(residual_sum)

5. Feedforward Layer

In [6]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    Args:
        d_model: Input and output width.
        d_ff: Width of the hidden layer.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # The layers are created in the order they are applied, so parameter
        # names stay net.0 and net.2.
        expand = nn.Linear(in_features=d_model, out_features=d_ff)
        project = nn.Linear(in_features=d_ff, out_features=d_model)
        self.net = nn.Sequential(expand, nn.ReLU(), project)

    def forward(self, x):
        transformed = self.net(x)
        return transformed

6. Alles zusammenfügen: Transformer-Decoderblock

In [7]:
class TransformerBlock(nn.Module):
    """One decoder block: masked self-attention, then a feed-forward network.

    Each sub-layer is followed by a residual add and layer normalization.

    Args:
        d_model: Embedding width.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        # Sub-layer 1: masked self-attention and its Add & Norm.
        self.attn = MaskedSelfAttention(d_model, n_heads)
        self.addnorm1 = AddNorm(d_model)
        # Sub-layer 2: feed-forward network and its Add & Norm.
        self.ff = FeedForward(d_model, d_ff)
        self.addnorm2 = AddNorm(d_model)

    def forward(self, x):
        attended = self.attn(x)
        hidden = self.addnorm1(x, attended)
        transformed = self.ff(hidden)
        return self.addnorm2(hidden, transformed)


7. Zusammenbau des NanoTransformers (nur Decoder)

In [8]:

class NanoTransformer(nn.Module):
    """Decoder-only transformer language model.

    Pipeline: token + positional embedding -> ``num_layers`` transformer
    blocks -> final layer norm -> linear projection to vocabulary logits.

    Args:
        vocab_size: Size of the vocabulary (also the logit dimension).
        d_model: Embedding width.
        n_heads: Attention heads per block.
        d_ff: Hidden width of each block's feed-forward network.
        max_len: Forwarded to the embedding layer.
        num_layers: Number of stacked transformer blocks.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)
        stacked = []
        for _ in range(num_layers):
            stacked.append(TransformerBlock(d_model, n_heads, d_ff))
        self.blocks = nn.ModuleList(stacked)
        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        hidden = self.embed(x)
        for layer in self.blocks:
            hidden = layer(hidden)
        # Normalize once more, then project every position onto the vocabulary.
        return self.output_proj(self.norm(hidden))

8. Trainieren des Modells

In [17]:

import torch.optim as optim
from tqdm import tqdm


def train(model, dataloader, vocab_size, device, epochs=50, lr=1e-4):
    """Train ``model`` on next-token prediction and log the epoch loss to wandb.

    Every batch is split into inputs (all tokens but the last) and targets
    (all tokens but the first). After the final epoch the model's state dict
    is written to ``nano_transformer_trained.pth``.

    Args:
        model: Module mapping a batch of token ids to per-position logits.
        dataloader: Yields 2-D batches of token ids. Its ``batch_size`` and
            ``dataset.seq_len`` are recorded in the wandb config.
        vocab_size: Size of the logit dimension.
        device: Device on which the model and batches are placed.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate for Adam.

    Raises:
        ValueError: If ``dataloader`` contains no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # averaging the loss.
        raise ValueError("dataloader is empty; nothing to train on")

    # Start the wandb run once, before training begins.
    wandb.init(project="nano-transformer", config={
        "epochs": epochs,
        "lr": lr,
        "batch_size": dataloader.batch_size,
        "seq_len": dataloader.dataset.seq_len,
        "vocab_size": vocab_size
    })

    try:
        # Move the model to the target device (GPU or CPU).
        model = model.to(device)

        optimizer = optim.Adam(model.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()

        model.train()
        for epoch in range(epochs):
            total_loss = 0.0
            for batch in tqdm(dataloader, desc="Training Progress"):
                # Shift by one token: predict token t+1 from tokens <= t.
                inputs = batch[:, :-1].to(device)
                targets = batch[:, 1:].to(device)

                logits = model(inputs)

                # Flatten to (B*T, vocab) and (B*T,) for CrossEntropyLoss.
                # reshape() is required here. The target slice is
                # non-contiguous, and when the batch is already on `device`
                # .to() returns it unchanged, so view(-1) would raise.
                loss = loss_fn(
                    logits.reshape(-1, vocab_size),
                    targets.reshape(-1),
                )

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            # Mean loss over the batches of this epoch.
            avg_loss = total_loss / num_batches
            print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")

            wandb.log({"epoch": epoch+1, "loss": avg_loss})

        # Persist the trained weights.
        torch.save(model.state_dict(), "nano_transformer_trained.pth")
    except BaseException:
        # Also covers KeyboardInterrupt: close the run as failed so it does
        # not stay open after an aborted cell.
        wandb.finish(exit_code=1)
        raise
    wandb.finish()




9. Dataset Integration

In [15]:


from datasets import load_dataset



# 1. Load the GPT-2 tokenizer and reuse its EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# 2. Load the Tiny Shakespeare dataset; the train split holds the text as one string.
# NOTE(review): trust_remote_code=True runs the dataset's loading script from
# the Hub on this machine. Keep it only for sources you trust.
# The name `raw_dataset` keeps the Hugging Face object distinct from the
# TextDataset that is assigned to `dataset` further down.
raw_dataset = load_dataset("tiny_shakespeare", split="train", trust_remote_code=True)
text = raw_dataset['text'][0]

# 3. Convert the text to token ids. The tokenizer warns that the sequence is
# longer than the model maximum of 1024; that is expected here because the
# ids are only consumed in short windows afterwards.
token_ids = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(token_ids, dtype=torch.long)

# 4. Dataset Klasse
class TextDataset(Dataset):
    """Sliding-window dataset over a 1-D token sequence.

    Item ``i`` is ``tokens[i : i + seq_len + 1]``, i.e. ``seq_len + 1``
    consecutive tokens, so a consumer can split it into ``seq_len`` inputs and
    ``seq_len`` shifted targets.

    Args:
        tokens: 1-D sequence of token ids that supports ``len`` and slicing.
        seq_len: Number of input tokens per window.
    """

    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a sequence shorter than seq_len yields no windows,
        # and len() must never be negative.
        return max(0, len(self.tokens) - self.seq_len)

    def __getitem__(self, idx):
        # Slicing never raises, so without this check an out-of-range index
        # would return a truncated or empty window and plain iteration over
        # the dataset would never stop.
        if not 0 <= idx < len(self):
            raise IndexError(f"index {idx} out of range for dataset of length {len(self)}")
        return self.tokens[idx:idx+self.seq_len+1]

# 5. Build the DataLoader: shuffled windows of seq_len + 1 tokens each.
seq_len = 64
batch_size = 32
dataset = TextDataset(tokens, seq_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print("DataLoader bereit!")

# 6. Model hyperparameters (mini NanoTransformer).
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 1
d_ff = 256
# Training feeds the model batch[:, :-1], i.e. exactly seq_len tokens, so the
# maximum length is derived from seq_len instead of repeating the literal.
max_len = seq_len
num_layers = 1

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)


Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors


DataLoader bereit!


In [16]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Start training; epochs and lr override the defaults of train().
train(model, dataloader, vocab_size, device, epochs=100, lr=3e-4)



Training Progress: 100%|██████████| 9435/9435 [02:02<00:00, 76.73it/s]


Epoch 1: Loss = 4.9525


Training Progress:  70%|██████▉   | 6566/9435 [01:25<00:37, 76.53it/s]


KeyboardInterrupt: 