# GenerativeAI "Sprachmodell" Projekt

## Verbindung mit wandb

In [6]:
import wandb
# Log in to Weights & Biases; the recorded cell output shows the call
# returning True for an already-authenticated account.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33madel-haj-jumah[0m ([33madel-haj-jumah-hochschule-hannover[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

1. Datensatz importieren

In [5]:
from datasets import load_dataset

# Request only the first 5% of the "train" split of the wikitext-103-v1 config.
ds = load_dataset("wikitext", "wikitext-103-v1", split="train[:5%]")
# Join every entry of the 'text' column into one newline-separated string.
text = "\n".join(ds['text'])


2. Importieren der notwendigen Bibliotheken

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
import wandb
from torch.utils.data import DataLoader, Dataset
from pathlib import Path
import random

3. Token And Positional Embedding

In [3]:

class TokenAndPositionalEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned absolute positional embedding.

    Args:
        vocab_size: Number of distinct token ids accepted by the token table.
        d_model: Size of each embedding vector.
        max_len: Longest sequence length supported; the positional table is
            allocated with shape ``(1, max_len, d_model)``.
    """

    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        self.max_len = max_len
        # Maps token ids to vectors in the model's embedding space.
        self.token_embed = nn.Embedding(vocab_size, d_model)
        # Learnable positional table: one vector per absolute position.
        # Sized from ``max_len`` (previously hard-coded to 512, which silently
        # ignored the constructor argument). The leading 1 broadcasts over batch.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, d_model))

    def forward(self, x):
        """Embed a batch of token ids.

        Args:
            x: Integer tensor of token ids; dim 0 is the batch, dim 1 the
                sequence position.

        Returns:
            Tensor with a trailing ``d_model`` dimension: token embedding plus
            the positional embedding for each position.

        Raises:
            ValueError: If the sequence is longer than ``max_len``.
        """
        seq_len = x.size(1)  # dim 1 is the sequence length (dim 0 is the batch)
        if seq_len > self.max_len:
            # Without this check the slice below would come back shorter than
            # seq_len and the addition would fail with an opaque broadcast error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        token_emb = self.token_embed(x)
        pos_emb = self.pos_embedding[:, :seq_len, :]
        return token_emb + pos_emb

4. Masked Multi-Head Self-Attention


In [None]:

class MaskedSelfAttention(nn.Module):
    """Masked (causal) multi-head self-attention.

    Each position may attend only to itself and to earlier positions.
    Implemented on top of ``torch.nn.MultiheadAttention``.

    Args:
        d_model: Embedding dimension of the input and output.
        n_heads: Number of attention heads.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            batch_first=True,  # required so inputs are laid out as (B, T, C)
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor laid out as (B, T, C) with C == d_model.

        Returns:
            Attention output with the same layout as ``x``.
        """
        T = x.size(1)  # sequence length

        # Boolean causal mask built directly on x's device: entry (i, j) is
        # True (= masked out) when key j lies after query i. Equivalent to the
        # strict upper triangle, without a float CPU tensor and a
        # host-to-device copy on every call.
        positions = torch.arange(T, device=x.device)
        mask = positions.unsqueeze(0) > positions.unsqueeze(1)

        # The averaged attention weights are never used, so skip computing them.
        out, _ = self.attn(x, x, x, attn_mask=mask, need_weights=False)
        return out


5. Add & Norm

In [8]:
class AddNorm(nn.Module):
    """Residual addition followed by layer normalisation.

    Args:
        d_model: Size of the last dimension that ``LayerNorm`` normalises over.
    """

    def __init__(self, d_model):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, sublayer_output):
        """Return ``LayerNorm(x + sublayer_output)``.

        Args:
            x: Input that was fed to the sub-layer (the residual branch).
            sublayer_output: What the sub-layer produced for ``x``.
        """
        residual_sum = x + sublayer_output
        normalised = self.norm(residual_sum)
        return normalised

6. Feedforward Layer

In [9]:
class FeedForward(nn.Module):
    """Two-layer MLP: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model).

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Built in the same order as they sit in the Sequential, so parameter
        # initialisation draws from the RNG in that order.
        expand = nn.Linear(d_model, d_ff)
        activation = nn.ReLU()
        contract = nn.Linear(d_ff, d_model)
        self.net = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the MLP and return the result."""
        projected = self.net(x)
        return projected

7. Alles zusammenfügen: Transformer-Decoderblock

In [10]:
class TransformerBlock(nn.Module):
    """One decoder block: masked self-attention, then a feed-forward network.

    Each sub-layer's output is added to its input and layer-normalised
    via ``AddNorm``.

    Args:
        d_model: Feature size used throughout the block.
        n_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward network.
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        # Attention sub-layer and its residual + norm wrapper.
        self.attn = MaskedSelfAttention(d_model, n_heads)
        self.addnorm1 = AddNorm(d_model)
        # Feed-forward sub-layer and its residual + norm wrapper.
        self.ff = FeedForward(d_model, d_ff)
        self.addnorm2 = AddNorm(d_model)

    def forward(self, x):
        """Apply attention then feed-forward, each with residual + LayerNorm."""
        attended = self.attn(x)
        after_attention = self.addnorm1(x, attended)
        transformed = self.ff(after_attention)
        return self.addnorm2(after_attention, transformed)


8. Zusammenbau des NanoTransformers (nur Decoder)

In [11]:

class NanoTransformer(nn.Module):
    """Decoder-only transformer: embedding, N blocks, final norm, vocab projection.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        d_model: Feature size used throughout the model.
        n_heads: Number of attention heads per block.
        d_ff: Hidden size of each block's feed-forward network.
        max_len: Passed to the embedding layer as its maximum sequence length.
        num_layers: Number of stacked ``TransformerBlock`` instances.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)

        # Stack the decoder blocks one by one, first to last.
        self.blocks = nn.ModuleList()
        for _ in range(num_layers):
            self.blocks.append(TransformerBlock(d_model, n_heads, d_ff))

        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids to logits with a trailing ``vocab_size`` dimension."""
        hidden = self.embed(x)
        for layer in self.blocks:
            hidden = layer(hidden)
        logits = self.output_proj(self.norm(hidden))
        return logits