# GenerativeAI "Sprachmodell" Projekt

## Verbing mit wandb

In [1]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33madel-haj-jumah[0m ([33madel-haj-jumah-hochschule-hannover[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

1. Importieren die notwendigen Bibliotheken

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from pathlib import Path
import random
from datasets import load_dataset
import torch.optim as optim


2. Token And Positional Embedding

In [3]:

class TokenAndPositionalEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        # ID der Token in einen Vektorraum
        self.token_embed = nn.Embedding(vocab_size, d_model)
        # Positionale Einbettungen (lernen relative Positionen "das ist learnable")
        self.pos_embedding = nn.Parameter(torch.randn(1, 512, d_model)) # batch 1 , bis zu 512 token und vektor größe
    def forward(self, x):
        seq_len = x.size(1) # 1 ist Anzahl der Token"Sequenzlänge" (0 ist batch)
        token_emb = self.token_embed(x)
        pos_emb = self.pos_embedding[:, :seq_len, :]
        return token_emb + pos_emb

3. Masked Multi-Head Self-Attention


In [4]:

class MaskedSelfAttention(nn.Module):
    """
    Masked (Causal) Self-Attention Layer
    → Modell kann nur auf vergangene Tokens schauen
    → Verwendet PyTorch nn.MultiheadAttention
    """
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=d_model, 
            num_heads=n_heads, 
            batch_first=True  # wichtig! Damit x.shape = (B, T, C) funktioniert
        )

    def forward(self, x):
        T = x.size(1)  # Sequence Length

        # Causal Mask (obere Dreiecksmatrix)
        mask = torch.triu(torch.ones(T, T), diagonal=1).bool().to(x.device)

        # MultiheadAttention erwartet: (query, key, value, attn_mask)
        out, _ = self.attn(x, x, x, attn_mask=mask)
        return out


4. Add & Norm

In [5]:
class AddNorm(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, sublayer_output):
        return self.norm(x + sublayer_output)

5. Feedforward Layer

In [6]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

    def forward(self, x):
        return self.net(x)

6. Alles zusammenfügen: Transformator-Decoderblock

In [7]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.attn = MaskedSelfAttention(d_model, n_heads)  # PyTorch MultiheadAttention inside
        self.addnorm1 = AddNorm(d_model)
        self.ff = FeedForward(d_model, d_ff)
        self.addnorm2 = AddNorm(d_model)

    def forward(self, x):
        x = self.addnorm1(x, self.attn(x))  # Attention + Residual + LayerNorm
        x = self.addnorm2(x, self.ff(x))    # FeedForward + Residual + LayerNorm
        return x


7. Zusammenbau des NanoTransformers (nur Decoder)

In [8]:

class NanoTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff) for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        for block in self.blocks:
            x = block(x)
        x = self.norm(x)
        logits = self.output_proj(x)
        return logits

8. Trainieren des Modells

In [12]:

import torch.optim as optim
from tqdm import tqdm


def train(model, dataloader, vocab_size, device, epochs=10, lr=1e-4):
    # Wandb initialisieren, nur einmal zu Beginn des Trainings
    wandb.init(project="nano-transformer", config={
        "epochs": epochs,
        "lr": lr,
        "batch_size": dataloader.batch_size,
        "seq_len": dataloader.dataset.seq_len,
        "vocab_size": vocab_size
    })
    
    # Modell auf das richtige Gerät (GPU oder CPU) verschieben
    model = model.to(device)
    
    # Optimizer und Loss-Funktion
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    model.train()  # Modell in Trainingsmodus versetzen
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(dataloader, desc="Training Progress"):
            # Eingabe- und Ziel-Tensoren
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            # Vorwärtsdurchlauf
            logits = model(inputs)
            logits = logits.view(-1, vocab_size)  # Umformen für CrossEntropyLoss
            targets = targets.view(-1)  # Umformen für CrossEntropyLoss

            # Verlustberechnung
            loss = loss_fn(logits, targets)

            # Backpropagation und Optimierung
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Gesamten Verlust summieren
            total_loss += loss.item()

        # Durchschnittlichen Verlust für die Epoche berechnen
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")

        # Verlust an Wandb senden
        wandb.log({"epoch": epoch+1, "loss": avg_loss})
    
    # Modell speichern nach Training (optional)
    torch.save(model.state_dict(), "nano_transformer_trained.pth")




9. Dataset Integration

In [None]:


from datasets import load_dataset



# 1. Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# 2. Tiny Shakespeare Dataset laden
dataset = load_dataset("tiny_shakespeare", split="train", trust_remote_code=True)
text = dataset['text'][0]

# 3. Text in Token IDs umwandeln
tokens = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(tokens)

# 4. Dataset Klasse
class TextDataset(Dataset):
    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) - self.seq_len

    def __getitem__(self, idx):
        return self.tokens[idx:idx+self.seq_len+1]

# 5. DataLoader bauen
seq_len = 64
batch_size = 32
dataset = TextDataset(tokens, seq_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print("DataLoader bereit!")

# 6. Den Modell (Mini NanoTransformer)
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 1
d_ff = 256
max_len = 64
num_layers = 1

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)


Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors


DataLoader bereit!


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, dataloader, vocab_size, device, epochs=10, lr=3e-4)



In [15]:


from datasets import load_dataset



# 1. Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# 2. Tiny Shakespeare Dataset laden
dataset = load_dataset("tiny_shakespeare", split="train", trust_remote_code=True)
text = dataset['text'][0]

# 3. Text in Token IDs umwandeln
tokens = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(tokens)

# 4. Dataset Klasse
class TextDataset(Dataset):
    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) - self.seq_len

    def __getitem__(self, idx):
        return self.tokens[idx:idx+self.seq_len+1]

# 5. DataLoader bauen
seq_len = 64
batch_size = 32
dataset = TextDataset(tokens, seq_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print("DataLoader bereit!")


d_model = 128
n_heads = 4
d_ff = 512
num_layers = 2
seq_len = 128 # 128 Token pro Sequenz
max_len = 128 # 128 Token pro Sequenz


model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)


Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors


DataLoader bereit!


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, dataloader, vocab_size, device, epochs=10, lr=3e-4)



## MaskedSelfAttention, AddNorm und FeedForward ersetzen durch nn.TransformerDecoderLayer

In [None]:
from tqdm import tqdm
#  NanoTransformer mit PyTorch Decoder
from torch.nn import TransformerDecoder, TransformerDecoderLayer

class NanoTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)

        decoder_layer = TransformerDecoderLayer(d_model, n_heads, d_ff, batch_first=True)
        self.decoder = TransformerDecoder(decoder_layer, num_layers)

        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        T = x.size(1)

        # Causal Mask erstellen
        causal_mask = torch.triu(torch.ones(T, T), diagonal=1).bool().to(x.device)

        # Dummy Memory
        memory = torch.zeros(x.size(0), 1, x.size(2)).to(x.device)

        x = self.decoder(tgt=x, memory=memory, tgt_mask=causal_mask)
        x = self.norm(x)
        logits = self.output_proj(x)
        return logits

# Training und Evaluierung
def evaluate(model, dataloader, vocab_size, device):
    model.eval()  # Wechsel in den Evaluationsmodus
    total_loss = 0
    loss_fn = nn.CrossEntropyLoss()

    with torch.no_grad():  # Kein Gradientenberechnung
        for batch in dataloader:
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = loss_fn(logits, targets)
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

def train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=1e-4):
    wandb.init(project="nano-transformer", config={
        "epochs": epochs,
        "lr": lr,
        "batch_size": train_dataloader.batch_size,
        "seq_len": train_dataloader.dataset.seq_len,
        "vocab_size": vocab_size,
        "dataset": "tiny_shakespeare",
        "d_ff": d_ff,
        "max_len": max_len,
        "num_layers": num_layers
    })

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = loss_fn(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}: Training Loss = {avg_train_loss:.4f}")

        # Evaluierung nach jedem Trainingsepochendurchgang
        avg_eval_loss = evaluate(model, eval_dataloader, vocab_size, device)
        print(f"Epoch {epoch+1}: Evaluation Loss = {avg_eval_loss:.4f}")

        # Logs an WandB senden
        wandb.log({
            "epoch": epoch+1,
            "train_loss": avg_train_loss,
            "eval_loss": avg_eval_loss
        })

    # Modell speichern nach Training
    torch.save(model.state_dict(), "nano_transformer_trained.pth")

# Dataset vorbereiten
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("tiny_shakespeare", split="train", trust_remote_code=True)
text = dataset['text'][0]
tokens = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(tokens)

class TextDataset(Dataset):
    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) - self.seq_len

    def __getitem__(self, idx):
        return self.tokens[idx:idx+self.seq_len+1]

seq_len = 64
batch_size = 32
dataset = TextDataset(tokens, seq_len)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Erstellen eines Evaluationsdatensatzes (z.B. indem wir die ersten 20% des Datensatzes verwenden)
eval_dataset = TextDataset(tokens[dataset.seq_len:], seq_len)  # Hier könnten auch speziellere Evaluierungsdaten genutzt werden
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("DataLoader bereit!")

# Modell konfigurieren
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 1
d_ff = 256
max_len = 64
num_layers = 1

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)

# Training starten
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=100, lr=3e-4)


### lr statt 0.0003 zu 0.003

In [14]:
# Erstellen eines Evaluationsdatensatzes (z.B. indem wir die ersten 20% des Datensatzes verwenden)
eval_dataset = TextDataset(tokens[dataset.seq_len:], seq_len)  # Hier könnten auch speziellere Evaluierungsdaten genutzt werden
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("DataLoader bereit!")

# Modell konfigurieren
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 1
d_ff = 256
max_len = 64
num_layers = 1

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)

# Training starten
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=3e-3)


DataLoader bereit!


0,1
epoch,▁▂▂▃▄▅▅▆▇▇█
eval_loss,█▅▄▃▃▂▂▂▁▁▁
train_loss,█▄▃▃▂▂▂▁▁▁▁

0,1
epoch,11.0
eval_loss,3.12321
train_loss,3.14373


Epoch 1: 100%|██████████| 9435/9435 [02:01<00:00, 77.36it/s]


Epoch 1: Training Loss = 3.8809
Epoch 1: Evaluation Loss = 3.4163


Epoch 2: 100%|██████████| 9435/9435 [01:57<00:00, 80.60it/s]


Epoch 2: Training Loss = 3.2139
Epoch 2: Evaluation Loss = 3.0782


Epoch 3: 100%|██████████| 9435/9435 [01:54<00:00, 82.53it/s]


Epoch 3: Training Loss = 3.0033
Epoch 3: Evaluation Loss = 2.9272


Epoch 4: 100%|██████████| 9435/9435 [01:54<00:00, 82.14it/s]


Epoch 4: Training Loss = 2.8861
Epoch 4: Evaluation Loss = 2.8302


Epoch 5: 100%|██████████| 9435/9435 [01:56<00:00, 80.96it/s]


Epoch 5: Training Loss = 2.8056
Epoch 5: Evaluation Loss = 2.7626


Epoch 6: 100%|██████████| 9435/9435 [01:50<00:00, 85.38it/s]


Epoch 6: Training Loss = 2.7456
Epoch 6: Evaluation Loss = 2.7118


Epoch 7: 100%|██████████| 9435/9435 [01:50<00:00, 85.76it/s]


Epoch 7: Training Loss = 2.6989
Epoch 7: Evaluation Loss = 2.6711


Epoch 8: 100%|██████████| 9435/9435 [01:52<00:00, 84.17it/s]


Epoch 8: Training Loss = 2.6617
Epoch 8: Evaluation Loss = 2.6357


Epoch 9: 100%|██████████| 9435/9435 [01:49<00:00, 85.87it/s]


Epoch 9: Training Loss = 2.6311
Epoch 9: Evaluation Loss = 2.6102


Epoch 10: 100%|██████████| 9435/9435 [01:53<00:00, 82.88it/s]


Epoch 10: Training Loss = 2.6050
Epoch 10: Evaluation Loss = 2.5891


### num_layers =von 1 zu  2 / n_heads = von 1 zu 4


In [15]:
# Erstellen eines Evaluationsdatensatzes (z.B. indem wir die ersten 20% des Datensatzes verwenden)
eval_dataset = TextDataset(tokens[dataset.seq_len:], seq_len)  # Hier könnten auch speziellere Evaluierungsdaten genutzt werden
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("DataLoader bereit!")

# Modell konfigurieren
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 4
d_ff = 256
max_len = 64
num_layers = 2

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)

# Training starten
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=3e-3)


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


DataLoader bereit!


0,1
epoch,▁▂▃▃▄▅▆▆▇█
eval_loss,█▅▄▃▂▂▂▁▁▁
train_loss,█▄▃▃▂▂▂▁▁▁

0,1
epoch,10.0
eval_loss,2.58912
train_loss,2.60498


Epoch 1: 100%|██████████| 9435/9435 [02:18<00:00, 68.04it/s]


Epoch 1: Training Loss = 3.8056
Epoch 1: Evaluation Loss = 3.2602


Epoch 2: 100%|██████████| 9435/9435 [02:16<00:00, 69.04it/s]


Epoch 2: Training Loss = 2.9659
Epoch 2: Evaluation Loss = 2.7749


Epoch 3: 100%|██████████| 9435/9435 [02:20<00:00, 67.21it/s]


Epoch 3: Training Loss = 2.6883
Epoch 3: Evaluation Loss = 2.6001


Epoch 4: 100%|██████████| 9435/9435 [02:17<00:00, 68.85it/s]


Epoch 4: Training Loss = 2.5610
Epoch 4: Evaluation Loss = 2.5110


Epoch 5: 100%|██████████| 9435/9435 [02:14<00:00, 70.40it/s]


Epoch 5: Training Loss = 2.4837
Epoch 5: Evaluation Loss = 2.4507


Epoch 6: 100%|██████████| 9435/9435 [02:14<00:00, 69.92it/s]


Epoch 6: Training Loss = 2.4301
Epoch 6: Evaluation Loss = 2.3999


Epoch 7: 100%|██████████| 9435/9435 [02:11<00:00, 71.91it/s]


Epoch 7: Training Loss = 2.3896
Epoch 7: Evaluation Loss = 2.3705


Epoch 8: 100%|██████████| 9435/9435 [02:11<00:00, 71.71it/s]


Epoch 8: Training Loss = 2.3576
Epoch 8: Evaluation Loss = 2.3354


Epoch 9: 100%|██████████| 9435/9435 [02:12<00:00, 71.22it/s]


Epoch 9: Training Loss = 2.3321
Epoch 9: Evaluation Loss = 2.3148


Epoch 10: 100%|██████████| 9435/9435 [02:21<00:00, 66.45it/s]


Epoch 10: Training Loss = 2.3096
Epoch 10: Evaluation Loss = 2.2918


### max_len = von 64 zu 128 / d_ff = von 256 zu 512  

In [16]:
# Erstellen eines Evaluationsdatensatzes (z.B. indem wir die ersten 20% des Datensatzes verwenden)
eval_dataset = TextDataset(tokens[dataset.seq_len:], seq_len)  # Hier könnten auch speziellere Evaluierungsdaten genutzt werden
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("DataLoader bereit!")

# Modell konfigurieren
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 4
d_ff = 512  
max_len = 128
num_layers = 2

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)

# Training starten
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=3e-3)


DataLoader bereit!


0,1
epoch,▁▂▃▃▄▅▆▆▇█
eval_loss,█▄▃▃▂▂▂▁▁▁
train_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
eval_loss,2.2918
train_loss,2.30964


Epoch 1: 100%|██████████| 9435/9435 [02:24<00:00, 65.20it/s]


Epoch 1: Training Loss = 3.8457
Epoch 1: Evaluation Loss = 3.2873


Epoch 2: 100%|██████████| 9435/9435 [02:20<00:00, 67.34it/s]


Epoch 2: Training Loss = 2.9488
Epoch 2: Evaluation Loss = 2.7384


Epoch 3: 100%|██████████| 9435/9435 [02:20<00:00, 67.14it/s]


Epoch 3: Training Loss = 2.6451
Epoch 3: Evaluation Loss = 2.5582


Epoch 4: 100%|██████████| 9435/9435 [02:20<00:00, 67.30it/s]


Epoch 4: Training Loss = 2.5092
Epoch 4: Evaluation Loss = 2.4452


Epoch 5: 100%|██████████| 9435/9435 [02:19<00:00, 67.46it/s]


Epoch 5: Training Loss = 2.4271
Epoch 5: Evaluation Loss = 2.3794


Epoch 6: 100%|██████████| 9435/9435 [02:19<00:00, 67.40it/s]


Epoch 6: Training Loss = 2.3698
Epoch 6: Evaluation Loss = 2.3415


Epoch 7: 100%|██████████| 9435/9435 [02:19<00:00, 67.77it/s]


Epoch 7: Training Loss = 2.3280
Epoch 7: Evaluation Loss = 2.2969


Epoch 8: 100%|██████████| 9435/9435 [02:19<00:00, 67.58it/s]


Epoch 8: Training Loss = 2.2949
Epoch 8: Evaluation Loss = 2.2728


Epoch 9: 100%|██████████| 9435/9435 [02:19<00:00, 67.45it/s]


Epoch 9: Training Loss = 2.2676
Epoch 9: Evaluation Loss = 2.2426


Epoch 10: 100%|██████████| 9435/9435 [02:17<00:00, 68.40it/s]


Epoch 10: Training Loss = 2.2442
Epoch 10: Evaluation Loss = 2.2160


### d_model = von 32 zu 128 / num_layers = von 2 zu 4

In [17]:
# Modell konfigurieren
vocab_size = tokenizer.vocab_size
d_model = 128
n_heads = 4
d_ff = 512  
max_len = 128
num_layers = 4

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)

# Training starten
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=3e-3)


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,▁▂▃▃▄▅▆▆▇█
eval_loss,█▄▃▂▂▂▂▁▁▁
train_loss,█▄▃▂▂▂▁▁▁▁

0,1
epoch,10.0
eval_loss,2.216
train_loss,2.24425


Epoch 1: 100%|██████████| 9435/9435 [03:48<00:00, 41.22it/s]


Epoch 1: Training Loss = 5.9286
Epoch 1: Evaluation Loss = 5.8530


Epoch 2: 100%|██████████| 9435/9435 [03:46<00:00, 41.68it/s]


Epoch 2: Training Loss = 5.4539
Epoch 2: Evaluation Loss = 5.0933


Epoch 3: 100%|██████████| 9435/9435 [03:45<00:00, 41.84it/s]


Epoch 3: Training Loss = 5.1421
Epoch 3: Evaluation Loss = 5.0835


Epoch 4: 100%|██████████| 9435/9435 [03:45<00:00, 41.86it/s]


Epoch 4: Training Loss = 5.0913
Epoch 4: Evaluation Loss = 5.0308


Epoch 5: 100%|██████████| 9435/9435 [03:54<00:00, 40.16it/s]


Epoch 5: Training Loss = 4.7355
Epoch 5: Evaluation Loss = 4.5584


Epoch 6: 100%|██████████| 9435/9435 [03:36<00:00, 43.60it/s]


Epoch 6: Training Loss = 4.4312
Epoch 6: Evaluation Loss = 4.2819


Epoch 7: 100%|██████████| 9435/9435 [03:45<00:00, 41.87it/s]


Epoch 7: Training Loss = 4.2090
Epoch 7: Evaluation Loss = 4.1360


Epoch 8: 100%|██████████| 9435/9435 [03:34<00:00, 44.07it/s]


Epoch 8: Training Loss = 4.0530
Epoch 8: Evaluation Loss = 3.9643


Epoch 9: 100%|██████████| 9435/9435 [03:32<00:00, 44.38it/s]


Epoch 9: Training Loss = 3.9459
Epoch 9: Evaluation Loss = 3.8622


Epoch 10: 100%|██████████| 9435/9435 [03:32<00:00, 44.36it/s]


Epoch 10: Training Loss = 3.8531
Epoch 10: Evaluation Loss = 3.8033


### Optimierungsalgorithmus von  Adam zu SGD

In [19]:
from tqdm import tqdm
#  NanoTransformer mit PyTorch Decoder
from torch.nn import TransformerDecoder, TransformerDecoderLayer

class NanoTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)

        decoder_layer = TransformerDecoderLayer(d_model, n_heads, d_ff, batch_first=True)
        self.decoder = TransformerDecoder(decoder_layer, num_layers)

        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        T = x.size(1)

        # Causal Mask erstellen
        causal_mask = torch.triu(torch.ones(T, T), diagonal=1).bool().to(x.device)

        # Dummy Memory
        memory = torch.zeros(x.size(0), 1, x.size(2)).to(x.device)

        x = self.decoder(tgt=x, memory=memory, tgt_mask=causal_mask)
        x = self.norm(x)
        logits = self.output_proj(x)
        return logits

# Training und Evaluierung
def evaluate(model, dataloader, vocab_size, device):
    model.eval()  # Wechsel in den Evaluationsmodus
    total_loss = 0
    loss_fn = nn.CrossEntropyLoss()

    with torch.no_grad():  # Kein Gradientenberechnung
        for batch in dataloader:
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = loss_fn(logits, targets)
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

def train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=1e-4):
    wandb.init(project="nano-transformer", config={
        "epochs": epochs,
        "lr": lr,
        "batch_size": train_dataloader.batch_size,
        "seq_len": train_dataloader.dataset.seq_len,
        "vocab_size": vocab_size,
        "dataset": "tiny_shakespeare",
        "d_ff": d_ff,
        "max_len": max_len,
        "num_layers": num_layers
    })

    model = model.to(device)
    optimizer =  optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = loss_fn(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}: Training Loss = {avg_train_loss:.4f}")

        # Evaluierung nach jedem Trainingsepochendurchgang
        avg_eval_loss = evaluate(model, eval_dataloader, vocab_size, device)
        print(f"Epoch {epoch+1}: Evaluation Loss = {avg_eval_loss:.4f}")

        # Logs an WandB senden
        wandb.log({
            "epoch": epoch+1,
            "train_loss": avg_train_loss,
            "eval_loss": avg_eval_loss
        })

    # Modell speichern nach Training
    torch.save(model.state_dict(), "nano_transformer_trained.pth")

# Dataset vorbereiten
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("tiny_shakespeare", split="train", trust_remote_code=True)
text = dataset['text'][0]
tokens = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(tokens)

class TextDataset(Dataset):
    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) - self.seq_len

    def __getitem__(self, idx):
        return self.tokens[idx:idx+self.seq_len+1]

seq_len = 64
batch_size = 32
dataset = TextDataset(tokens, seq_len)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Erstellen eines Evaluationsdatensatzes (z.B. indem wir die ersten 20% des Datensatzes verwenden)
eval_dataset = TextDataset(tokens[dataset.seq_len:], seq_len)  # Hier könnten auch speziellere Evaluierungsdaten genutzt werden
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("DataLoader bereit!")

# Modell konfigurieren
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 4
d_ff = 512  
max_len = 128
num_layers = 2

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)

# Training starten
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=3e-3)


Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


DataLoader bereit!


0,1
epoch,▁▂▃▃▄▅▆▆▇█
eval_loss,█▆▅▄▃▃▂▂▁▁
train_loss,█▅▄▃▃▂▂▁▁▁

0,1
epoch,10.0
eval_loss,7.46348
train_loss,7.50356


Epoch 1: 100%|██████████| 9435/9435 [02:15<00:00, 69.82it/s]


Epoch 1: Training Loss = 8.3323
Epoch 1: Evaluation Loss = 7.5116


Epoch 2: 100%|██████████| 9435/9435 [02:13<00:00, 70.79it/s]


Epoch 2: Training Loss = 7.2151
Epoch 2: Evaluation Loss = 6.9832


Epoch 3: 100%|██████████| 9435/9435 [02:12<00:00, 71.47it/s]


Epoch 3: Training Loss = 6.8284
Epoch 3: Evaluation Loss = 6.6963


Epoch 4: 100%|██████████| 9435/9435 [02:14<00:00, 70.24it/s]


Epoch 4: Training Loss = 6.5963
Epoch 4: Evaluation Loss = 6.5087


Epoch 5: 100%|██████████| 9435/9435 [02:13<00:00, 70.71it/s]


Epoch 5: Training Loss = 6.4402
Epoch 5: Evaluation Loss = 6.3793


Epoch 6: 100%|██████████| 9435/9435 [02:08<00:00, 73.27it/s]


Epoch 6: Training Loss = 6.3292
Epoch 6: Evaluation Loss = 6.2830


Epoch 7: 100%|██████████| 9435/9435 [02:10<00:00, 72.41it/s]


Epoch 7: Training Loss = 6.2421
Epoch 7: Evaluation Loss = 6.2033


Epoch 8: 100%|██████████| 9435/9435 [02:15<00:00, 69.87it/s]


Epoch 8: Training Loss = 6.1660
Epoch 8: Evaluation Loss = 6.1289


Epoch 9: 100%|██████████| 9435/9435 [02:14<00:00, 69.92it/s]


Epoch 9: Training Loss = 6.0917
Epoch 9: Evaluation Loss = 6.0546


Epoch 10: 100%|██████████| 9435/9435 [02:14<00:00, 70.32it/s]


Epoch 10: Training Loss = 6.0164
Epoch 10: Evaluation Loss = 5.9775


### batch_size = von 32 zu  64


In [21]:
from tqdm import tqdm
#  NanoTransformer mit PyTorch Decoder
from torch.nn import TransformerDecoder, TransformerDecoderLayer

class NanoTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)

        decoder_layer = TransformerDecoderLayer(d_model, n_heads, d_ff, batch_first=True)
        self.decoder = TransformerDecoder(decoder_layer, num_layers)

        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        T = x.size(1)

        # Causal Mask erstellen
        causal_mask = torch.triu(torch.ones(T, T), diagonal=1).bool().to(x.device)

        # Dummy Memory
        memory = torch.zeros(x.size(0), 1, x.size(2)).to(x.device)

        x = self.decoder(tgt=x, memory=memory, tgt_mask=causal_mask)
        x = self.norm(x)
        logits = self.output_proj(x)
        return logits

# Training und Evaluierung
def evaluate(model, dataloader, vocab_size, device):
    model.eval()  # Wechsel in den Evaluationsmodus
    total_loss = 0
    loss_fn = nn.CrossEntropyLoss()

    with torch.no_grad():  # Kein Gradientenberechnung
        for batch in dataloader:
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = loss_fn(logits, targets)
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

def train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=1e-4):
    wandb.init(project="nano-transformer", config={
        "epochs": epochs,
        "lr": lr,
        "batch_size": train_dataloader.batch_size,
        "seq_len": train_dataloader.dataset.seq_len,
        "vocab_size": vocab_size,
        "dataset": "tiny_shakespeare",
        "d_ff": d_ff,
        "max_len": max_len,
        "num_layers": num_layers
    })

    model = model.to(device)
    optimizer =  optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = loss_fn(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}: Training Loss = {avg_train_loss:.4f}")

        # Evaluierung nach jedem Trainingsepochendurchgang
        avg_eval_loss = evaluate(model, eval_dataloader, vocab_size, device)
        print(f"Epoch {epoch+1}: Evaluation Loss = {avg_eval_loss:.4f}")

        # Logs an WandB senden
        wandb.log({
            "epoch": epoch+1,
            "train_loss": avg_train_loss,
            "eval_loss": avg_eval_loss
        })

    # Modell speichern nach Training
    torch.save(model.state_dict(), "nano_transformer_trained.pth")

# Dataset vorbereiten
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("tiny_shakespeare", split="train", trust_remote_code=True)
text = dataset['text'][0]
tokens = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(tokens)

class TextDataset(Dataset):
    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) - self.seq_len

    def __getitem__(self, idx):
        return self.tokens[idx:idx+self.seq_len+1]

seq_len = 64
batch_size = 64
dataset = TextDataset(tokens, seq_len)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Erstellen eines Evaluationsdatensatzes (z.B. indem wir die ersten 20% des Datensatzes verwenden)
eval_dataset = TextDataset(tokens[dataset.seq_len:], seq_len)  # Hier könnten auch speziellere Evaluierungsdaten genutzt werden
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("DataLoader bereit!")

# Modell konfigurieren
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 4
d_ff = 512  
max_len = 128
num_layers = 2

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers)

# Training starten
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=3e-3)


Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


DataLoader bereit!


Epoch 1: 100%|██████████| 4718/4718 [02:14<00:00, 35.20it/s]


Epoch 1: Training Loss = 3.9587
Epoch 1: Evaluation Loss = 3.3660


Epoch 2: 100%|██████████| 4718/4718 [02:35<00:00, 30.32it/s]


Epoch 2: Training Loss = 3.0333
Epoch 2: Evaluation Loss = 2.8101


Epoch 3: 100%|██████████| 4718/4718 [02:37<00:00, 29.93it/s]


Epoch 3: Training Loss = 2.7099
Epoch 3: Evaluation Loss = 2.6113


Epoch 4: 100%|██████████| 4718/4718 [03:10<00:00, 24.70it/s]


Epoch 4: Training Loss = 2.5601
Epoch 4: Evaluation Loss = 2.4975


Epoch 5: 100%|██████████| 4718/4718 [03:24<00:00, 23.04it/s]


Epoch 5: Training Loss = 2.4698
Epoch 5: Evaluation Loss = 2.4229


Epoch 6: 100%|██████████| 4718/4718 [03:24<00:00, 23.08it/s]


Epoch 6: Training Loss = 2.4080
Epoch 6: Evaluation Loss = 2.3788


Epoch 7: 100%|██████████| 4718/4718 [03:24<00:00, 23.05it/s]


Epoch 7: Training Loss = 2.3617
Epoch 7: Evaluation Loss = 2.3388


Epoch 8: 100%|██████████| 4718/4718 [02:45<00:00, 28.53it/s]


Epoch 8: Training Loss = 2.3252
Epoch 8: Evaluation Loss = 2.3018


Epoch 9: 100%|██████████| 4718/4718 [02:40<00:00, 29.37it/s]


Epoch 9: Training Loss = 2.2955
Epoch 9: Evaluation Loss = 2.2779


Epoch 10: 100%|██████████| 4718/4718 [02:36<00:00, 30.13it/s]


Epoch 10: Training Loss = 2.2699
Epoch 10: Evaluation Loss = 2.2472


### Dropout hinzufügen

In [4]:
from tqdm import tqdm
#  NanoTransformer mit PyTorch Decoder
from torch.nn import TransformerDecoder, TransformerDecoderLayer

class NanoTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers, dropout=0.1):
        super().__init__()
        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)

        decoder_layer = TransformerDecoderLayer(d_model, n_heads, d_ff, batch_first=True)
        self.decoder = TransformerDecoder(decoder_layer, num_layers)

        self.norm = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        T = x.size(1)

        # Causal Mask erstellen
        causal_mask = torch.triu(torch.ones(T, T), diagonal=1).bool().to(x.device)

        # Dummy Memory
        memory = torch.zeros(x.size(0), 1, x.size(2)).to(x.device)

        x = self.decoder(tgt=x, memory=memory, tgt_mask=causal_mask)
        x = self.norm(x)
        logits = self.output_proj(x)
        return logits

# Training und Evaluierung
def evaluate(model, dataloader, vocab_size, device):
    model.eval()  # Wechsel in den Evaluationsmodus
    total_loss = 0
    loss_fn = nn.CrossEntropyLoss()

    with torch.no_grad():  # Kein Gradientenberechnung
        for batch in dataloader:
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = loss_fn(logits, targets)
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

def train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=1e-3):
    wandb.init(project="nano-transformer", config={
        "epochs": epochs,
        "lr": lr,
        "batch_size": train_dataloader.batch_size,
        "seq_len": train_dataloader.dataset.seq_len,
        "vocab_size": vocab_size,
        "dataset": "tiny_shakespeare",
        "d_ff": d_ff,
        "max_len": max_len,
        "num_layers": num_layers
    })

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = loss_fn(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}: Training Loss = {avg_train_loss:.4f}")

        # Evaluierung nach jedem Trainingsepochendurchgang
        avg_eval_loss = evaluate(model, eval_dataloader, vocab_size, device)
        print(f"Epoch {epoch+1}: Evaluation Loss = {avg_eval_loss:.4f}")

        # Logs an WandB senden
        wandb.log({
            "epoch": epoch+1,
            "train_loss": avg_train_loss,
            "eval_loss": avg_eval_loss
        })

    # Modell speichern nach Training
    torch.save(model.state_dict(), "nano_transformer_trained.pth")

# Dataset vorbereiten
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("tiny_shakespeare", split="train", trust_remote_code=True)
text = dataset['text'][0]
tokens = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(tokens)

class TextDataset(Dataset):
    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) - self.seq_len

    def __getitem__(self, idx):
        return self.tokens[idx:idx+self.seq_len+1]

seq_len = 64
batch_size = 32
dataset = TextDataset(tokens, seq_len)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Erstellen eines Evaluationsdatensatzes (z.B. indem wir die ersten 20% des Datensatzes verwenden)
eval_dataset = TextDataset(tokens[dataset.seq_len:], seq_len)  # Hier könnten auch speziellere Evaluierungsdaten genutzt werden
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("DataLoader bereit!")

# Modell konfigurieren
vocab_size = tokenizer.vocab_size
d_model = 32
n_heads = 4
d_ff = 512  
max_len = 128
num_layers = 2

model = NanoTransformer(vocab_size, d_model, n_heads, d_ff, max_len, num_layers, dropout=0.2)

# Training starten
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, train_dataloader, eval_dataloader, vocab_size, device, epochs=10, lr=3e-3)



Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors


DataLoader bereit!


Epoch 1: 100%|██████████| 9435/9435 [02:16<00:00, 69.19it/s]


Epoch 1: Training Loss = 3.8361
Epoch 1: Evaluation Loss = 3.2683


Epoch 2: 100%|██████████| 9435/9435 [02:11<00:00, 71.72it/s]


Epoch 2: Training Loss = 2.9342
Epoch 2: Evaluation Loss = 2.7216


Epoch 3: 100%|██████████| 9435/9435 [02:12<00:00, 70.97it/s]


Epoch 3: Training Loss = 2.6330
Epoch 3: Evaluation Loss = 2.5407


Epoch 4: 100%|██████████| 9435/9435 [02:12<00:00, 71.02it/s]


Epoch 4: Training Loss = 2.5005
Epoch 4: Evaluation Loss = 2.4423


Epoch 5: 100%|██████████| 9435/9435 [02:13<00:00, 70.58it/s]


Epoch 5: Training Loss = 2.4212
Epoch 5: Evaluation Loss = 2.3806


Epoch 6: 100%|██████████| 9435/9435 [02:12<00:00, 71.44it/s]


Epoch 6: Training Loss = 2.3654
Epoch 6: Evaluation Loss = 2.3293


Epoch 7: 100%|██████████| 9435/9435 [02:11<00:00, 71.48it/s]


Epoch 7: Training Loss = 2.3239
Epoch 7: Evaluation Loss = 2.2975


Epoch 8: 100%|██████████| 9435/9435 [02:19<00:00, 67.73it/s]


Epoch 8: Training Loss = 2.2906
Epoch 8: Evaluation Loss = 2.2623


Epoch 9: 100%|██████████| 9435/9435 [02:12<00:00, 71.45it/s]


Epoch 9: Training Loss = 2.2639
Epoch 9: Evaluation Loss = 2.2466


Epoch 10: 100%|██████████| 9435/9435 [02:07<00:00, 73.75it/s]


Epoch 10: Training Loss = 2.2412
Epoch 10: Evaluation Loss = 2.2234


### Text Generation


In [1]:
def generate(model, start_token, max_len=50, temperature=1.0):
    model.eval()
    input_ids = start_token.to(device)

    for _ in range(max_len):
        logits = model(input_ids)
        next_token_logits = logits[:, -1, :] / temperature
        probs = torch.softmax(next_token_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        input_ids = torch.cat([input_ids, next_token], dim=1)

    return input_ids.squeeze().tolist()

In [None]:
start_text = "The future of AI"
input_ids = tokenizer.encode(start_text, return_tensors="pt").to(device)
print("Input IDs:", input_ids.shape)

output_ids = generate(model, input_ids, max_len=50, temperature=1.0)
output_text = tokenizer.decode(output_ids)
print(output_text)

# fertiges Modell statt mein eigenes Transformer-Modell
1. Imports

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import wandb  


2. Tokenizer und Modell laden

In [None]:
# 1. Modell und Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")



3. Dataset vorbereiten

In [None]:
# Tiny Shakespeare laden
dataset = load_dataset("tiny_shakespeare", split="train")

# Den Text extrahieren
text = dataset['text'][0]  # Nur der Text

# Tokenisieren
tokens = tokenizer.encode(text, truncation=False)
tokens = torch.tensor(tokens)

# Dataset Klasse definieren
class TextDataset(Dataset):
    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) - self.seq_len

    def __getitem__(self, idx):
        return self.tokens[idx:idx+self.seq_len+1]

# Parameter
seq_len = 64
batch_size = 32

# Dataset und DataLoader erstellen
train_dataset = TextDataset(tokens, seq_len)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Für Evaluation einfach ein weiteres Dataset erstellen
# Erstellen eines Evaluationsdatensatzes (z.B. indem wir die ersten 20% des Datensatzes verwenden)  
eval_dataset = TextDataset(tokens, seq_len)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print("DataLoader bereit!")


4. neues train und evaluate

In [22]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss


def train(model, train_dataloader, eval_dataloader, device, epochs=5, lr=1e-3):
    wandb.init(mode="offline", project="gpt2-finetune", config={
        "epochs": epochs,
        "lr": lr,
        "batch_size": train_dataloader.batch_size,
        "seq_len": train_dataloader.dataset.seq_len,
        "vocab_size": model.config.vocab_size,
        "dataset": "tiny_shakespeare",
        "model_name": "sshleifer/tiny-gpt2"
    })

    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}: Training Loss = {avg_train_loss:.4f}")

        avg_eval_loss = evaluate(model, eval_dataloader, device)
        print(f"Epoch {epoch+1}: Evaluation Loss = {avg_eval_loss:.4f}")

        wandb.log({
            "epoch": epoch+1,
            "train_loss": avg_train_loss,
            "eval_loss": avg_eval_loss
        })

    torch.save(model.state_dict(), "fine_tuned_gpt2.pth")
    print("Modell gespeichert!")


5. Ganzes Setup zusammengefasst

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train(
    model, 
    train_dataloader, 
    eval_dataloader, 
    device, 
    epochs=5, 
    lr=4e-3  # ein bisschen niedrigerer Lernrate für finetuning
)
wandb.finish()  # Beende die WandB-Sitzung


Epoch 1: 100%|██████████| 9435/9435 [05:10<00:00, 30.40it/s]


Epoch 1: Training Loss = 6.3228
Epoch 1: Evaluation Loss = 6.3212


Epoch 2: 100%|██████████| 9435/9435 [05:38<00:00, 27.84it/s]


Epoch 2: Training Loss = 6.3217
Epoch 2: Evaluation Loss = 6.3212


Epoch 3: 100%|██████████| 9435/9435 [06:30<00:00, 24.15it/s]


Epoch 3: Training Loss = 6.3215
Epoch 3: Evaluation Loss = 6.3213


Epoch 4: 100%|██████████| 9435/9435 [04:31<00:00, 34.74it/s]


Epoch 4: Training Loss = 6.3214
Epoch 4: Evaluation Loss = 6.3209


Epoch 5: 100%|██████████| 9435/9435 [03:34<00:00, 44.06it/s]


Epoch 5: Training Loss = 6.3213
Epoch 5: Evaluation Loss = 6.3208
Modell gespeichert!


0,1
epoch,▁▃▅▆█
eval_loss,▇▆█▃▁
train_loss,█▃▂▁▁

0,1
epoch,5.0
eval_loss,6.32083
train_loss,6.32134


6. Text Generation nach dem Finetuning

In [28]:
# Text-Input
prompt = "Once upon a time"

# Tokenisieren
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # <-- GANZ WICHTIG!!

# Text generieren
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,
    temperature=0.7,     # Etwas weniger chaotisch
    top_k=20,            # Nur Top-20 Token zur Auswahl
    top_p=0.9,           # Weniger random
    do_sample=True,
    num_return_sequences=1
)


# Ausgabe decodieren
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Once upon a time,
:
.

;
:
,




;:
 the:
,



 to


,




 the
;.






In [29]:
from transformers import GPT2LMHeadModel, AutoTokenizer

# Statt:
# model = GPT2LMHeadModel.from_pretrained("sshleifer/tiny-gpt2")
# Nutze das echte GPT2:
model = GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


In [30]:
# Text-Input
prompt = "Once upon a time"

# Tokenisieren
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # <-- GANZ WICHTIG!!

# Text generieren
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,
    temperature=0.7,     # Etwas weniger chaotisch
    top_k=20,            # Nur Top-20 Token zur Auswahl
    top_p=0.9,           # Weniger random
    do_sample=True,
    num_return_sequences=1
)


# Ausgabe decodieren
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Once upon a time, the first person to enter a room with a weapon or weapon of some kind, was an individual who had a great deal of experience in combat, or was a member of a guild, or a member of a guild's guild
