## Benötigte Bibliotheken installieren

In [1]:
from datasets import load_dataset

dataset = load_dataset("tiny_shakespeare", trust_remote_code=True)
train_texts = dataset["train"]["text"]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torch.nn as nn

class TinyGPT(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, dim_feedforward=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, memory):
        embedded = self.embedding(tgt)
        output = self.decoder(embedded, memory)
        return self.fc_out(output)


In [3]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # wichtig für Padding


In [4]:
def tokenize_function(texts):
    return tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

tokens = tokenize_function(train_texts)
input_ids = tokens["input_ids"]


In [5]:
import wandb
wandb.init(project="tiny-gpt")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin




In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TinyGPT(vocab_size=tokenizer.vocab_size).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()


In [7]:
for epoch in range(3):
    model.train()
    total_loss = 0

    for batch in input_ids:
        batch = batch.to(device)
        batch = batch.unsqueeze(0)  # [1, seq_len] -> [batch_size, seq_len]

        # Input and target
        input_seq = batch[:, :-1]     # Input tokens
        target_seq = batch[:, 1:]     # Next-token targets

        # Forward pass: input_seq is both tgt and memory for simplicity
        output = model(input_seq, input_seq)  # Model handles embedding inside

        # Reshape output and targets for CrossEntropyLoss
        output = output.reshape(-1, output.shape[-1])       # [batch * seq_len, vocab_size]
        target_seq = target_seq.reshape(-1)                 # [batch * seq_len]

        loss = criterion(output, target_seq)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(input_ids)
    print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")
    wandb.log({"loss": avg_loss})


AssertionError: For batched (3-D) `query`, expected `key` and `value` to be 3-D but found 2-D and 2-D tensors respectively