In [23]:


import torch
import torch.nn as nn

class GPTDecoderOnlyModel(nn.Module):
    """Small GPT-style, decoder-only language model.

    Token embeddings plus a learned positional embedding are passed through a
    stack of ``nn.TransformerEncoderLayer`` blocks under a causal attention
    mask, then through a final ``LayerNorm`` and a linear head that projects
    back to vocabulary logits.

    Args:
        vocab_size: Number of token ids; size of the embedding table and of
            the last dimension of the returned logits.
        d_model: Width of the embeddings and of every transformer block.
        nhead: Number of attention heads per block.
        num_layers: Number of stacked transformer blocks.
        dim_feedforward: Hidden width of each block's feed-forward sublayer.
        max_seq_length: Longest sequence the positional embedding covers.
        dropout: Dropout probability used inside each block.
    """

    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, dim_feedforward=256, max_seq_length=128, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # Learned positions: one d_model vector per position, initialised to zero.
        self.positional_embedding = nn.Parameter(torch.zeros(1, max_seq_length, d_model))

        # Encoder layers become "decoder-only" blocks because forward() always
        # supplies a causal mask.
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ) for _ in range(num_layers)
        ])

        self.norm = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)
        self.max_seq_length = max_seq_length

    def generate_causal_mask(self, seq_len, device):
        """Return a ``(seq_len, seq_len)`` additive attention mask on ``device``.

        Entries strictly above the diagonal are ``-inf`` (future positions are
        hidden); the diagonal and everything below it are ``0``.
        """
        # Allocate directly on the target device instead of building the mask
        # on CPU and copying it over on every forward pass.
        blocked = torch.full((seq_len, seq_len), float('-inf'), device=device)
        return torch.triu(blocked, diagonal=1)

    def forward(self, input_ids):
        """Compute next-token logits.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_seq_length``; there are no
                positional embeddings beyond that length.
        """
        _, seq_len = input_ids.shape
        if seq_len > self.max_seq_length:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_length={self.max_seq_length}"
            )

        x = self.token_embedding(input_ids)
        x = x + self.positional_embedding[:, :seq_len, :]
        mask = self.generate_causal_mask(seq_len, input_ids.device)

        for block in self.blocks:
            x = block(x, mask)

        x = self.norm(x)
        return self.lm_head(x)



In [24]:

from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Use the end-of-sequence token as the padding token as well.
tokenizer.pad_token = tokenizer.eos_token
#"""
# Build the model with the tokenizer's vocabulary size (all other
# hyperparameters keep their defaults) and switch it to evaluation mode.
model = GPTDecoderOnlyModel(vocab_size=tokenizer.vocab_size)
model.eval()

# Text generation (greedy decoding)
def generate(model, tokenizer, prompt, max_new_tokens=20):
    """Greedily extend ``prompt`` by ``max_new_tokens`` tokens and decode it.

    Args:
        model: Callable module mapping ``(batch, seq)`` token ids to
            ``(batch, seq, vocab)`` logits. If it exposes ``max_seq_length``,
            only the most recent ``max_seq_length`` tokens are fed to it.
        tokenizer: Object that is callable as
            ``tokenizer(prompt, return_tensors="pt").input_ids`` and provides
            ``decode(ids, skip_special_tokens=True)``.
        prompt: Text to continue.
        max_new_tokens: Number of tokens to append.

    Returns:
        The decoded prompt plus the generated continuation.
    """
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    # Run on whatever device the model lives on (it may have been moved to
    # the GPU by training); a parameter-free model keeps the tokenizer's device.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    context_limit = getattr(model, "max_seq_length", None)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Keep the full history in input_ids, but never feed the model more
            # positions than it supports.
            context = input_ids if context_limit is None else input_ids[:, -context_limit:]
            logits = model(context)
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=-1)

    return tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)

# Smoke test: continue a fixed prompt by 30 tokens with the model built above.
# No training step appears before this call in the visible cells, so the
# continuation comes from the freshly initialised weights.
print(generate(model, tokenizer, "Once upon a time", max_new_tokens=30))

#"""

Once upon a timeä Christie Issa Bishop hammeredwage twists deprived376 coincides Tibet beltnyder olive354amber acknowledging villages Sister disdain987 timelymail Care>,giplacesopard emancipationMod


In [28]:
from datasets import load_dataset



# Raw Hugging Face split. It gets its own name so that `dataset` below is bound
# exactly once and never refers to two different kinds of object.
raw_dataset = load_dataset("Trelis/tiny-shakespeare", split="train")

# Every sample is truncated / padded to exactly this many token ids.
MAX_TOKENS_PER_SAMPLE = 64

# Padding uses tokenizer.pad_token, so padded positions are stored as ordinary
# token ids and are not otherwise marked in the resulting tensor.
texts = [
    tokenizer.encode(
        row["Text"],
        truncation=True,
        max_length=MAX_TOKENS_PER_SAMPLE,
        padding="max_length",
    )
    for row in raw_dataset
    if row["Text"]  # skip empty samples
]

# Convert to a tensor with the right shape (List[List[int]] -> Tensor)
import torch.utils.data as data

inputs_tensor = torch.tensor(texts, dtype=torch.long)
assert inputs_tensor.ndim == 2 and inputs_tensor.shape[1] == MAX_TOKENS_PER_SAMPLE, (
    f"expected a (num_samples, {MAX_TOKENS_PER_SAMPLE}) tensor, got {tuple(inputs_tensor.shape)}"
)

dataset = data.TensorDataset(inputs_tensor)
batch_size = 32
dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)









In [31]:
from tqdm.notebook import tqdm

import torch.optim as optim

def train(model, dataloader, vocab_size, device, epochs=5, lr=1e-4, ignore_index=-100):
    """Train ``model`` as a next-token predictor with Adam and cross-entropy.

    Each batch is split into inputs (all tokens but the last) and targets (all
    tokens but the first), so position ``t`` is trained to predict token
    ``t + 1``.

    Args:
        model: Module mapping ``(batch, seq)`` ids to ``(batch, seq, vocab)``
            logits. It is moved to ``device`` and left in training mode.
        dataloader: Iterable yielding batches whose first element is a
            ``(batch, seq)`` integer tensor of token ids.
        vocab_size: Size of the logits' last dimension.
        device: Device to train on.
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
        ignore_index: Target id excluded from the loss. The default matches
            ``nn.CrossEntropyLoss``'s own default; pass the padding id to stop
            padded positions from contributing to the loss.

    Returns:
        list[float]: Mean batch loss of every epoch, in order.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index)

    epoch_losses = []
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches ourselves: avoids dividing by zero and does not require
        # the dataloader to implement __len__.
        num_batches = 0
        for batch in tqdm(dataloader, desc=f"Training Epoch {epoch+1}"):
            batch = batch[0]  # extract the actual tensor from the batch tuple
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            # Flatten to (batch * seq, vocab) vs. (batch * seq,); reshape rather
            # than view because the sliced targets are not contiguous.
            loss = loss_fn(logits.reshape(-1, vocab_size), targets.reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")

    return epoch_losses


In [32]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Take the vocabulary size from the tokenizer, the same value the model was
# constructed with, rather than from a free-standing `vocab_size` variable that
# no visible cell defines.
train(model, dataloader, tokenizer.vocab_size, device, epochs=5, lr=3e-4)

Training Epoch 1:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 1: Loss = 10.5292


Training Epoch 2:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 2: Loss = 9.3769


Training Epoch 3:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 3: Loss = 8.3779


Training Epoch 4:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 4: Loss = 7.5627


Training Epoch 5:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch 5: Loss = 6.9498
