## Installing required libraries

In [5]:
# Core dependencies for this notebook: PyTorch, Hugging Face transformers/datasets,
# and Weights & Biases. `%pip` (rather than `!pip`) targets the running kernel's environment.
%pip install torch transformers datasets wandb




Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install setuptools
%pip install wheel
%pip install wandb
%pip install huggingface_hub[hf_xet]

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Importing libraries

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer
from datasets import load_dataset
import wandb
from huggingface_hub import HfApi, HfFolder, Repository
import os
import random

## Configurations

In [7]:
class Config:
    """Central place for the notebook's hyperparameters and run settings.

    Every setting is a plain class attribute, so it can be read either from
    the class itself or from the shared ``cfg`` instance created below.
    """

    # --- model architecture ---
    block_size: int = 128   # context length; also sizes the positional-embedding table
    embed_dim: int = 128    # embedding width, used as d_model of the transformer layers
    num_heads: int = 4      # attention heads per transformer layer
    num_layers: int = 2     # number of stacked transformer layers
    dropout: float = 0.1    # dropout probability handed to each transformer layer

    # --- optimisation (presumably consumed by the training loop, not shown in this section) ---
    batch_size: int = 32
    epochs: int = 5
    lr: float = 1e-3

    # --- runtime ---
    # Evaluated once, when the class body executes: prefer the GPU when one is visible.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

    # --- naming (presumably for experiment tracking / model export; confirm against later cells) ---
    wandb_project: str = "tiny-shakespeare-decoder-only"
    model_name: str = "decoder-only-tinyshakespeare"


# Shared configuration object referenced by the rest of the notebook.
cfg = Config()



## Downloading and Preparing Data

In [9]:
import requests
from transformers import GPT2TokenizerFast  # used by the tokenizer cell that follows


# Tiny Shakespeare corpus from Andrej Karpathy's char-rnn repository.
data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Fetch the corpus. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() aborts on an HTTP error instead of
# silently saving an error page as the training text.
response = requests.get(data_url, timeout=30)
response.raise_for_status()

# Write with the same explicit encoding that is used for reading below, so the
# round trip does not depend on the platform's default text encoding.
with open("shakespeare.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

with open("shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()

## Loading the Tokenizer


In [10]:

# Load the pretrained GPT-2 tokenizer and register "[PAD]" as its padding token.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.add_special_tokens(dict(pad_token="[PAD]"))

# Encode the whole corpus in a single call. The tokenizer warns that the result
# is longer than GPT-2's maximum sequence length; that is expected here, because
# the ids are only ever consumed in block-sized windows (see get_batch).
tokens = tokenizer(text, return_tensors="pt")

# return_tensors="pt" yields a leading batch dimension of size 1; drop it to
# obtain a flat 1-D tensor of token ids.
input_ids = tokens["input_ids"].squeeze(dim=0)

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


## Splitting the dataset

In [11]:

# Contiguous 90/10 split of the token stream: the first 90% goes to train_ids,
# the remaining tail to val_ids.
n = int(0.9 * input_ids.size(0))
train_ids, val_ids = input_ids[:n], input_ids[n:]

def get_batch(data, block_size, batch_size, device=None):
    """Sample a random batch of next-token-prediction pairs from a token stream.

    Args:
        data: 1-D tensor of token ids to sample windows from.
        block_size: Length of every sampled window.
        batch_size: Number of windows to draw.
        device: Device the batch is moved to. Defaults to ``cfg.device`` when
            omitted, which matches the previous behaviour.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape ``(batch_size, block_size)``.
        ``y`` is ``x`` shifted one position to the right in ``data``, i.e.
        ``y[b, t]`` is the token that follows ``x[b, t]``.

    Raises:
        ValueError: If ``data`` does not hold more than ``block_size`` tokens,
            so that no input window with a shifted target fits.
    """
    if len(data) <= block_size:
        raise ValueError(f"Datensatz ist zu klein für block_size={block_size} (nur {len(data)} Tokens).")

    if device is None:
        # Fall back to the notebook-wide configuration only when the caller
        # did not choose a device explicitly.
        device = cfg.device

    # Start offsets lie in [0, len(data) - block_size), so the target window,
    # which ends one token later than the input window, never runs past the
    # end of `data`. Converting to plain ints avoids slicing with 0-d tensors.
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    x = torch.stack([data[s:s + block_size] for s in starts])
    y = torch.stack([data[s + 1:s + block_size + 1] for s in starts])
    return x.to(device), y.to(device)



## Decoder-only Model


In [12]:

class DecoderOnlyModel(nn.Module):
    """Small causal Transformer language model over token ids.

    Token embeddings are summed with learned positional embeddings, passed
    through a stack of ``nn.TransformerDecoderLayer`` blocks, and projected to
    vocabulary logits. Layer sizes are read from the global ``cfg``.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            output logits.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, cfg.embed_dim)
        # Learned positional embeddings, one per position up to cfg.block_size.
        self.pos_embedding = nn.Parameter(torch.zeros(1, cfg.block_size, cfg.embed_dim))

        decoder_layer = nn.TransformerDecoderLayer(d_model=cfg.embed_dim, nhead=cfg.num_heads, dropout=cfg.dropout)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=cfg.num_layers)

        self.output_proj = nn.Linear(cfg.embed_dim, vocab_size)

    def forward(self, x):
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``. ``seq_len`` must
                be at most ``cfg.block_size``, because only that many
                positional embeddings exist.

        Returns:
            Float tensor of shape ``(batch, seq_len, vocab_size)``. The logits
            at position ``t`` depend only on tokens at positions ``<= t``.
        """
        seq_len = x.size(1)
        tok_emb = self.embedding(x)
        pos_emb = self.pos_embedding[:, :seq_len, :]
        # The decoder layers were built with the default batch_first=False,
        # so they expect (seq_len, batch, embed_dim).
        hidden = (tok_emb + pos_emb).transpose(0, 1)

        # Additive causal mask: -inf strictly above the diagonal, 0 elsewhere,
        # created directly on the input's device.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), device=x.device),
            diagonal=1,
        )

        # The same sequence serves as both `tgt` and `memory`. The
        # cross-attention over `memory` must therefore be masked causally as
        # well; without memory_mask every position could attend to future
        # tokens, including the very token it is trained to predict.
        out = self.decoder(hidden, hidden, tgt_mask=causal_mask, memory_mask=causal_mask)
        return self.output_proj(out.transpose(0, 1))