In [68]:


import torch
import torch.nn as nn

class GPTDecoderOnlyModel(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, dim_feedforward=256, max_seq_length=128, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_embedding = nn.Parameter(torch.zeros(1, max_seq_length, d_model))

        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ) for _ in range(num_layers)
        ])
        
        self.norm = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)
        self.max_seq_length = max_seq_length

    def generate_causal_mask(self, seq_len, device):
        return torch.triu(torch.full((seq_len, seq_len), float('-inf')), diagonal=1).to(device)

    def forward(self, input_ids):
        B, T = input_ids.shape
        x = self.token_embedding(input_ids)
        x = x + self.positional_embedding[:, :T, :]
        mask = self.generate_causal_mask(T, input_ids.device)

        for block in self.blocks:
            x = block(x, mask)

        x = self.norm(x)
        logits = self.lm_head(x)
        return logits



In [69]:

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
#"""
model = GPTDecoderOnlyModel(vocab_size=tokenizer.vocab_size)
model.eval()



GPTDecoderOnlyModel(
  (token_embedding): Embedding(50257, 128)
  (blocks): ModuleList(
    (0-1): 2 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (linear1): Linear(in_features=128, out_features=256, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=256, out_features=128, bias=True)
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (lm_head): Linear(in_features=128, out_features=50257, bias=True)
)

In [70]:
from datasets import load_dataset



dataset = load_dataset("Trelis/tiny-shakespeare", split="train")

texts = [tokenizer.encode(x['Text'], truncation=True, max_length=64, padding="max_length") for x in dataset if len(x['Text']) > 0]



# Umwandlung in Tensor mit richtiger Dimension (List[List[int]] -> Tensor)
import torch.utils.data as data

inputs_tensor = torch.tensor(texts, dtype=torch.long)
dataset = data.TensorDataset(inputs_tensor)
batch_size = 32
dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)









In [71]:
from torch.utils.data import random_split

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=batch_size)


In [72]:
@torch.no_grad()
def evaluate(model, dataloader, vocab_size, device):
    model.eval()
    total_loss = 0
    loss_fn = nn.CrossEntropyLoss()

    for batch in dataloader:
        batch = batch[0]
        inputs = batch[:, :-1].to(device)
        targets = batch[:, 1:].to(device)

        logits = model(inputs)
        logits = logits.reshape(-1, vocab_size)
        targets = targets.reshape(-1)

        loss = loss_fn(logits, targets)
        total_loss += loss.item()

    return total_loss / len(dataloader)

In [73]:
def generate(model, tokenizer, prompt, max_new_tokens=50, device="cpu", temperature=1.0, top_k=50):
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    for _ in range(max_new_tokens):
        logits = model(input_ids)
        logits = logits[:, -1, :] / temperature

        # top-k sampling
        if top_k is not None:
            top_k = min(top_k, logits.size(-1))
            values, indices = torch.topk(logits, top_k)
            probs = torch.softmax(values, dim=-1)
            next_token = indices.gather(1, torch.multinomial(probs, num_samples=1))
        else:
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

        input_ids = torch.cat([input_ids, next_token], dim=-1)

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)



In [74]:
import wandb

wandb.login()  # du wirst aufgefordert, deinen API-Key einzugeben

True

In [75]:

from tqdm.notebook import tqdm

import torch.optim as optim

def train(model, train_loader, val_loader, vocab_size, device, epochs=5, lr=1e-4):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    wandb.init(project="tiny-gpt", name="GPT-run-1", config={
    "epochs": epochs,
    "batch_size": batch_size,
    "learning_rate": lr,
    "model_dim": model.token_embedding.embedding_dim,
    "layers": len(model.blocks),
    "vocab_size": vocab_size
})
    wandb.watch(model)
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            batch = batch[0]  # Extrahiere eigentlichen Tensor
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.reshape(-1, vocab_size)  # statt .view
            targets = targets.reshape(-1)


            loss = loss_fn(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        val_loss = evaluate(model, val_loader, vocab_size, device)
        #print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}, Val_Loss = {val_loss:.4f}")
        prompt = "i have shoes with"

        generated_text = generate(model, tokenizer, prompt, max_new_tokens=50, device=device)
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": avg_loss,
            "val_loss": val_loss,
            "generated_text": wandb.Html(f"<pre>{generated_text}</pre>"),
        })

        print(f"Epoch {epoch+1}/{wandb.config.epochs}, Train Loss: {avg_loss:.4f}, Val_Loss = {val_loss:.4f}")
        
        print(generated_text)
      
    wandb.finish()



In [76]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train(model, train_loader, val_loader,  vocab_size, device, epochs=5, lr=3e-4)

Training Epoch 1:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 1/5, Train Loss: 9.8676, Val_Loss = 10.0273
i have shoes with2009Famask350 mafiainput MannbraIns towering KJoiat racistsccording Let Debugjpowskilicts synonymous cha Cherry shaky precariouswalking covered Elvisumperghai1972 visibility prevents Corinthians KashmirKINGjp Cogn Trayvon Repeatctive R wherein interHe cas'?tten
,


Training Epoch 2:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 2/5, Train Loss: 8.8849, Val_Loss = 8.9997
i have shoes with
 news

 sentence


 yourself like
 uncle
Wh

.

 times


TheW
LoARDous





KEN
KINGHere firmly
O
ICK are
 gAngelo




Training Epoch 3:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 3/5, Train Loss: 7.9453, Val_Loss = 8.1708
i have shoes with
My affect with
'd my are
 news newsMy

Your

, my
ous




The
 him times

 the
 so,--


, are are our
: yourself





Training Epoch 4:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 4/5, Train Loss: 7.1924, Val_Loss = 7.5133
i have shoes with--

 like
 him this,.



'd



: me:



ARDBR with are
 the

KING.


 see Warwick'dly

.'s
 them soThe


Training Epoch 5:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 5/5, Train Loss: 6.5959, Val_Loss = 7.0475
i have shoes with
:

 our this





 see

 me

 I
 the
 you'd

,




'd


 are thy him,,'d
 the,
, are me,


0,1
epoch,▁▃▅▆█
train_loss,█▆▄▂▁
val_loss,█▆▄▂▁

0,1
epoch,5.0
train_loss,6.5959
val_loss,7.04754


In [77]:
prompt = "i have shoes with"

generated_text = generate(model, tokenizer, prompt, max_new_tokens=50, device=device)
print(generated_text)

i have shoes with
 not



 my thou;.
-- with,

 are lord
My

 is but,, this my: this a theBR


,The


 our,
 the, fromHe,

