## Generative AI / Transformer Projekt
1. Embedding + Positional Encoding,  
2. Masked Multi-Head Self-Attention,
3. Add & Norm,
4. Feedforward Layer,
5. Putting It All Together: Transformer Decoder Block,
6. Assembling the NanoTransformer (Decoder-Only)

<div style="text-align: center;">
    <img src="https://machinelearningmastery.com/wp-content/uploads/2021/08/attention_research_1.png" alt="Attention Research" style="max-width: 40%; height: auto;">
</div>

Source: [machinelearningmastery.com](https://machinelearningmastery.com/wp-content/uploads/2021/08/attention_research_1.png)

In [None]:
# Initialization: install the packages this notebook needs
%pip install transformers datasets wandb

## Importe + wandb.ai-Anmeldung

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import wandb
# Log in to Weights & Biases (wandb), the experiment tracker used by the
# wandb.init / wandb.log calls later in this file.
wandb.login()

## STEP 1: Embedding + Positional Encoding


In [None]:
class TokenAndPositionalEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of every embedding vector.
        max_len: Number of rows in the position embedding table, i.e. the
            largest second-dimension size ``forward`` can index.
    """

    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)

    def forward(self, x):
        """Embed the ids in ``x`` and add one position vector per column.

        Position indices run from 0 to ``x.size(1) - 1`` and are created on
        the same device as ``x``. The leading singleton axis lets the
        position vectors broadcast over the first dimension of ``x``.
        """
        seq_len = x.size(1)
        position_ids = torch.arange(seq_len, device=x.device)[None, :]
        token_vectors = self.token_embed(x)
        position_vectors = self.pos_embed(position_ids)
        return token_vectors + position_vectors


## STEP 2: Masked Multi-Head Self-Attention (PyTorch)

In [None]:
class MaskedSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (no look-ahead) mask.

    Args:
        d_model: Embedding width handed to ``nn.MultiheadAttention``.
        n_heads: Number of attention heads.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

    def forward(self, x):
        """Attend ``x`` to itself and return only the attention output.

        ``x`` serves as query, key and value. The attention weights that
        ``nn.MultiheadAttention`` also returns are discarded.
        """
        seq_len = x.size(1)
        # Causal mask: True everywhere strictly above the diagonal, so the
        # upper triangle (the "future" positions) is masked out.
        all_pairs = torch.ones(seq_len, seq_len, device=x.device)
        future_mask = all_pairs.triu(diagonal=1).bool()
        attended, _ = self.attn(x, x, x, attn_mask=future_mask)
        return attended


## STEP 3 — Add & Norm (Residual Connection + Layer Normalization)

In [None]:
class AddNorm(nn.Module):
    """Residual addition followed by layer normalization.

    Args:
        d_model: Size of the last dimension that ``nn.LayerNorm`` normalizes.
    """

    def __init__(self, d_model):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, sublayer_output):
        """Return ``LayerNorm(x + sublayer_output)``."""
        residual_sum = x + sublayer_output
        return self.norm(residual_sum)

## STEP 4  - FeedForward Layer (MLP)

In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP: ``d_model -> d_ff -> d_model`` with a ReLU in between.

    Args:
        d_model: Input and output width.
        d_ff: Width of the hidden layer.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()

        expand = nn.Linear(d_model, d_ff)    # widen to the hidden size
        activation = nn.ReLU()               # non-linearity
        project = nn.Linear(d_ff, d_model)   # shrink back to the model width
        self.net = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expand -> ReLU -> project stack."""
        hidden = self.net(x)
        return hidden


## STEP 5 - Putting It All Together: Transformer Decoder Block

In [None]:
class DecoderBlock(nn.Module):
    """One decoder block: masked self-attention, then a feed-forward MLP.

    Each sub-layer is wrapped in a residual connection whose sum is layer
    normalized (``AddNorm``).

    Args:
        d_model: Model width shared by all sub-layers.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        # Sub-layer 1: causal self-attention and its Add & Norm.
        self.attn = MaskedSelfAttention(d_model, n_heads)
        self.add_norm1 = AddNorm(d_model)

        # Sub-layer 2: feed-forward MLP and its Add & Norm.
        self.ff = FeedForward(d_model, d_ff)
        self.add_norm2 = AddNorm(d_model)

    def forward(self, x):
        """Apply attention + Add & Norm, then feed-forward + Add & Norm."""
        attended = self.attn(x)
        x = self.add_norm1(x, attended)

        transformed = self.ff(x)
        return self.add_norm2(x, transformed)


## STEP 6 - Assembling the NanoTransformer (Decoder-Only)

In [None]:
# Final Model

class NanoTransformer(nn.Module):
    """Decoder-only transformer: embedding, decoder blocks, norm, vocab head.

    Args:
        vocab_size: Size of the token vocabulary (also the logit width).
        d_model: Model width.
        n_heads: Attention heads per decoder block.
        d_ff: Hidden width of each block's feed-forward sub-layer.
        max_len: Number of positions in the position embedding table.
        num_layers: How many ``DecoderBlock`` instances to stack.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len, num_layers):
        super().__init__()

        self.embed = TokenAndPositionalEmbedding(vocab_size, d_model, max_len)

        # Stack of identical decoder blocks, applied in order by forward().
        self.blocks = nn.ModuleList()
        for _ in range(num_layers):
            self.blocks.append(DecoderBlock(d_model, n_heads, d_ff))

        # Final layer norm applied before the vocabulary projection.
        self.norm = nn.LayerNorm(d_model)

        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map token ids ``x`` to per-position vocabulary logits."""
        # Token + position embedding.
        hidden = self.embed(x)

        # Decoder blocks, one after another.
        for layer in self.blocks:
            hidden = layer(hidden)

        # Final norm, then project onto the vocabulary.
        normalized = self.norm(hidden)
        return self.output_proj(normalized)

## Step 7 —  DataLoader (HuggingFace - GPT2 Tokenizer)



In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader


tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse EOS as the padding token so padding="max_length" below has a token
# to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Tiny Shakespeare Dataset
dataset = load_dataset("tiny_shakespeare")

max_len = 64
batch_size = 32


def tokenize_function(examples):
    """Tokenize a batch of texts into fixed-length windows of ``max_len`` tokens.

    With ``truncation=True`` alone, every token past ``max_len`` in a text is
    thrown away. For a corpus stored as a few very long rows (which appears
    to be how the Hub distributes tiny_shakespeare; verify on the dataset
    card) that leaves almost nothing to train on.
    ``return_overflowing_tokens=True`` makes the tokenizer return the
    remainder as additional ``max_len``-sized rows instead; only the last
    window of each text is padded.

    NOTE(review): this relies on the batch-overflow behavior of fast
    tokenizers, which AutoTokenizer returns by default; confirm if a slow
    tokenizer is ever substituted.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)`` with a
            ``"text"`` list.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask`` and the
        tokenizer's overflow bookkeeping column), with one entry per window.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_len,
        padding="max_length",
        return_overflowing_tokens=True,
    )


# One input row may now yield many output rows, so the original columns
# (whose length no longer matches) have to be dropped during the map.
train_data = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
val_data = dataset["validation"].map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

# Expose only the tensors the training code reads; this also hides the
# tokenizer's overflow bookkeeping column.
train_data.set_format(type="torch", columns=["input_ids", "attention_mask"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)


## STEP 8 - Model Hyperparameters

In [None]:
# ✅ Hyperparameters

# Optimization settings
epochs = 50
batch_size = 32
lr = 1e-4

# Architecture settings
vocab_size = tokenizer.vocab_size  # vocabulary size reported by the tokenizer
d_model = 128                      # embedding / attention width
n_heads = 4                        # number of attention heads
d_ff = 512                         # feed-forward hidden width
max_len = 64                       # input sequence length
num_layers = 2                     # number of decoder blocks

# ✅ Model
model = NanoTransformer(
    vocab_size=vocab_size, d_model=d_model, n_heads=n_heads,
    d_ff=d_ff, max_len=max_len, num_layers=num_layers,
)


## STEP 8 - wandb.ai Initialisierung

In [None]:
# Hyperparameters recorded alongside the run in Weights & Biases.
run_config = {
    "epochs": epochs,
    "batch_size": batch_size,
    "d_model": d_model,
    "n_heads": n_heads,
    "d_ff": d_ff,
    "num_layers": num_layers,
    "lr": lr,
    "max_len": max_len,
}

wandb.init(project="nano-transformer", config=run_config)

## STEP 9 - Evaluation und Training Loop + wandb logging


In [None]:
# ✅ Evaluation function
@torch.no_grad()
def evaluate(model, val_loader, criterion, device):
    """Return the mean next-token loss of ``model`` over ``val_loader``.

    The logits at position ``t`` are scored against the token at position
    ``t + 1``. Scoring them against the token at ``t`` itself would only
    measure how well the model copies a token it is allowed to see.

    Args:
        model: Module mapping ``[B, T]`` token ids to ``[B, T, vocab]`` logits.
        val_loader: Iterable of batches; each batch is a mapping with an
            ``"input_ids"`` tensor of shape ``[B, T]``.
        criterion: Loss taking ``(logits [N, vocab], targets [N])``.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The per-batch losses averaged over all batches, or ``nan`` when
        ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    for batch in val_loader:
        inputs = batch["input_ids"].to(device)

        outputs = model(inputs)  # [B, T, vocab]
        # Shift by one: drop the last prediction (it has no target) and the
        # first token (nothing predicts it).
        logits = outputs[:, :-1, :]
        targets = inputs[:, 1:]

        # reshape, not view: the shifted slices are not contiguous.
        loss = criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing to average; avoid a ZeroDivisionError.
        return float("nan")
    return total_loss / num_batches

In [None]:
import torch.nn.functional as F
import torch.optim as optim



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        inputs = batch["input_ids"].to(device)

        outputs = model(inputs)  # output = [B, T, vocab_size]

        # Next-token objective: the logits at position t are trained to
        # predict the token at t + 1. Using the unshifted inputs as targets
        # would only teach the model to copy a token it can already see.
        logits = outputs[:, :-1, :]
        targets = inputs[:, 1:]

        # reshape, not view: the shifted slices are not contiguous.
        loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch training losses for this epoch.
    avg_loss = total_loss / len(train_loader)

    val_loss = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_loss:.4f} | Val Loss: {val_loss:.4f}")

    # 🎯 wandb log
    wandb.log({
        "train_loss": avg_loss,
        "val_loss": val_loss,
        "epoch": epoch + 1
    })

wandb.finish()


## STEP 10 — Text Generation


In [None]:
def generate(model, start_token, max_len=50, temperature=0.7, top_k=50, device="cpu",
             context_len=None):
    """Sample a continuation of ``start_token`` one token at a time.

    Args:
        model: Module mapping ``[B, T]`` token ids to ``[B, T, vocab]`` logits.
        start_token: Tensor of prompt token ids with shape ``[B, T]``.
        max_len: Number of new tokens to sample.
        temperature: Positive divisor applied to the logits before softmax;
            smaller values sharpen the distribution.
        top_k: If a positive int, sampling is restricted to the ``top_k``
            highest-scoring tokens (ties with the k-th score are kept).
            ``None`` or ``0`` disables the filter.
        device: Device the prompt is moved to.
        context_len: Optional positive int. When given, only the last
            ``context_len`` tokens are fed to the model at each step, so the
            running sequence can grow past the model's position table.
            ``None`` feeds the whole sequence.

    Returns:
        The prompt plus the sampled tokens, with size-1 dimensions squeezed
        out, as a Python list (a flat list of ids for a single prompt).

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()
    input_ids = start_token.to(device)

    # Sampling needs no gradients; skipping autograd saves memory and time.
    with torch.no_grad():
        for _ in range(max_len):
            if context_len is None:
                context = input_ids
            else:
                context = input_ids[:, -context_len:]

            logits = model(context)
            next_token_logits = logits[:, -1, :] / temperature

            if top_k is not None and top_k > 0:
                # Mask everything scoring below the k-th best logit so those
                # tokens receive zero probability after softmax.
                k = min(top_k, next_token_logits.size(-1))
                kth_best = torch.topk(next_token_logits, k, dim=-1).values[:, -1:]
                next_token_logits = next_token_logits.masked_fill(
                    next_token_logits < kth_best, float("-inf")
                )

            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=1)

    return input_ids.squeeze().tolist()


In [None]:
start_text = "My love for thee"

# Encode the prompt as a tensor and move it to the active device.
prompt_tensor = tokenizer.encode(start_text, return_tensors="pt")
input_ids = prompt_tensor.to(device)
print("Input IDs:", input_ids.shape)

# Generation
output_ids = generate(
    model,
    input_ids,
    max_len=50,
    temperature=0.7,
    top_k=30,
    device=device,
)

# Turn the sampled ids back into text, dropping special tokens.
output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
print(output_text)


## Step 11 - Hugging Face Transformers

In [None]:
import wandb

# Log in to Weights & Biases again before the run started below.
wandb.login()



In [None]:
# Settings recorded alongside the distilgpt2 run in Weights & Biases.
hf_run_config = {
    "model_name": "distilgpt2",
    "dataset": "wikitext-2",
    "max_length": 50,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 0.95,
}

wandb.init(project="distilgpt2-wikitext2", config=hf_run_config)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint name passed to both from_pretrained calls below.
model_name = "distilgpt2"
# NOTE: these two assignments rebind `tokenizer` and `model`, replacing the
# GPT-2 tokenizer and the NanoTransformer bound to those names earlier in
# this file.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
import torch

prompt = "In the future, AI will"
inputs = tokenizer(prompt, return_tensors="pt")

# Sampling settings for generate(); EOS doubles as the padding id.
generation_kwargs = {
    "attention_mask": inputs["attention_mask"],
    "pad_token_id": tokenizer.eos_token_id,
    "max_length": 50,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 0.95,
    "do_sample": True,
    "num_return_sequences": 1,
}

# Generate text
outputs = model.generate(inputs["input_ids"], **generation_kwargs)

# Decode the first (and only) returned sequence and show it.
first_sequence = outputs[0]
generated_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(generated_text)

# Log the sample to the active wandb run, then close the run.
wandb.log({"generated_text": wandb.Html(generated_text)})

wandb.finish()