## Encoder

In [None]:
import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is computed once in ``__init__`` and stored as the buffer
    ``pe`` of shape ``[1, max_len, embed_dim]``. Even channels hold
    ``sin(position * frequency)`` and odd channels hold
    ``cos(position * frequency)``.

    Args:
        embed_dim: Size of the channel dimension (odd values are supported).
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embed_dim there is one odd channel fewer than even
        # channels, so trim the frequencies instead of failing on a shape
        # mismatch. For even embed_dim the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, embed_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``[B, T, C]`` with ``C == embed_dim``.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

class TokenEmbedding(nn.Module):
    """Lookup table that maps integer token ids to dense vectors."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # One learnable row of size embed_dim per vocabulary entry.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

    def forward(self, x):
        """Embed token ids ``x`` ([B, T], long) into vectors ([B, T, embed_dim])."""
        token_vectors = self.embedding(x)
        return token_vectors


class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over ``num_heads`` parallel heads.

    Args:
        embed_dim: Channel size of inputs and output; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape ``[B, T, C]`` into ``[B, num_heads, T, head_dim]``."""
        B, T, C = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape ``[B, num_heads, T, head_dim]`` back into ``[B, T, C]``."""
        x = x.transpose(1, 2).contiguous()  # -> [B, T, num_heads, head_dim]
        B, T, _, _ = x.size()
        return x.view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query: Tensor ``[B, T_q, C]``.
            key: Tensor ``[B, T_k, C]``.
            value: Tensor ``[B, T_k, C]``.
            mask: Optional mask, non-zero = may attend, 0 = masked. Accepted
                shapes: ``[T_q, T_k]``, ``[B, T_q, T_k]`` (a head axis is
                inserted automatically) or anything broadcastable to
                ``[B, num_heads, T_q, T_k]``.

        Returns:
            Tensor ``[B, T_q, C]``. Query rows whose keys are all masked get
            zero attention weights instead of NaN.
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))

        # scores: [B, num_heads, T_q, T_k]
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale

        dead_rows = None
        if mask is not None:
            if mask.dim() == 3:
                # [B, T_q, T_k] would otherwise be aligned as [1, B, T_q, T_k]
                # and collide with the head axis of the scores.
                mask = mask.unsqueeze(1)
            blocked = mask == 0
            # Rows in which every key is masked. They are left unmasked for
            # the softmax (so forward and backward stay finite) and zeroed
            # afterwards; softmax over a row of -inf would be NaN.
            dead_rows = blocked.all(dim=-1, keepdim=True)
            scores = scores.masked_fill(blocked & ~dead_rows, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        if dead_rows is not None:
            attn = attn.masked_fill(dead_rows, 0.0)
        attn = self.dropout(attn)

        out = torch.matmul(attn, V)  # [B, num_heads, T_q, head_dim]
        out = self.combine_heads(out)  # [B, T_q, C]
        return self.out_proj(out)     # [B, T_q, C]


class FeedForward(nn.Module):
    """Position-wise MLP: expand, GELU, dropout, project back, dropout.

    Args:
        embed_dim: Input and output channel size.
        expansion: Hidden width as a multiple of ``embed_dim``.
        dp: Dropout probability used after the activation and the output.
    """

    def __init__(self, embed_dim, expansion=4, dp=0.1):
        super().__init__()
        hidden_dim = embed_dim * expansion
        stages = [
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dp),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dp),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP along the last dimension of ``x``."""
        transformed = self.net(x)
        return transformed


class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention and feed-forward, each residual.

    Args:
        embed_dim: Channel size of the residual stream.
        num_heads: Number of attention heads.
        dp: Dropout probability used inside the sub-layers and on their
            outputs before the residual addition.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, dp=dp)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dp)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to self-attention."""
        # Pre-norm style. Normalise once and reuse the result as query, key
        # and value; calling norm1 three times gives the same values but
        # triples the compute and the activations kept for backward.
        normed = self.norm1(x)
        x = x + self.dropout(self.self_attn(normed, normed, normed, mask=mask))
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x

class TransformerEncoder(nn.Module):
    """Stack of encoder blocks on top of token and positional embeddings.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Channel size used throughout the encoder.
        num_layers: Number of ``TransformerEncoderBlock`` instances.
        num_heads: Attention heads per block.
        dp: Dropout probability passed to every block.
        max_len: Number of positions in the positional-encoding table.
    """

    def __init__(self, vocab_size, embed_dim=512, num_layers=6, num_heads=8, dp=0.1, max_len=5000):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerEncoderBlock(embed_dim, num_heads, dp))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """Encode a batch of token ids.

        Args:
            src_tokens: ``[B, T]`` tensor of token ids (long).
            src_mask: ``None`` or a mask handed unchanged to every block
                (1 -> allowed, 0 -> masked).

        Returns:
            Encoder outputs of shape ``[B, T, C]``.
        """
        hidden = self.pos_enc(self.tok_emb(src_tokens))  # [B, T, C]
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)

def build_padding_mask(pad_mask):
    """Turn a per-token validity mask into a pairwise attention mask.

    Args:
        pad_mask: Tensor of shape ``[B, T]`` where 1 marks a valid token and
            0 marks padding, or ``None``.

    Returns:
        ``None`` when ``pad_mask`` is ``None``; otherwise a ``[B, T, T]``
        tensor whose entry ``[b, i, j]`` is 1 only if both token ``i`` and
        token ``j`` of sample ``b`` are valid. Rows that belong to padded
        query positions are therefore entirely 0.

    Raises:
        ValueError: If ``pad_mask`` is not 2-dimensional.
    """
    if pad_mask is None:
        return None
    if len(pad_mask.shape) != 2:
        raise ValueError(
            f"pad_mask must have shape [B, T], got {tuple(pad_mask.shape)}"
        )
    # Outer product per sample: [B, 1, T] * [B, T, 1] -> [B, T, T].
    return pad_mask.unsqueeze(1) * pad_mask.unsqueeze(2)

## Decoder


In [3]:
import torch
import torch.nn as nn
import math


class DropPath(nn.Module):
    """Stochastic depth: randomly zero whole samples of a residual branch.

    In training mode each sample (index along dim 0) is kept with
    probability ``1 - drop_prob`` and kept samples are scaled by
    ``1 / (1 - drop_prob)``. In eval mode, or when ``drop_prob`` is 0, the
    input is returned unchanged.

    Args:
        drop_prob: Probability of dropping a sample.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        if keep_prob <= 0.0:
            # Everything is dropped. Dividing by keep_prob == 0 would give
            # inf * 0 = NaN, so return zeros explicitly.
            return torch.zeros_like(x)
        # One random draw per sample, broadcast over all remaining dims.
        shape = (x.size(0),) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        return x.div(keep_prob) * random_tensor

class TokenEmbedding(nn.Module):
    """Thin wrapper around ``nn.Embedding`` for token-id lookup."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        table = nn.Embedding(vocab_size, embed_dim)
        self.embedding = table

    def forward(self, x):
        """Return the embedding vectors for the integer ids in ``x``."""
        vectors = self.embedding(x)
        return vectors

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings along dim 1 of the input.

    The table is stored as the buffer ``pe`` of shape
    ``[1, max_len, embed_dim]``: even channels hold ``sin`` and odd channels
    hold ``cos`` of ``position * frequency``.

    Args:
        embed_dim: Size of the channel dimension (odd values are supported).
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one odd channel fewer than even channels;
        # trim the frequencies so the assignment shapes match. For even
        # embed_dim the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, embed_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus ``pe[:, :x.size(1), :]``.

        ``x`` must therefore broadcast against ``[1, x.size(1), embed_dim]``.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]


class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over ``num_heads`` parallel heads.

    Args:
        embed_dim: Channel size of inputs and output; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape ``[B, T, C]`` into ``[B, num_heads, T, head_dim]``."""
        B, T, C = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape ``[B, num_heads, T, head_dim]`` back into ``[B, T, C]``."""
        B, _, T, _ = x.size()
        return x.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value`` (all 3-D, batch first).

        Args:
            query: Tensor ``[B, T_q, C]``.
            key: Tensor ``[B, T_k, C]``.
            value: Tensor ``[B, T_k, C]``.
            mask: Optional mask, non-zero = may attend, 0 = masked. A 3-D
                mask is treated as ``[B, T_q, T_k]`` and gets a head axis
                inserted; any other shape must broadcast to
                ``[B, num_heads, T_q, T_k]``.

        Returns:
            Tensor ``[B, T_q, C]``. Query rows whose keys are all masked get
            zero attention weights instead of NaN.
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale  # [B, H, T_q, T_k]

        dead_rows = None
        if mask is not None:
            if mask.dim() == 3:
                # Without the head axis a [B, T_q, T_k] mask would be aligned
                # as [1, B, T_q, T_k] and clash with the scores.
                mask = mask.unsqueeze(1)
            blocked = mask == 0
            # Rows with every key masked stay unmasked through the softmax
            # (keeps forward/backward finite) and are zeroed afterwards;
            # softmax over a row of -inf would be NaN.
            dead_rows = blocked.all(dim=-1, keepdim=True)
            scores = scores.masked_fill(blocked & ~dead_rows, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        if dead_rows is not None:
            attn = attn.masked_fill(dead_rows, 0.0)
        attn = self.dropout(attn)
        out = torch.matmul(attn, V)
        return self.out_proj(self.combine_heads(out))

class FeedForward(nn.Module):
    """Two-layer MLP with GELU and dropout, applied on the last dimension.

    Args:
        embed_dim: Input and output channel size.
        expansion: Hidden width as a multiple of ``embed_dim``.
        dp: Dropout probability used after the activation and the output.
    """

    def __init__(self, embed_dim, expansion=4, dp=0.1):
        super().__init__()
        inner_dim = embed_dim * expansion
        up_proj = nn.Linear(embed_dim, inner_dim)
        activation = nn.GELU()
        inner_drop = nn.Dropout(dp)
        down_proj = nn.Linear(inner_dim, embed_dim)
        outer_drop = nn.Dropout(dp)
        self.net = nn.Sequential(up_proj, activation, inner_drop, down_proj, outer_drop)

    def forward(self, x):
        """Return the MLP output for ``x``."""
        result = self.net(x)
        return result


class TransformerDecoderBlock(nn.Module):
    """Pre-norm decoder block with LayerScale and stochastic depth.

    Sub-layers, in order: self-attention, optional cross-attention over
    ``enc_out``, feed-forward. Each branch output is multiplied by a learnable
    per-channel scale (``gamma_*``, initialised to 1e-2), passed through
    ``DropPath`` and added to the residual stream.

    Args:
        embed_dim: Channel size of the residual stream.
        num_heads: Number of attention heads.
        dp: Dropout probability inside attention and feed-forward.
        drop_path: Drop probability of the shared ``DropPath`` module.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, drop_path=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.norm3 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.cross_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, dp=dp)

        self.drop_path = DropPath(drop_path)
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_3 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, enc_out=None, self_mask=None, enc_mask=None):
        """Run the block.

        Args:
            x: Decoder hidden states.
            enc_out: Encoder outputs; cross-attention is skipped when ``None``.
            self_mask: Mask forwarded to self-attention.
            enc_mask: Mask forwarded to cross-attention.
        """
        # Masked Self-Attention. Normalise once and reuse for Q, K and V
        # instead of evaluating norm1 three times.
        normed_x = self.norm1(x)
        self_attn_out = self.self_attn(normed_x, normed_x, normed_x, mask=self_mask)
        x = x + self.drop_path(self.gamma_1 * self_attn_out)

        # Cross-Attention. norm2 is applied both to the decoder stream (query)
        # and to the encoder output (key/value); the latter is computed once.
        if enc_out is not None:
            normed_enc = self.norm2(enc_out)
            cross_attn_out = self.cross_attn(self.norm2(x), normed_enc, normed_enc, mask=enc_mask)
            x = x + self.drop_path(self.gamma_2 * cross_attn_out)

        # FeedForward
        ffn_out = self.ffn(self.norm3(x))
        x = x + self.drop_path(self.gamma_3 * ffn_out)

        return x

class TransformerDecoder(nn.Module):
    """Decoder stack that maps token ids to vocabulary logits.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        embed_dim: Channel size used throughout the decoder.
        num_layers: Number of ``TransformerDecoderBlock`` instances.
        num_heads: Attention heads per block.
        dp: Dropout probability passed to every block.
        max_len: Number of positions in the positional-encoding table.
        drop_path: DropPath probability passed to every block.
    """

    def __init__(self, vocab_size, embed_dim=512, num_layers=6, num_heads=8, dp=0.1, max_len=5000, drop_path=0.1):
        super().__init__()
        self.embedding = TokenEmbedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim, max_len)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerDecoderBlock(embed_dim, num_heads, dp, drop_path))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, enc_out=None, self_mask=None, enc_mask=None):
        """Embed ``x``, run every block, normalise and project to logits.

        ``enc_out``, ``self_mask`` and ``enc_mask`` are handed unchanged to
        each block.
        """
        hidden = self.pos_encoding(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden, enc_out, self_mask, enc_mask)
        return self.lm_head(self.norm(hidden))

# 🔹 Encoder

* Token embedding + positional encoding

* Multi-Head Self-Attention (mask opsiyonel)

* FeedForward (MLP + GELU + dropout)

* LayerNorm + residual + pre-norm

* Çoklu encoder blokları (num_layers)

* Padding mask builder

# 🔹 Decoder

* Token embedding + positional encoding

* Masked self-attention

* Cross-attention (encoder çıkışını alır)

* FeedForward

* LayerNorm + residual + pre-norm

* DropPath (stochastic depth) + LayerScale (gamma_1,2,3)

* Çoklu decoder blokları (num_layers)

* Linear output layer (lm_head)

----

# 1️⃣ Temel Parametre Artırımı

| Parametre    | Mevcut | Mini-LLM öneri |
| ------------ | ------ | -------------- |
| embed_dim    | 512    | 1024 / 2048    |
| num_heads    | 8      | 16             |
| num_layers   | 6      | 12             |
| dropout      | 0.1    | 0.1-0.2        |
| ff_expansion | 4      | 4-8            |


# 2️⃣ Positional Encoding

* Rotary Positional Embeddings (RoPE) ekleyelim.

* Daha uzun dizilere (sequence) daha iyi ölçeklenir.

# 3️⃣ Attention

* Self + Cross Attention mevcut → koru.

* Opsiyon: Flash Attention / memory-efficient attention kullanalım.

# 4️⃣ FeedForward

* Genişletilmiş expansion (4 → 8) ile kapasite artıralım.

* GELU veya SwiGLU aktivasyonu kullanılabilir.

# 5️⃣ DropPath + LayerScale

* Mevcut LayerScale parametreleri ve DropPath’i koru → stabilite ve genelleme.

# 6️⃣ Output Layer

* Weight tying: embedding weight’i output head ile paylaşalım.

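* Logits üretiminde: `lm_head` ham logit döndürür (softmax, kayıp fonksiyonunda veya örnekleme sırasında uygulanır); causal mask ise decoder self-attention'a `self_mask` olarak verilir.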

---

# TAM ENCODER

In [4]:
import torch
import torch.nn as nn
import math

# -----------------------------
# DropPath (Stochastic Depth)
# -----------------------------
class DropPath(nn.Module):
    """Stochastic depth: randomly zero whole samples of a residual branch.

    In training mode each sample (index along dim 0) is kept with
    probability ``1 - drop_prob`` and kept samples are scaled by
    ``1 / (1 - drop_prob)``. In eval mode, or when ``drop_prob`` is 0, the
    input is returned unchanged.

    Args:
        drop_prob: Probability of dropping a sample.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        if keep_prob <= 0.0:
            # Everything is dropped. Dividing by keep_prob == 0 would give
            # inf * 0 = NaN, so return zeros explicitly.
            return torch.zeros_like(x)
        # One random draw per sample, broadcast over all remaining dims.
        shape = (x.size(0),) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        return x.div(keep_prob) * random_tensor

# -----------------------------
# Token Embedding
# -----------------------------
class TokenEmbedding(nn.Module):
    """Map integer token ids to learned vectors of size ``embed_dim``."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

    def forward(self, x):
        """Look up the rows of the embedding table selected by ``x``."""
        looked_up = self.embedding(x)
        return looked_up

# -----------------------------
# Positional Encoding
# -----------------------------
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings along dim 1 of the input.

    The table is stored as the buffer ``pe`` of shape
    ``[1, max_len, embed_dim]``: even channels hold ``sin`` and odd channels
    hold ``cos`` of ``position * frequency``.

    Args:
        embed_dim: Size of the channel dimension (odd values are supported).
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one odd channel fewer than even channels;
        # trim the frequencies so the assignment shapes match. For even
        # embed_dim the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, embed_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus ``pe[:, :x.size(1), :]``.

        ``x`` must therefore broadcast against ``[1, x.size(1), embed_dim]``.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

# -----------------------------
# Multi-Head Attention
# -----------------------------
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over ``num_heads`` parallel heads.

    Args:
        embed_dim: Channel size of inputs and output; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads=16, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape ``[B, T, C]`` into ``[B, num_heads, T, head_dim]``."""
        B, T, C = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape ``[B, num_heads, T, head_dim]`` back into ``[B, T, C]``."""
        B, _, T, _ = x.size()
        return x.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value`` (all 3-D, batch first).

        Args:
            query: Tensor ``[B, T_q, C]``.
            key: Tensor ``[B, T_k, C]``.
            value: Tensor ``[B, T_k, C]``.
            mask: Optional mask, non-zero = may attend, 0 = masked. A 3-D
                mask is treated as ``[B, T_q, T_k]`` and gets a head axis
                inserted; any other shape must broadcast to
                ``[B, num_heads, T_q, T_k]``.

        Returns:
            Tensor ``[B, T_q, C]``. Query rows whose keys are all masked get
            zero attention weights instead of NaN.
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale  # [B, H, T_q, T_k]

        dead_rows = None
        if mask is not None:
            if mask.dim() == 3:
                # Without the head axis a [B, T_q, T_k] mask would be aligned
                # as [1, B, T_q, T_k] and clash with the scores.
                mask = mask.unsqueeze(1)
            blocked = mask == 0
            # Rows with every key masked stay unmasked through the softmax
            # (keeps forward/backward finite) and are zeroed afterwards;
            # softmax over a row of -inf would be NaN.
            dead_rows = blocked.all(dim=-1, keepdim=True)
            scores = scores.masked_fill(blocked & ~dead_rows, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        if dead_rows is not None:
            attn = attn.masked_fill(dead_rows, 0.0)
        attn = self.dropout(attn)
        out = torch.matmul(attn, V)
        return self.out_proj(self.combine_heads(out))

# -----------------------------
# FeedForward
# -----------------------------
class _SwiGLU(nn.Module):
    """Split the last dimension in half and gate one half with SiLU of the other."""

    def forward(self, x):
        gate, value = x.chunk(2, dim=-1)
        return nn.functional.silu(gate) * value


class FeedForward(nn.Module):
    """Position-wise MLP with either a GELU or a SwiGLU hidden layer.

    Args:
        embed_dim: Input and output channel size.
        expansion: Hidden width as a multiple of ``embed_dim``.
        dp: Dropout probability used after the activation and the output.
        use_swiglu: If true, the first projection produces ``2 * hidden``
            channels that are combined as ``silu(gate) * value``; otherwise
            a plain GELU MLP is used.
    """

    def __init__(self, embed_dim, expansion=8, dp=0.1, use_swiglu=False):
        super().__init__()
        hidden_dim = embed_dim * expansion
        if use_swiglu:
            # SwiGLU activation. The gate module halves the channel count so
            # the second projection receives hidden_dim features; a bare SiLU
            # here would leave 2 * hidden_dim channels and fail in forward.
            self.net = nn.Sequential(
                nn.Linear(embed_dim, hidden_dim * 2),
                _SwiGLU(),
                nn.Dropout(dp),
                nn.Linear(hidden_dim, embed_dim),
                nn.Dropout(dp)
            )
        else:
            self.net = nn.Sequential(
                nn.Linear(embed_dim, hidden_dim),
                nn.GELU(),
                nn.Dropout(dp),
                nn.Linear(hidden_dim, embed_dim),
                nn.Dropout(dp)
            )

    def forward(self, x):
        """Apply the MLP along the last dimension of ``x``."""
        return self.net(x)

# -----------------------------
# Transformer Encoder Block (LLM)
# -----------------------------
class TransformerEncoderBlockLLM(nn.Module):
    """Pre-norm encoder block with LayerScale and stochastic depth.

    Each branch (self-attention, feed-forward) is multiplied by a learnable
    per-channel scale (``gamma_*``, initialised to 1e-2), passed through
    ``DropPath`` and added to the residual stream.

    Args:
        embed_dim: Channel size of the residual stream.
        num_heads: Number of attention heads.
        dp: Dropout probability inside attention and feed-forward.
        drop_path: Drop probability of the shared ``DropPath`` module.
        expansion: Hidden-width multiplier of the feed-forward network.
        use_swiglu: Forwarded to ``FeedForward``.
    """

    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, use_swiglu=False):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, expansion, dp, use_swiglu)

        self.drop_path = DropPath(drop_path)
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to self-attention."""
        # Self-Attention. Normalise once and reuse for Q, K and V instead of
        # evaluating norm1 three times.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)
        # FeedForward
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)
        return x

# -----------------------------
# Full Transformer Encoder (LLM)
# -----------------------------
class TransformerEncoderLLM(nn.Module):
    """Encoder stack built from ``TransformerEncoderBlockLLM`` blocks.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Channel size used throughout the encoder.
        num_layers: Number of encoder blocks.
        num_heads: Attention heads per block.
        dp: Dropout probability passed to every block.
        drop_path: DropPath probability passed to every block.
        expansion: Feed-forward hidden-width multiplier.
        max_len: Number of positions in the positional-encoding table.
        use_swiglu: Forwarded to every block's feed-forward network.
    """

    def __init__(self, vocab_size, embed_dim=1024, num_layers=12, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, max_len=5000, use_swiglu=False):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        blocks = []
        for _ in range(num_layers):
            blocks.append(
                TransformerEncoderBlockLLM(embed_dim, num_heads, dp, drop_path, expansion, use_swiglu)
            )
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """Embed ``src_tokens``, run every block with ``src_mask``, then normalise."""
        hidden = self.pos_enc(self.tok_emb(src_tokens))
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)

# TAM DECODER

In [5]:
import torch
import torch.nn as nn
import math

# -----------------------------
# DropPath (Stochastic Depth)
# -----------------------------
class DropPath(nn.Module):
    """Stochastic depth: randomly zero whole samples of a residual branch.

    In training mode each sample (index along dim 0) is kept with
    probability ``1 - drop_prob`` and kept samples are scaled by
    ``1 / (1 - drop_prob)``. In eval mode, or when ``drop_prob`` is 0, the
    input is returned unchanged.

    Args:
        drop_prob: Probability of dropping a sample.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        if keep_prob <= 0.0:
            # Everything is dropped. Dividing by keep_prob == 0 would give
            # inf * 0 = NaN, so return zeros explicitly.
            return torch.zeros_like(x)
        # One random draw per sample, broadcast over all remaining dims.
        shape = (x.size(0),) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        return x.div(keep_prob) * random_tensor

# -----------------------------
# Token Embedding
# -----------------------------
class TokenEmbedding(nn.Module):
    """Embedding layer that turns integer token ids into dense vectors."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        lookup = nn.Embedding(vocab_size, embed_dim)
        self.embedding = lookup

    def forward(self, x):
        """Return the embedding rows indexed by ``x``."""
        embedded = self.embedding(x)
        return embedded

# -----------------------------
# Positional Encoding
# -----------------------------
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings along dim 1 of the input.

    The table is stored as the buffer ``pe`` of shape
    ``[1, max_len, embed_dim]``: even channels hold ``sin`` and odd channels
    hold ``cos`` of ``position * frequency``.

    Args:
        embed_dim: Size of the channel dimension (odd values are supported).
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one odd channel fewer than even channels;
        # trim the frequencies so the assignment shapes match. For even
        # embed_dim the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, embed_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus ``pe[:, :x.size(1), :]``.

        ``x`` must therefore broadcast against ``[1, x.size(1), embed_dim]``.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

# -----------------------------
# Multi-Head Attention
# -----------------------------
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over ``num_heads`` parallel heads.

    Args:
        embed_dim: Channel size of inputs and output; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads=16, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape ``[B, T, C]`` into ``[B, num_heads, T, head_dim]``."""
        B, T, C = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape ``[B, num_heads, T, head_dim]`` back into ``[B, T, C]``."""
        B, _, T, _ = x.size()
        return x.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value`` (all 3-D, batch first).

        Args:
            query: Tensor ``[B, T_q, C]``.
            key: Tensor ``[B, T_k, C]``.
            value: Tensor ``[B, T_k, C]``.
            mask: Optional mask, non-zero = may attend, 0 = masked. A 3-D
                mask is treated as ``[B, T_q, T_k]`` and gets a head axis
                inserted; any other shape must broadcast to
                ``[B, num_heads, T_q, T_k]``.

        Returns:
            Tensor ``[B, T_q, C]``. Query rows whose keys are all masked get
            zero attention weights instead of NaN.
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale  # [B, H, T_q, T_k]

        dead_rows = None
        if mask is not None:
            if mask.dim() == 3:
                # Without the head axis a [B, T_q, T_k] mask would be aligned
                # as [1, B, T_q, T_k] and clash with the scores.
                mask = mask.unsqueeze(1)
            blocked = mask == 0
            # Rows with every key masked stay unmasked through the softmax
            # (keeps forward/backward finite) and are zeroed afterwards;
            # softmax over a row of -inf would be NaN.
            dead_rows = blocked.all(dim=-1, keepdim=True)
            scores = scores.masked_fill(blocked & ~dead_rows, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        if dead_rows is not None:
            attn = attn.masked_fill(dead_rows, 0.0)
        attn = self.dropout(attn)
        out = torch.matmul(attn, V)
        return self.out_proj(self.combine_heads(out))

# -----------------------------
# FeedForward
# -----------------------------
class _SwiGLU(nn.Module):
    """Split the last dimension in half and gate one half with SiLU of the other."""

    def forward(self, x):
        gate, value = x.chunk(2, dim=-1)
        return nn.functional.silu(gate) * value


class FeedForward(nn.Module):
    """Position-wise MLP with either a GELU or a SwiGLU hidden layer.

    Args:
        embed_dim: Input and output channel size.
        expansion: Hidden width as a multiple of ``embed_dim``.
        dp: Dropout probability used after the activation and the output.
        use_swiglu: If true, the first projection produces ``2 * hidden``
            channels that are combined as ``silu(gate) * value``; otherwise
            a plain GELU MLP is used.
    """

    def __init__(self, embed_dim, expansion=8, dp=0.1, use_swiglu=False):
        super().__init__()
        hidden_dim = embed_dim * expansion
        if use_swiglu:
            # SwiGLU activation. The gate module halves the channel count so
            # the second projection receives hidden_dim features; a bare SiLU
            # here would leave 2 * hidden_dim channels and fail in forward.
            self.net = nn.Sequential(
                nn.Linear(embed_dim, hidden_dim * 2),
                _SwiGLU(),
                nn.Dropout(dp),
                nn.Linear(hidden_dim, embed_dim),
                nn.Dropout(dp)
            )
        else:
            self.net = nn.Sequential(
                nn.Linear(embed_dim, hidden_dim),
                nn.GELU(),
                nn.Dropout(dp),
                nn.Linear(hidden_dim, embed_dim),
                nn.Dropout(dp)
            )

    def forward(self, x):
        """Apply the MLP along the last dimension of ``x``."""
        return self.net(x)

# -----------------------------
# Transformer Decoder Block (LLM)
# -----------------------------
class TransformerDecoderBlockLLM(nn.Module):
    """Pre-norm decoder block with LayerScale and stochastic depth.

    Sub-layers, in order: self-attention, optional cross-attention over
    ``enc_out``, feed-forward. Each branch output is multiplied by a learnable
    per-channel scale (``gamma_*``, initialised to 1e-2), passed through
    ``DropPath`` and added to the residual stream.

    Args:
        embed_dim: Channel size of the residual stream.
        num_heads: Number of attention heads.
        dp: Dropout probability inside attention and feed-forward.
        drop_path: Drop probability of the shared ``DropPath`` module.
        expansion: Hidden-width multiplier of the feed-forward network.
        use_swiglu: Forwarded to ``FeedForward``.
    """

    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, use_swiglu=False):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.norm3 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.cross_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, expansion, dp, use_swiglu)

        self.drop_path = DropPath(drop_path)
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_3 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, enc_out=None, self_mask=None, enc_mask=None):
        """Run the block.

        Args:
            x: Decoder hidden states.
            enc_out: Encoder outputs; cross-attention is skipped when ``None``.
            self_mask: Mask forwarded to self-attention.
            enc_mask: Mask forwarded to cross-attention.
        """
        # Masked Self-Attention. Normalise once and reuse for Q, K and V
        # instead of evaluating norm1 three times.
        normed_x = self.norm1(x)
        attn_out = self.self_attn(normed_x, normed_x, normed_x, self_mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)

        # Cross-Attention. norm2 is applied both to the decoder stream (query)
        # and to the encoder output (key/value); the latter is computed once.
        if enc_out is not None:
            normed_enc = self.norm2(enc_out)
            cross_out = self.cross_attn(self.norm2(x), normed_enc, normed_enc, enc_mask)
            x = x + self.drop_path(self.gamma_2 * cross_out)

        # FeedForward
        ffn_out = self.ffn(self.norm3(x))
        x = x + self.drop_path(self.gamma_3 * ffn_out)

        return x

# -----------------------------
# Full Transformer Decoder (LLM)
# -----------------------------
class TransformerDecoderLLM(nn.Module):
    """Decoder stack of ``TransformerDecoderBlockLLM`` blocks ending in an LM head.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        embed_dim: Channel size used throughout the decoder.
        num_layers: Number of decoder blocks.
        num_heads: Attention heads per block.
        dp: Dropout probability passed to every block.
        drop_path: DropPath probability passed to every block.
        expansion: Feed-forward hidden-width multiplier.
        max_len: Number of positions in the positional-encoding table.
        use_swiglu: Forwarded to every block's feed-forward network.
    """

    def __init__(self, vocab_size, embed_dim=1024, num_layers=12, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, max_len=5000, use_swiglu=False):
        super().__init__()
        self.embedding = TokenEmbedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim, max_len)
        blocks = []
        for _ in range(num_layers):
            blocks.append(
                TransformerDecoderBlockLLM(embed_dim, num_heads, dp, drop_path, expansion, use_swiglu)
            )
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, enc_out=None, self_mask=None, enc_mask=None):
        """Embed ``x``, run every block, normalise and project to logits.

        ``enc_out``, ``self_mask`` and ``enc_mask`` are handed unchanged to
        each block.
        """
        hidden = self.pos_encoding(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden, enc_out, self_mask, enc_mask)
        return self.lm_head(self.norm(hidden))

# Transformer LLM Genel Akış Diyagramı (Markdown)
```text

Input Tokens (src_tokens / decoder input)
│
├─> **Encoder Side (src_tokens)**
│    │
│    ├─> Token Embedding
│    │
│    ├─> Positional Encoding
│    │
│    └─> **Encoder Layers** (N adet)
│         │
│         ├─ LayerNorm 1
│         ├─ Self-Attention (mask optional)
│         ├─ DropPath + γ₁ scaling
│         └─ Residual Add
│         │
│         ├─ LayerNorm 2
│         ├─ FeedForward Network (FFN)
│         ├─ DropPath + γ₂ scaling
│         └─ Residual Add
│
├─> Final LayerNorm
│
└─> **Encoder Output** (enc_out)
      │
      │
      ▼
Decoder Input Tokens (tgt_tokens)
│
├─> Token Embedding
│
├─> Positional Encoding
│
└─> **Decoder Layers** (M adet)
     │
     ├─ LayerNorm 1
     ├─ Masked Self-Attention (decoder tokens only)
     ├─ DropPath + γ₁ scaling
     └─ Residual Add
     │
     ├─ LayerNorm 2
     ├─ Cross-Attention (enc_out)
     ├─ DropPath + γ₂ scaling
     └─ Residual Add
     │
     ├─ LayerNorm 3
     ├─ FeedForward Network (FFN)
     ├─ DropPath + γ₃ scaling
     └─ Residual Add
│
├─> Final LayerNorm
│
└─> LM Head
     └─ Linear(embed_dim → vocab_size)
     └─ Output: Logits (next token prediction)
