# 🛠 Transformer Encoder Yapısı

* Encoder, giriş token’larını alır ve her token için bağlamı (context) öğrenen bir gizli temsil üretir.
Bu temsil, decoder tarafından kullanılacak ve cross-attention ile output üretilecek.

# -----------------------------
# Positional Encoding (sin/cos)
# -----------------------------

In [6]:
import torch
import torch.nn as nn
import math

In [7]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is computed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``[1, max_len, embed_dim]``.

    Args:
        embed_dim: Size of the last (feature) dimension; may be even or odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim)
        )
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one fewer odd column than even columns, so the
        # cosine half must be trimmed or the assignment raises a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, embed_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the encodings of its first ``x.size(1)`` positions added.

        Args:
            x: Tensor indexed as ``[B, T, C]`` with ``C == embed_dim``.

        Raises:
            ValueError: If ``T`` is larger than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Fail with a clear message instead of an opaque broadcast error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        return x + self.pe[:, :seq_len, :]

# -----------------------------
# Token Embedding
# -----------------------------

In [8]:
class TokenEmbedding(nn.Module):
    """Learned lookup table that turns token ids into dense vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )

    def forward(self, x):
        """Embed integer ids ``x`` ([B, T]) and return a [B, T, embed_dim] tensor."""
        vectors = self.embedding(x)
        return vectors

# -----------------------------
# Multi-Head Attention
# -----------------------------

In [9]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across ``num_heads`` heads.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape [B, T, C] into [B, num_heads, T, head_dim]."""
        B, T, _ = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape [B, num_heads, T, head_dim] back into [B, T, C]."""
        x = x.transpose(1, 2).contiguous()  # -> [B, T, num_heads, head_dim]
        B, T, _, _ = x.size()
        return x.view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: Tensors of shape [B, T, C].
            mask: Optional mask, 1 for allowed and 0 for masked positions.
                Accepts [T_q, T_k], [B, T_q, T_k], or anything broadcastable
                to [B, num_heads, T_q, T_k].

        Returns:
            Tensor of shape [B, T_q, C].
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))

        # scores: [B, num_heads, T_q, T_k]
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale

        if mask is not None:
            if mask.dim() == 3:
                # [B, T_q, T_k] -> [B, 1, T_q, T_k]; without the head axis the
                # batch axis would be broadcast against num_heads.
                mask = mask.unsqueeze(1)
            # Fill with the most negative finite value rather than -inf so a
            # row that is masked everywhere softmaxes to a uniform
            # distribution instead of NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        out = torch.matmul(attn, V)  # [B, num_heads, T_q, head_dim]
        out = self.combine_heads(out)  # [B, T_q, C]
        return self.out_proj(out)     # [B, T_q, C]

# -----------------------------
# Position-wise FeedForward
# -----------------------------

In [10]:
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        embed_dim: Input and output width.
        expansion: Multiplier giving the hidden width (``embed_dim * expansion``).
        dp: Dropout probability used after the activation and after the output layer.
    """

    def __init__(self, embed_dim, expansion=4, dp=0.1):
        super().__init__()
        hidden_dim = embed_dim * expansion
        stages = [
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dp),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dp),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        projected = self.net(x)
        return projected


# -----------------------------
# Transformer Encoder Block
# -----------------------------

In [11]:
class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, each with a residual.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        dp: Dropout probability for attention, the FFN and both residual branches.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, dp=dp)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dp)

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to self-attention unchanged."""
        # Pre-norm style. Normalise once and reuse the result for query, key
        # and value instead of running the same LayerNorm three times.
        normed = self.norm1(x)
        x = x + self.dropout(self.self_attn(normed, normed, normed, mask=mask))
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x

# -----------------------------
# Full Transformer Encoder
# -----------------------------

In [12]:
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder blocks + final LayerNorm.

    Args:
        vocab_size: Size of the token vocabulary.
        embed_dim: Model width.
        num_layers: Number of ``TransformerEncoderBlock`` layers.
        num_heads: Attention heads per layer.
        dp: Dropout probability passed to every layer.
        max_len: Number of positions precomputed by ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, embed_dim=512, num_layers=6, num_heads=8, dp=0.1, max_len=5000):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList(
            TransformerEncoderBlock(embed_dim, num_heads, dp)
            for _ in range(num_layers)
        )
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """
        Encode a batch of token ids.

        src_tokens: [B, T] (long)
        src_mask: None, or a mask broadcastable to [B, num_heads, T, T]
                  (the attention layer documents [T, T] and [B, T, T] as well);
                  mask values: 1 -> allowed, 0 -> masked
        returns: encoder_outputs [B, T, C]
        """
        hidden = self.pos_enc(self.tok_emb(src_tokens))  # [B, T, C]
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)

# -----------------------------
# Helper: Padding mask builder (optional)
# -----------------------------

In [13]:
def build_padding_mask(pad_mask):
    """Expand a per-token validity mask into an attention mask.

    Only the key axis is masked. Masking padded query rows as well would
    leave rows with no allowed key, which become NaN after a softmax over
    ``-inf`` scores.

    Args:
        pad_mask: [B, T] tensor where 1 means a valid token and 0 means
            padding, or ``None``.

    Returns:
        ``None`` if ``pad_mask`` is ``None``; otherwise a [B, 1, T, T] view of
        1/0 values in which entry ``[b, 0, q, k]`` equals ``pad_mask[b, k]``.
        The singleton axis broadcasts over attention heads.
    """
    if pad_mask is None:
        return None
    batch, seq_len = pad_mask.shape
    # allowed positions along the keys dimension, shared by every query row
    key_mask = pad_mask[:, None, None, :]  # [B, 1, 1, T]
    return key_mask.expand(batch, 1, seq_len, seq_len)

# TAM KOD 

In [None]:
import torch
import torch.nn as nn
import math

# -----------------------------
# Positional Encoding (sin/cos)
# -----------------------------
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is computed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``[1, max_len, embed_dim]``.

    Args:
        embed_dim: Size of the last (feature) dimension; may be even or odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim)
        )
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one fewer odd column than even columns, so the
        # cosine half must be trimmed or the assignment raises a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, embed_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the encodings of its first ``x.size(1)`` positions added.

        Args:
            x: Tensor indexed as ``[B, T, C]`` with ``C == embed_dim``.

        Raises:
            ValueError: If ``T`` is larger than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Fail with a clear message instead of an opaque broadcast error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        return x + self.pe[:, :seq_len, :]

# -----------------------------
# Token Embedding
# -----------------------------
class TokenEmbedding(nn.Module):
    """Learned lookup table that turns token ids into dense vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )

    def forward(self, x):
        """Embed integer ids ``x`` ([B, T]) and return a [B, T, embed_dim] tensor."""
        vectors = self.embedding(x)
        return vectors

# -----------------------------
# Multi-Head Attention
# -----------------------------
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across ``num_heads`` heads.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape [B, T, C] into [B, num_heads, T, head_dim]."""
        B, T, _ = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape [B, num_heads, T, head_dim] back into [B, T, C]."""
        x = x.transpose(1, 2).contiguous()  # -> [B, T, num_heads, head_dim]
        B, T, _, _ = x.size()
        return x.view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: Tensors of shape [B, T, C].
            mask: Optional mask, 1 for allowed and 0 for masked positions.
                Accepts [T_q, T_k], [B, T_q, T_k], or anything broadcastable
                to [B, num_heads, T_q, T_k].

        Returns:
            Tensor of shape [B, T_q, C].
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))

        # scores: [B, num_heads, T_q, T_k]
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale

        if mask is not None:
            if mask.dim() == 3:
                # [B, T_q, T_k] -> [B, 1, T_q, T_k]; without the head axis the
                # batch axis would be broadcast against num_heads.
                mask = mask.unsqueeze(1)
            # Fill with the most negative finite value rather than -inf so a
            # row that is masked everywhere softmaxes to a uniform
            # distribution instead of NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        out = torch.matmul(attn, V)  # [B, num_heads, T_q, head_dim]
        out = self.combine_heads(out)  # [B, T_q, C]
        return self.out_proj(out)     # [B, T_q, C]

# -----------------------------
# Position-wise FeedForward
# -----------------------------
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        embed_dim: Input and output width.
        expansion: Multiplier giving the hidden width (``embed_dim * expansion``).
        dp: Dropout probability used after the activation and after the output layer.
    """

    def __init__(self, embed_dim, expansion=4, dp=0.1):
        super().__init__()
        hidden_dim = embed_dim * expansion
        stages = [
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dp),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dp),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        projected = self.net(x)
        return projected

# -----------------------------
# Transformer Encoder Block
# -----------------------------
class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, each with a residual.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        dp: Dropout probability for attention, the FFN and both residual branches.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, dp=dp)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dp)

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to self-attention unchanged."""
        # Pre-norm style. Normalise once and reuse the result for query, key
        # and value instead of running the same LayerNorm three times.
        normed = self.norm1(x)
        x = x + self.dropout(self.self_attn(normed, normed, normed, mask=mask))
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x

# -----------------------------
# Full Transformer Encoder
# -----------------------------
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder blocks + final LayerNorm.

    Args:
        vocab_size: Size of the token vocabulary.
        embed_dim: Model width.
        num_layers: Number of ``TransformerEncoderBlock`` layers.
        num_heads: Attention heads per layer.
        dp: Dropout probability passed to every layer.
        max_len: Number of positions precomputed by ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, embed_dim=512, num_layers=6, num_heads=8, dp=0.1, max_len=5000):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList(
            TransformerEncoderBlock(embed_dim, num_heads, dp)
            for _ in range(num_layers)
        )
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """
        Encode a batch of token ids.

        src_tokens: [B, T] (long)
        src_mask: None, or a mask broadcastable to [B, num_heads, T, T]
                  (the attention layer documents [T, T] and [B, T, T] as well);
                  mask values: 1 -> allowed, 0 -> masked
        returns: encoder_outputs [B, T, C]
        """
        hidden = self.pos_enc(self.tok_emb(src_tokens))  # [B, T, C]
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)

# -----------------------------
# Helper: Padding mask builder (optional)
# -----------------------------
def build_padding_mask(pad_mask):
    """Expand a per-token validity mask into an attention mask.

    Only the key axis is masked. Masking padded query rows as well would
    leave rows with no allowed key, which become NaN after a softmax over
    ``-inf`` scores.

    Args:
        pad_mask: [B, T] tensor where 1 means a valid token and 0 means
            padding, or ``None``.

    Returns:
        ``None`` if ``pad_mask`` is ``None``; otherwise a [B, 1, T, T] view of
        1/0 values in which entry ``[b, 0, q, k]`` equals ``pad_mask[b, k]``.
        The singleton axis broadcasts over attention heads.
    """
    if pad_mask is None:
        return None
    batch, seq_len = pad_mask.shape
    # allowed positions along the keys dimension, shared by every query row
    key_mask = pad_mask[:, None, None, :]  # [B, 1, 1, T]
    return key_mask.expand(batch, 1, seq_len, seq_len)


---
# Encoder yapısını daha ileriye taşıyalım. Bazı eklentiler ekleyeceğiz. Önce bu eklentilerin ne olduğunu görelim; en sonda da hepsini entegre ederek modelin tamamını daha ileriye taşımış olalım.
---

# 1️⃣ DropPath (Stochastic Depth)

* Derin Transformer’larda bazı blokları rastgele bypass etmek için kullanılır.

* Training stabilitesini ve genelleme performansını artırır.

Kod: DropPath sınıfı.

### DropPath (Stochastic Depth) Nedir?

* DropPath, derin Transformer’larda bazı blokların çıkışını rastgele sıfırlayıp bypass etmek için kullanılır.

* Bu, çok derin ağlarda training stabilitesini artırır ve overfitting’i azaltır.

* Dropout’tan farkı: Dropout bir tensor içindeki elemanları rastgele sıfırlar; DropPath ise bir bloğun çıkışını tamamen sıfırlayarak bloğu atlatır (sinyal yalnızca residual yol üzerinden geçer).

#### Nereye entegre edeceğiz?

Bir Transformer Encoder bloğu tipik olarak şöyle görünür:

```python
x = x + Attention(LayerNorm(x))
x = x + FFN(LayerNorm(x))
```

* Burada Residual + PreNorm mevcut.

#### DropPath’i entegre etmek için:

```python
x = x + drop_path(Attention(LayerNorm(x)))
x = x + drop_path(FFN(LayerNorm(x)))
```

## Kod Örneği

In [15]:
class DropPath(nn.Module):
    """Stochastic depth: zero a whole sample's branch output at random during training.

    Surviving samples are rescaled by ``1 / keep_prob``. In eval mode, or when
    ``drop_prob`` is 0, the input is returned unchanged.

    Args:
        drop_prob: Probability of dropping each sample's branch.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Return ``x`` with a per-sample keep/drop mask applied along dimension 0."""
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        if keep_prob <= 0.0:
            # Everything is dropped; dividing by keep_prob would give
            # inf * 0 = NaN, so return zeros directly.
            return torch.zeros_like(x)
        # One random draw per sample, broadcast over all remaining dimensions.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        return x / keep_prob * random_tensor

## Encoder Blok Örneği ile Kullanımı

In [17]:
class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder layer whose residual branches pass through DropPath.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        dp: Dropout probability used inside attention and the FFN.
        drop_path: Drop probability for the shared ``DropPath`` module.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, drop_path=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, dp=dp)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.drop_path = DropPath(drop_path)
        # Kept so the attribute stays available; forward() does not use it.
        self.dropout = nn.Dropout(dp)

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to self-attention unchanged."""
        # PreNorm + DropPath. Normalise once and reuse the result for query,
        # key and value instead of running the same LayerNorm three times.
        normed = self.norm1(x)
        x = x + self.drop_path(self.self_attn(normed, normed, normed, mask=mask))
        x = x + self.drop_path(self.ffn(self.norm2(x)))
        return x

### ✅ Özet:

* DropPath’i her residual connection’a ekliyoruz.

* drop_prob ile ne kadar blok rastgele bypass edileceğini kontrol ediyoruz.

* Training sırasında aktif, eval modunda devre dışı.

---

# 2️⃣ LayerScale Nedir?

* Transformer bloklarının çıkışını küçük bir katsayı ile çarpar (alpha * block_out).

* Derin modellerde, özellikle LLM’lerde, training stabilitesini artırır.

* Küçük bir alpha (örn. 1e-4 ~ 1e-2) ile başlatılır ve öğrenilebilir parametredir

### Nereye entegre ediyoruz?

* Önceki DropPath’li blokta her residual connection üzerine uygulayabiliriz:

```python
# x = x + drop_path(block_out)
# → LayerScale ile
x = x + drop_path(alpha * block_out)
```

* alpha tensor olarak embed_dim boyutunda ve learnable olabilir.

* Genellikle her blok için ayrı alpha kullanılır.

### **Kod Örneği**

In [19]:
class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder layer with LayerScale and DropPath on both residual branches.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        dp: Dropout probability used inside attention and the FFN.
        drop_path: Drop probability for the shared ``DropPath`` module.
        init_values: Initial value of every LayerScale gain.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, drop_path=0.1, init_values=1e-4):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, dp=dp)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.drop_path = DropPath(drop_path)
        # Kept so the attribute stays available; forward() does not use it.
        self.dropout = nn.Dropout(dp)

        # LayerScale: one learnable gain per channel for each branch.
        self.gamma_1 = nn.Parameter(init_values * torch.ones(embed_dim))
        self.gamma_2 = nn.Parameter(init_values * torch.ones(embed_dim))

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to self-attention unchanged."""
        # Self-Attention + LayerScale + DropPath. Normalise once and reuse the
        # result for query, key and value.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask=mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)

        # FFN + LayerScale + DropPath
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)

        return x

## ✅ Özet:

* gamma_1 ve gamma_2 → learnable scale parametreleri

* DropPath + LayerScale → modern LLM bloklarının temel stabilite mekanizması

* DropPath yalnızca training sırasında aktiftir ve eval modunda devre dışı kalır; LayerScale (gamma) ise öğrenilmiş bir ölçek olduğu için eval modunda da uygulanır

---

# 3️⃣ FFN: GeLU → SwiGLU (Modern LLM)
### Neden SwiGLU?

* GeLU: klasik Transformer FFN aktivasyonu.

* SwiGLU (Gated Linear Unit):

> Daha hızlı öğrenme sağlar

> Daha iyi genelleme (özellikle LLM’lerde)

> Modern LLM’lerde yaygın (LLaMA, PaLM, Mistral vb.)

### **Kod Örneği: SwiGLU FFN**

In [20]:
class SwiGLUFFN(nn.Module):
    """Gated feed-forward layer: ``w3(SiLU(w2(x)) * w1(x))`` followed by dropout.

    SwiGLU gates with Swish/SiLU. Gating with GELU would be the GEGLU
    variant instead.

    Args:
        embed_dim: Input and output width.
        expansion: Multiplier giving the hidden width (``embed_dim * expansion``).
        dp: Dropout probability applied to the output.
    """

    def __init__(self, embed_dim, expansion=4, dp=0.1):
        super().__init__()
        hidden_dim = embed_dim * expansion
        self.w1 = nn.Linear(embed_dim, hidden_dim)  # value projection
        self.w2 = nn.Linear(embed_dim, hidden_dim)  # gate projection
        self.w3 = nn.Linear(hidden_dim, embed_dim)  # output projection
        self.dropout = nn.Dropout(dp)
        self.act = nn.SiLU()  # Swish activation on the gate

    def forward(self, x):
        """Return the gated projection of ``x`` with the same trailing width."""
        gate = self.act(self.w2(x))
        return self.dropout(self.w3(gate * self.w1(x)))


### **Encoder Blokta Kullanımı**

LayerScale ve DropPath ile kombine edersek:

In [21]:
class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder layer: attention + SwiGLU FFN, with LayerScale and DropPath.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        dp: Dropout probability used inside attention and the FFN.
        drop_path: Drop probability for the shared ``DropPath`` module.
        init_values: Initial value of every LayerScale gain.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, drop_path=0.1, init_values=1e-4):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = SwiGLUFFN(embed_dim, dp=dp)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.drop_path = DropPath(drop_path)

        # LayerScale: one learnable gain per channel for each branch.
        self.gamma_1 = nn.Parameter(init_values * torch.ones(embed_dim))
        self.gamma_2 = nn.Parameter(init_values * torch.ones(embed_dim))

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to self-attention unchanged."""
        # Self-Attention + LayerScale + DropPath. Normalise once and reuse the
        # result for query, key and value.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask=mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)

        # SwiGLU FFN + LayerScale + DropPath
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)

        return x


## ✅ Özet:

* FFN artık SwiGLU tabanlı → LLM standardı

* LayerScale + DropPath ile residual bağlantılar stabilize edilmiş

* Self-Attention ve FFN için modern LLM tasarımı tamamlanmış oldu

----
# 4️⃣ Rotary Positional Encoding (RoPE)
### Neden RoPE?

* Standart sin/cos positional encoding yalnızca ekleme ile çalışır.

* RoPE, attention mekanizmasına rotasyon matrisi ile pozisyon bilgisi ekler.

* Uzun dizilerde genelleme performansını artırır.

* Modern LLM’lerin çoğu (LLaMA, Falcon, GPT-NeoX) kullanıyor.

### **Kod Örneği: Rotary Embedding Uygulaması**

In [23]:
import torch
import math

class RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) for per-head query/key tensors.

    Each adjacent channel pair ``(2i, 2i+1)`` at position ``t`` is rotated by
    the angle ``t * inv_freq[i]``.

    Args:
        dim: Per-head channel count; must be even.
        max_seq_len: Stored on the module; the angles are computed from the
            actual input length on every call, so it is not enforced.

    Raises:
        ValueError: If ``dim`` is odd.
    """

    def __init__(self, dim, max_seq_len=2048):
        super().__init__()
        if dim % 2 != 0:
            raise ValueError(f"RotaryEmbedding requires an even dim, got {dim}")
        self.dim = dim
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.max_seq_len = max_seq_len

    def forward(self, x):
        """Rotate ``x`` ([B, num_heads, T, head_dim]) and return a tensor of the same shape."""
        T = x.size(-2)
        t = torch.arange(T, device=x.device).type_as(self.inv_freq)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)  # [T, D/2]
        # One angle per channel pair, so cos/sin must be [T, D/2] to line up
        # with the even/odd halves below.
        cos = freqs.cos()[None, None, :, :].to(x.dtype)  # [1, 1, T, D/2]
        sin = freqs.sin()[None, None, :, :].to(x.dtype)
        x1, x2 = x[..., ::2], x[..., 1::2]
        rot_even = x1 * cos - x2 * sin
        rot_odd = x1 * sin + x2 * cos
        # Re-interleave so every channel keeps its original index.
        return torch.stack((rot_even, rot_odd), dim=-1).flatten(-2)

### **Attention Blokta Kullanımı**

* MultiHeadAttention içinde query ve key’lere uygulayabiliriz:

In [24]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional rotary embeddings.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
        use_rope: When true, ``RotaryEmbedding`` is applied to queries and keys.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, use_rope=True):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.use_rope = use_rope

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

        if use_rope:
            self.rope = RotaryEmbedding(self.head_dim)

    def split_heads(self, x):
        """Reshape [B, T, C] into [B, num_heads, T, head_dim]."""
        B, T, _ = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape [B, num_heads, T, head_dim] back into [B, T, C]."""
        B, _, T, _ = x.size()
        return x.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: Tensors of shape [B, T, C].
            mask: Optional mask, 1 for allowed and 0 for masked positions.
                Accepts [T_q, T_k], [B, T_q, T_k], or anything broadcastable
                to [B, num_heads, T_q, T_k].

        Returns:
            Tensor of shape [B, T_q, C].
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))

        if self.use_rope:
            Q = self.rope(Q)
            K = self.rope(K)

        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        if mask is not None:
            if mask.dim() == 3:
                # [B, T_q, T_k] -> [B, 1, T_q, T_k]; without the head axis the
                # batch axis would be broadcast against num_heads.
                mask = mask.unsqueeze(1)
            # Most negative finite value instead of -inf: a fully masked row
            # then softmaxes to a uniform distribution rather than NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, V)
        out = self.combine_heads(out)
        return self.out_proj(out)


## ✅ Özet:

* RotaryEmbedding query ve key’e uygulanır → pozisyon bilgisi attention matrisine doğal olarak eklenir

* Uzun dizilerde standart sin/cos’dan daha iyi genelleme

* Modern LLM’lerde self-attention’ı güçlendiren bir adım

----

# Yukarıda bulunan eklentileri encoder yapımıza entegre edelim:

----

In [25]:
import torch
import torch.nn as nn
import math

# -----------------------------
# Positional Encoding (sin/cos)
# -----------------------------

In [26]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is computed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``[1, max_len, embed_dim]``.

    Args:
        embed_dim: Size of the last (feature) dimension; may be even or odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim)
        )
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one fewer odd column than even columns, so the
        # cosine half must be trimmed or the assignment raises a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, embed_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the encodings of its first ``x.size(1)`` positions added.

        Raises:
            ValueError: If ``x.size(1)`` is larger than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Fail with a clear message instead of an opaque broadcast error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        return x + self.pe[:, :seq_len, :]

# -----------------------------
# Token Embedding
# -----------------------------


In [27]:
class TokenEmbedding(nn.Module):
    """Learned lookup table that turns token ids into dense vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )

    def forward(self, x):
        """Embed integer ids ``x`` and return a [B, T, embed_dim] tensor."""
        vectors = self.embedding(x)
        return vectors

# -----------------------------
# DropPath (Stochastic Depth)
# -----------------------------

In [28]:
class DropPath(nn.Module):
    """Stochastic depth: zero a whole sample's branch output at random during training.

    Surviving samples are rescaled by ``1 / keep_prob``. In eval mode, or when
    ``drop_prob`` is 0, the input is returned unchanged.

    Args:
        drop_prob: Probability of dropping each sample's branch.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Return ``x`` with a per-sample keep/drop mask applied along dimension 0."""
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        if keep_prob <= 0.0:
            # Everything is dropped; dividing by keep_prob would give
            # inf * 0 = NaN, so return zeros directly.
            return torch.zeros_like(x)
        # One random draw per sample, broadcast over all remaining dimensions.
        shape = (x.size(0),) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        return x / keep_prob * random_tensor

# -----------------------------
# Rotary Positional Encoding (RoPE)
# -----------------------------

In [29]:
class RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) for per-head query/key tensors.

    Each adjacent channel pair ``(2i, 2i+1)`` at position ``t`` is rotated by
    the angle ``t * inv_freq[i]``.

    Args:
        dim: Per-head channel count; must be even.
        max_seq_len: Accepted for API compatibility; the angles are computed
            from the actual input length on every call.

    Raises:
        ValueError: If ``dim`` is odd.
    """

    def __init__(self, dim, max_seq_len=2048):
        super().__init__()
        if dim % 2 != 0:
            raise ValueError(f"RotaryEmbedding requires an even dim, got {dim}")
        self.dim = dim
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, x):
        """Rotate ``x`` ([B, num_heads, T, head_dim]) and return a tensor of the same shape."""
        T = x.size(-2)
        t = torch.arange(T, device=x.device).type_as(self.inv_freq)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)  # [T, D/2]
        # One angle per channel pair, so cos/sin must be [T, D/2] to line up
        # with the even/odd halves below.
        cos = freqs.cos()[None, None, :, :].to(x.dtype)  # [1, 1, T, D/2]
        sin = freqs.sin()[None, None, :, :].to(x.dtype)
        x1, x2 = x[..., ::2], x[..., 1::2]
        rot_even = x1 * cos - x2 * sin
        rot_odd = x1 * sin + x2 * cos
        # Re-interleave so every channel keeps its original index.
        return torch.stack((rot_even, rot_odd), dim=-1).flatten(-2)

# -----------------------------
# Multi-Head Attention
# -----------------------------

In [30]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional rotary embeddings.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
        use_rope: When true, ``RotaryEmbedding`` is applied to queries and keys.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, use_rope=True):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.use_rope = use_rope

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

        if use_rope:
            self.rope = RotaryEmbedding(self.head_dim)

    def split_heads(self, x):
        """Reshape [B, T, C] into [B, num_heads, T, head_dim]."""
        B, T, _ = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape [B, num_heads, T, head_dim] back into [B, T, C]."""
        B, _, T, _ = x.size()
        return x.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: Tensors of shape [B, T, C].
            mask: Optional mask, 1 for allowed and 0 for masked positions.
                Accepts [T_q, T_k], [B, T_q, T_k], or anything broadcastable
                to [B, num_heads, T_q, T_k].

        Returns:
            Tensor of shape [B, T_q, C].
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))

        if self.use_rope:
            Q = self.rope(Q)
            K = self.rope(K)

        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        if mask is not None:
            if mask.dim() == 3:
                # [B, T_q, T_k] -> [B, 1, T_q, T_k]; without the head axis the
                # batch axis would be broadcast against num_heads.
                mask = mask.unsqueeze(1)
            # Most negative finite value instead of -inf: a fully masked row
            # then softmaxes to a uniform distribution rather than NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, V)
        out = self.combine_heads(out)
        return self.out_proj(out)

# -----------------------------
# SwiGLU FFN
# -----------------------------

In [31]:
class SwiGLUFFN(nn.Module):
    """Gated feed-forward layer: ``w3(SiLU(w2(x)) * w1(x))`` followed by dropout.

    SwiGLU gates with Swish/SiLU. Gating with GELU would be the GEGLU
    variant instead.

    Args:
        embed_dim: Input and output width.
        expansion: Multiplier giving the hidden width (``embed_dim * expansion``).
        dp: Dropout probability applied to the output.
    """

    def __init__(self, embed_dim, expansion=4, dp=0.1):
        super().__init__()
        hidden_dim = embed_dim * expansion
        self.w1 = nn.Linear(embed_dim, hidden_dim)  # value projection
        self.w2 = nn.Linear(embed_dim, hidden_dim)  # gate projection
        self.w3 = nn.Linear(hidden_dim, embed_dim)  # output projection
        self.dropout = nn.Dropout(dp)
        self.act = nn.SiLU()  # Swish activation on the gate

    def forward(self, x):
        """Return the gated projection of ``x`` with the same trailing width."""
        gate = self.act(self.w2(x))
        return self.dropout(self.w3(gate * self.w1(x)))

# -----------------------------
# Transformer Encoder Block
# -----------------------------

In [32]:
class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder layer: attention + SwiGLU FFN, with LayerScale and DropPath.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        dp: Dropout probability used inside attention and the FFN.
        drop_path: Drop probability for the shared ``DropPath`` module.
        init_values: Initial value of every LayerScale gain.
        use_rope: Forwarded to ``MultiHeadAttention``.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, drop_path=0.1, init_values=1e-4, use_rope=True):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp, use_rope=use_rope)
        self.ffn = SwiGLUFFN(embed_dim, dp=dp)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.drop_path = DropPath(drop_path)
        # LayerScale: one learnable gain per channel for each branch.
        self.gamma_1 = nn.Parameter(init_values * torch.ones(embed_dim))
        self.gamma_2 = nn.Parameter(init_values * torch.ones(embed_dim))

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to self-attention unchanged."""
        # Normalise once and reuse the result for query, key and value instead
        # of running the same LayerNorm three times.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask=mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)
        return x

# -----------------------------
# Full Transformer Encoder
# -----------------------------

In [33]:
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder blocks + final LayerNorm.

    Args:
        vocab_size: Size of the token vocabulary.
        embed_dim: Model width.
        num_layers: Number of ``TransformerEncoderBlock`` layers.
        num_heads: Attention heads per layer.
        dp: Dropout probability passed to every layer.
        drop_path: DropPath probability passed to every layer.
        max_len: Number of positions precomputed by ``PositionalEncoding``.
        use_rope: Forwarded to every layer.
    """

    def __init__(self, vocab_size, embed_dim=512, num_layers=12, num_heads=8, dp=0.1, drop_path=0.1, max_len=5000, use_rope=True):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embed_dim)
        # NOTE(review): the additive sin/cos encoding is applied even when
        # use_rope=True, so position is injected twice - confirm this is intended.
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList(
            TransformerEncoderBlock(embed_dim, num_heads, dp, drop_path, use_rope=use_rope)
            for _ in range(num_layers)
        )
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """Encode ``src_tokens``; ``src_mask`` is handed to every layer unchanged."""
        hidden = self.pos_enc(self.tok_emb(src_tokens))
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)

# TAM KOD :

In [34]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is computed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``[1, max_len, embed_dim]``.

    Args:
        embed_dim: Size of the last (feature) dimension; may be even or odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim)
        )
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one fewer odd column than even columns, so the
        # cosine half must be trimmed or the assignment raises a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, embed_dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the encodings of its first ``x.size(1)`` positions added.

        Raises:
            ValueError: If ``x.size(1)`` is larger than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Fail with a clear message instead of an opaque broadcast error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        return x + self.pe[:, :seq_len, :]


class TokenEmbedding(nn.Module):
    """Learned lookup table that turns token ids into dense vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )

    def forward(self, x):
        """Embed integer ids ``x`` and return a [B, T, embed_dim] tensor."""
        vectors = self.embedding(x)
        return vectors


class DropPath(nn.Module):
    """Stochastic depth: zero a whole sample's branch output at random during training.

    Surviving samples are rescaled by ``1 / keep_prob``. In eval mode, or when
    ``drop_prob`` is 0, the input is returned unchanged.

    Args:
        drop_prob: Probability of dropping each sample's branch.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Return ``x`` with a per-sample keep/drop mask applied along dimension 0."""
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        if keep_prob <= 0.0:
            # Everything is dropped; dividing by keep_prob would give
            # inf * 0 = NaN, so return zeros directly.
            return torch.zeros_like(x)
        # One random draw per sample, broadcast over all remaining dimensions.
        shape = (x.size(0),) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        return x / keep_prob * random_tensor


class RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) for per-head query/key tensors.

    Each adjacent channel pair ``(2i, 2i+1)`` at position ``t`` is rotated by
    the angle ``t * inv_freq[i]``.

    Args:
        dim: Per-head channel count; must be even.
        max_seq_len: Accepted for API compatibility; the angles are computed
            from the actual input length on every call.

    Raises:
        ValueError: If ``dim`` is odd.
    """

    def __init__(self, dim, max_seq_len=2048):
        super().__init__()
        if dim % 2 != 0:
            raise ValueError(f"RotaryEmbedding requires an even dim, got {dim}")
        self.dim = dim
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, x):
        """Rotate ``x`` ([B, num_heads, T, head_dim]) and return a tensor of the same shape."""
        T = x.size(-2)
        t = torch.arange(T, device=x.device).type_as(self.inv_freq)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)  # [T, D/2]
        # One angle per channel pair, so cos/sin must be [T, D/2] to line up
        # with the even/odd halves below.
        cos = freqs.cos()[None, None, :, :].to(x.dtype)  # [1, 1, T, D/2]
        sin = freqs.sin()[None, None, :, :].to(x.dtype)
        x1, x2 = x[..., ::2], x[..., 1::2]
        rot_even = x1 * cos - x2 * sin
        rot_odd = x1 * sin + x2 * cos
        # Re-interleave so every channel keeps its original index.
        return torch.stack((rot_even, rot_odd), dim=-1).flatten(-2)

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional rotary embeddings.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
        use_rope: When true, ``RotaryEmbedding`` is applied to queries and keys.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, use_rope=True):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.use_rope = use_rope

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

        if use_rope:
            self.rope = RotaryEmbedding(self.head_dim)

    def split_heads(self, x):
        """Reshape [B, T, C] into [B, num_heads, T, head_dim]."""
        B, T, _ = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Reshape [B, num_heads, T, head_dim] back into [B, T, C]."""
        B, _, T, _ = x.size()
        return x.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: Tensors of shape [B, T, C].
            mask: Optional mask, 1 for allowed and 0 for masked positions.
                Accepts [T_q, T_k], [B, T_q, T_k], or anything broadcastable
                to [B, num_heads, T_q, T_k].

        Returns:
            Tensor of shape [B, T_q, C].
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))

        if self.use_rope:
            Q = self.rope(Q)
            K = self.rope(K)

        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        if mask is not None:
            if mask.dim() == 3:
                # [B, T_q, T_k] -> [B, 1, T_q, T_k]; without the head axis the
                # batch axis would be broadcast against num_heads.
                mask = mask.unsqueeze(1)
            # Most negative finite value instead of -inf: a fully masked row
            # then softmaxes to a uniform distribution rather than NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, V)
        out = self.combine_heads(out)
        return self.out_proj(out)

class SwiGLUFFN(nn.Module):
    """Gated feed-forward network using the SwiGLU activation.

    Computes ``dropout(w3(SiLU(w2(x)) * w1(x)))``. SwiGLU is defined with the
    Swish/SiLU gate; a GELU gate would be the GEGLU variant instead.

    Args:
        embed_dim: Input and output feature size.
        expansion: Multiplier giving the hidden width ``embed_dim * expansion``.
        dp: Dropout probability applied to the output.
    """

    def __init__(self, embed_dim, expansion=4, dp=0.1):
        super().__init__()
        hidden_dim = embed_dim * expansion
        self.w1 = nn.Linear(embed_dim, hidden_dim)  # value branch
        self.w2 = nn.Linear(embed_dim, hidden_dim)  # gate branch
        self.w3 = nn.Linear(hidden_dim, embed_dim)  # projection back to embed_dim
        self.dropout = nn.Dropout(dp)
        # Swish/SiLU gate, as the class name promises.
        self.act = nn.SiLU()

    def forward(self, x):
        """Apply the gated FFN to ``x`` (last dimension ``embed_dim``)."""
        gate = self.act(self.w2(x))
        return self.dropout(self.w3(gate * self.w1(x)))


class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention then FFN, each on a residual path.

    Each sub-layer output is scaled by a learnable per-channel LayerScale
    vector (``gamma_1`` / ``gamma_2``) and passed through DropPath before
    being added back to the input.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads.
        dp: Dropout probability passed to the attention and FFN sub-layers.
        drop_path: Probability given to ``DropPath``.
        init_values: Initial value of every LayerScale entry.
        use_rope: Forwarded to ``MultiHeadAttention``.
    """

    def __init__(self, embed_dim, num_heads=8, dp=0.1, drop_path=0.1, init_values=1e-4, use_rope=True):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp, use_rope=use_rope)
        self.ffn = SwiGLUFFN(embed_dim, dp=dp)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.drop_path = DropPath(drop_path)
        self.gamma_1 = nn.Parameter(init_values * torch.ones(embed_dim))
        self.gamma_2 = nn.Parameter(init_values * torch.ones(embed_dim))

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to self-attention."""
        # Normalise once and reuse it as query, key and value; evaluating
        # norm1 three times gives the same tensors at three times the cost.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask=mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)
        return x

class TransformerEncoder(nn.Module):
    """Token embedding + sinusoidal positions + a stack of encoder blocks.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Model width.
        num_layers: Number of ``TransformerEncoderBlock`` layers.
        num_heads: Attention heads per block.
        dp: Dropout probability forwarded to every block.
        drop_path: DropPath probability forwarded to every block.
        max_len: Length of the precomputed positional table.
        use_rope: Forwarded to every block's attention layer.
    """

    def __init__(self, vocab_size, embed_dim=512, num_layers=12, num_heads=8, dp=0.1, drop_path=0.1, max_len=5000, use_rope=True):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embed_dim)
        # NOTE(review): the sinusoidal encoding is added even when use_rope is
        # true, so position is injected twice — confirm this is intended.
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        block_args = (embed_dim, num_heads, dp, drop_path)
        self.layers = nn.ModuleList(
            [TransformerEncoderBlock(*block_args, use_rope=use_rope) for _ in range(num_layers)]
        )
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """Encode ``src_tokens``; ``src_mask`` is handed unchanged to each block."""
        hidden = self.pos_enc(self.tok_emb(src_tokens))
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)


---
# Encoder için LLM İyileştirmeleri

## 1️⃣ Embed Dim ve Layer Sayısı
- Mevcut: `embed_dim=512`, `num_layers=6`
- Öneri: `embed_dim=1024`, `num_layers=12`

## 2️⃣ Attention Heads
- Mevcut: 8 head
- Öneri: 16 head  
  _(embed_dim / num_heads = head_dim)_

## 3️⃣ FeedForward
- Expansion: 4 → 4-8
- Aktivasyon: GELU veya SwiGLU

## 4️⃣ Dropout + DropPath + LayerScale
- Amaç: Training stabilitesi ve genelleme performansını artırmak
- Eklenmesi gerekenler: 
  - Dropout
  - DropPath (Stochastic Depth)
  - LayerScale parametreleri

## 5️⃣ Positional Encoding
- Rotary Embedding veya scalable sin/cos
- Şimdilik mevcut sin/cos’u geliştirmek yeterli

## 6️⃣ Norm & Stabilite
- PreNorm + LayerNorm + residual connection
- Her blokta attention ve feedforward için uygulanacak

## 7️⃣ Optimizer ve FP16 Uyumluluğu
- Training kısmı için: 
  - FP16 ile uyumlu
  - AdamW veya Lion optimizer tercih edilebilir


---
## 1️⃣ Token Embedding ve Embed Dim Yükseltme

* Amaç: LLM seviyesinde daha geniş bir temsil gücü sağlamak için embed_dim’i 1024 yapıyoruz.

```python 
import torch
import torch.nn as nn

class TokenEmbedding(nn.Module):
    """Lookup table mapping token ids to dense vectors of size ``embed_dim``."""

    def __init__(self, vocab_size, embed_dim=1024):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, x):
        """Embed integer ids ``x`` of shape [B, T] into [B, T, embed_dim]."""
        token_vectors = self.embedding(x)
        return token_vectors


```

### Notlar:

* embed_dim=1024 → head sayısı ve feedforward expansion ile uyumlu olmalı.

* Bu embedding her token’ı dense vektöre çevirir.

---

## 2️⃣ Positional Encoding (Geliştirilmiş Sin/Cos)

* Amaç: Uzun dizilerde modelin konum bilgisini daha iyi yakalaması. Mevcut sin/cos’u kullanacağız ama embed_dim=1024 ile uyumlu hale getireceğiz.

```python 
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is built once and stored as the
    non-trainable buffer ``pe`` of shape [1, max_len, embed_dim].

    Args:
        embed_dim: Feature size of the embeddings; odd sizes are supported.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, embed_dim=1024, max_len=10000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]

        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embed_dim there is one fewer odd column than even
        # columns, so trim the angles to fit instead of failing on shape.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_len, embed_dim]

    def forward(self, x):
        """Return ``x`` ([B, T, C]) plus the first ``T`` rows of the table."""
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]


```

### Notlar:

* max_len uzun diziler için büyütüldü (örneğin 10k token).

* embed_dim=1024 ile uyumlu.

* İleride rotary embeddings ile değiştirilebilir ama şimdilik scalable sin/cos yeterli.

---

## 3️⃣ Multi-Head Attention (16 Head, LLM)

* Amaç: Daha büyük embedding ve daha fazla baş (head) ile modelin bağlamı yakalama kapasitesini artırmak.

```python 
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5  # 1 / sqrt(head_dim)

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape [B, T, C] into [B, num_heads, T, head_dim]."""
        batch, seq_len, _ = x.size()
        per_head = x.view(batch, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Reshape [B, num_heads, T, head_dim] back into [B, T, C]."""
        batch, _, seq_len, _ = x.size()
        merged_width = self.num_heads * self.head_dim
        return x.transpose(1, 2).contiguous().view(batch, seq_len, merged_width)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value`` (each [B, T, C]).

        ``mask`` is optional and must broadcast against the
        [B, num_heads, T, T] scores; entries equal to 0 are masked out.
        """
        q_heads = self.split_heads(self.q_proj(query))
        k_heads = self.split_heads(self.k_proj(key))
        v_heads = self.split_heads(self.v_proj(value))

        logits = (q_heads @ k_heads.transpose(-2, -1)) * self.scale
        if mask is not None:
            # -inf logits receive zero weight after softmax.
            logits = logits.masked_fill(mask == 0, float('-inf'))

        weights = self.dropout(logits.softmax(dim=-1))
        context = self.combine_heads(weights @ v_heads)
        return self.out_proj(context)


```

### Özellikler:

* embed_dim=1024 → daha geniş embedding

* num_heads=16 → daha fazla dikkat başı

* self.scale = head_dim ** -0.5 ile stabilizasyon

* Dropout uygulanıyor, training sırasında regularizasyon

---

## 4️⃣ FeedForward (LLM)

* Amaç: Daha güçlü ve stabil feedforward katmanı, genişletilmiş boyut ve yeni aktivasyon (GELU veya SwiGLU) ile.

```python 
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForward(nn.Module):
    """Position-wise feed-forward network with LayerScale.

    Two variants are available:

    * ``use_swiglu=False``: Linear -> GELU -> Dropout -> Linear -> Dropout.
    * ``use_swiglu=True``: SwiGLU, i.e. ``fc2(SiLU(gate) * value)`` where
      ``gate`` and ``value`` are the two halves of a single ``fc1`` projection.

    In both cases the output is multiplied by the learnable per-channel
    vector ``gamma`` (initialised to 1e-2).

    Args:
        embed_dim: Input and output feature size.
        expansion: Multiplier giving the hidden width ``embed_dim * expansion``.
        dp: Dropout probability.
        use_swiglu: Select the gated SwiGLU variant.
    """

    def __init__(self, embed_dim=1024, expansion=8, dp=0.1, use_swiglu=False):
        super().__init__()
        inner_dim = embed_dim * expansion
        self.use_swiglu = use_swiglu
        self.dropout = nn.Dropout(dp)
        if use_swiglu:
            # A single projection yields both the gate and the value halves.
            self.fc1 = nn.Linear(embed_dim, inner_dim * 2)
            self.fc2 = nn.Linear(inner_dim, embed_dim)
        else:
            self.net = nn.Sequential(
                nn.Linear(embed_dim, inner_dim),
                nn.GELU(),
                nn.Dropout(dp),
                nn.Linear(inner_dim, embed_dim),
                nn.Dropout(dp)
            )
        # LayerScale parameter.
        # NOTE(review): TransformerEncoderBlockLLM also scales the FFN output
        # by its own gamma_2, so the branch is scaled twice — confirm intent.
        self.gamma = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x):
        """Apply the selected FFN variant to ``x`` (last dim ``embed_dim``)."""
        if self.use_swiglu:
            gate, value = self.fc1(x).chunk(2, dim=-1)
            # SwiGLU gates with Swish/SiLU; a GELU gate would be GEGLU.
            x = F.silu(gate) * value
            x = self.fc2(x)
            x = self.gamma * x
            return self.dropout(x)
        else:
            return self.gamma * self.net(x)


```

### Özellikler:

* expansion=8 → daha büyük hidden dimension

* use_swiglu=True → SwiGLU aktivasyonu (GELU gating)

* Dropout ile regularizasyon

* LayerScale ile training stabilitesi

---

## 5️⃣ Transformer Encoder Block (LLM)

Amaç:

* PreNorm + Residual Connection

* MultiHeadAttention (16 head, embed_dim=1024)

* Gelişmiş FeedForward (expansion=8, SwiGLU opsiyonel)

* Dropout + DropPath + LayerScale

```python 
class DropPath(nn.Module):
    """Stochastic Depth / DropPath.

    During training each sample in the batch (dim 0) is zeroed with
    probability ``drop_prob``; surviving samples are scaled by
    ``1 / (1 - drop_prob)``. In eval mode, or when ``drop_prob`` is 0, the
    input is returned unchanged.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply per-sample stochastic depth to ``x``."""
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        # One Bernoulli draw per sample, broadcast over all remaining dims.
        shape = (x.size(0),) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        if keep_prob > 0.0:
            return x.div(keep_prob) * random_tensor
        # drop_prob == 1: every sample is dropped. Skip the rescale, which
        # would compute x / 0 * 0 and return NaN instead of zeros.
        return x * random_tensor


class TransformerEncoderBlockLLM(nn.Module):
    """Pre-norm encoder block with LayerScale and DropPath on both residuals.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads.
        dp: Dropout probability for the attention and feed-forward layers.
        drop_path: Probability given to ``DropPath``.
        expansion: Hidden-width multiplier of the feed-forward layer.
        use_swiglu: Forwarded to ``FeedForward``.
    """

    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, use_swiglu=False):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads=num_heads, dp=dp)
        self.ffn = FeedForward(embed_dim, expansion=expansion, dp=dp, use_swiglu=use_swiglu)

        self.drop_path = DropPath(drop_path)
        # LayerScale parameters, one per residual branch.
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to self-attention."""
        # PreNorm + Self-Attention + Residual + DropPath.
        # Normalise once and reuse it as query, key and value; evaluating
        # norm1 three times gives the same tensors at three times the cost.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask=mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)

        # FeedForward + Residual + DropPath.
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)

        return x


```

## Özellikler:

* PreNorm: LayerNorm önce uygulanır → training stabilitesi

* DropPath: Stochastic depth ile bloklar rastgele bypass edilebilir

* LayerScale (gamma_1, gamma_2) → derin modellerde residual scaling

* FeedForward: expansion=8 ve opsiyonel SwiGLU

* MultiHeadAttention: embed_dim / num_heads = head_dim, örneğin 1024/16 = 64

---

## 6️⃣ Full Transformer Encoder (LLM)

Özellikler:

* TokenEmbedding + PositionalEncoding

* 12 adet LLM Encoder Block (TransformerEncoderBlockLLM)

* LayerNorm sonunda

* DropPath ve LayerScale aktif

```python 
class TransformerEncoderLLM(nn.Module):
    """Token embedding + sinusoidal positions + a stack of LLM encoder blocks.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Model width.
        num_layers: Number of ``TransformerEncoderBlockLLM`` layers.
        num_heads: Attention heads per block.
        dp: Dropout probability forwarded to every block.
        drop_path: DropPath probability forwarded to every block.
        expansion: Feed-forward hidden-width multiplier.
        max_len: Length of the precomputed positional table.
        use_swiglu: Forwarded to every block's feed-forward layer.
    """

    def __init__(
        self,
        vocab_size,
        embed_dim=1024,
        num_layers=12,
        num_heads=16,
        dp=0.1,
        drop_path=0.1,
        expansion=8,
        max_len=5000,
        use_swiglu=False
    ):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        block_kwargs = dict(
            embed_dim=embed_dim,
            num_heads=num_heads,
            dp=dp,
            drop_path=drop_path,
            expansion=expansion,
            use_swiglu=use_swiglu,
        )
        self.layers = nn.ModuleList(
            [TransformerEncoderBlockLLM(**block_kwargs) for _ in range(num_layers)]
        )
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """Encode a batch of token ids.

        Args:
            src_tokens: Integer ids of shape [B, T].
            src_mask: Optional mask handed unchanged to every block
                (1 -> allowed, 0 -> masked). It has to broadcast against the
                [B, num_heads, T, T] attention scores, e.g. [T, T] or
                [B, 1, T, T].
                NOTE(review): a plain [B, T, T] mask lacks the head axis and
                only broadcasts when B is 1 or equals num_heads.

        Returns:
            Encoder outputs of shape [B, T, C].
        """
        hidden = self.pos_enc(self.tok_emb(src_tokens))  # [B, T, C]
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)


```

## Özellikler:

* 12 layer + 16 head + embed_dim=1024 → LLM ölçeğinde

* PreNorm + DropPath + LayerScale + Residual

* FeedForward genişletilmiş (expansion=8, opsiyonel SwiGLU)

* PositionalEncoding: sin/cos, scalable

---

# 🔹 Full LLM Transformer Encoder

Özellikler:

* Embed dim 1024, 12 katman (layers), 16 head

* PreNorm + LayerNorm + Residual + DropPath + LayerScale

* FeedForward genişletilmiş (expansion=8, opsiyonel SwiGLU)

* Positional Encoding: sin/cos

In [None]:
import torch
import torch.nn as nn
import math

# -----------------------------
# DropPath (Stochastic Depth)
# -----------------------------
class DropPath(nn.Module):
    """Stochastic depth: randomly drop whole samples on a residual branch.

    During training each sample in the batch (dim 0) is zeroed with
    probability ``drop_prob``; surviving samples are scaled by
    ``1 / (1 - drop_prob)``. In eval mode, or when ``drop_prob`` is 0, the
    input is returned unchanged.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply per-sample stochastic depth to ``x``."""
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        # One Bernoulli draw per sample, broadcast over all remaining dims.
        shape = (x.size(0),) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        if keep_prob > 0.0:
            return x.div(keep_prob) * random_tensor
        # drop_prob == 1: every sample is dropped. Skip the rescale, which
        # would compute x / 0 * 0 and return NaN instead of zeros.
        return x * random_tensor

# -----------------------------
# Token Embedding
# -----------------------------
class TokenEmbedding(nn.Module):
    """Lookup table mapping token ids to dense vectors of size ``embed_dim``."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, x):
        """Return the embedding vectors for the integer ids in ``x``."""
        token_vectors = self.embedding(x)
        return token_vectors

# -----------------------------
# Positional Encoding
# -----------------------------
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is built once and stored as the
    non-trainable buffer ``pe`` of shape [1, max_len, embed_dim].

    Args:
        embed_dim: Feature size of the embeddings; odd sizes are supported.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]

        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embed_dim there is one fewer odd column than even
        # columns, so trim the angles to fit instead of failing on shape.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the first ``x.size(1)`` rows of the table."""
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

# -----------------------------
# Multi-Head Attention
# -----------------------------
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dp: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads=16, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5  # 1 / sqrt(head_dim)

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape [B, T, C] into [B, num_heads, T, head_dim]."""
        batch, seq_len, _ = x.size()
        per_head = x.view(batch, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Reshape [B, num_heads, T, head_dim] back into [B, T, C]."""
        batch, _, seq_len, _ = x.size()
        merged_width = self.num_heads * self.head_dim
        return x.transpose(1, 2).contiguous().view(batch, seq_len, merged_width)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value`` (each [B, T, C]).

        ``mask`` is optional and must broadcast against the
        [B, num_heads, T, T] scores; entries equal to 0 are masked out.
        """
        q_heads = self.split_heads(self.q_proj(query))
        k_heads = self.split_heads(self.k_proj(key))
        v_heads = self.split_heads(self.v_proj(value))

        logits = (q_heads @ k_heads.transpose(-2, -1)) * self.scale
        if mask is not None:
            # -inf logits receive zero weight after softmax.
            logits = logits.masked_fill(mask == 0, float('-inf'))

        weights = self.dropout(logits.softmax(dim=-1))
        context = self.combine_heads(weights @ v_heads)
        return self.out_proj(context)

# -----------------------------
# FeedForward
# -----------------------------
class FeedForward(nn.Module):
    """Position-wise feed-forward network with an optional SwiGLU gate.

    * ``use_swiglu=False``: Linear -> GELU -> Dropout -> Linear -> Dropout.
    * ``use_swiglu=True``: the first Linear produces ``2 * hidden`` features
      that are split into a gate half and a value half; the output is
      ``Linear(Dropout(SiLU(gate) * value))`` followed by Dropout.

    The layers live in ``self.net`` in both modes, so parameter names are the
    same regardless of the variant.

    Args:
        embed_dim: Input and output feature size.
        expansion: Multiplier giving the hidden width ``embed_dim * expansion``.
        dp: Dropout probability.
        use_swiglu: Select the gated SwiGLU variant.
    """

    def __init__(self, embed_dim, expansion=8, dp=0.1, use_swiglu=False):
        super().__init__()
        self.use_swiglu = use_swiglu
        if use_swiglu:
            # SwiGLU activation: layers are stored here but applied manually
            # in forward(), because the gate/value split cannot be expressed
            # as a plain sequential chain.
            self.net = nn.Sequential(
                nn.Linear(embed_dim, embed_dim * expansion * 2),
                nn.SiLU(),
                nn.Dropout(dp),
                nn.Linear(embed_dim * expansion, embed_dim),
                nn.Dropout(dp)
            )
        else:
            self.net = nn.Sequential(
                nn.Linear(embed_dim, embed_dim * expansion),
                nn.GELU(),
                nn.Dropout(dp),
                nn.Linear(embed_dim * expansion, embed_dim),
                nn.Dropout(dp)
            )

    def forward(self, x):
        """Apply the selected FFN variant to ``x`` (last dim ``embed_dim``)."""
        if not self.use_swiglu:
            return self.net(x)
        fc_in, gate_act, hidden_drop, fc_out, out_drop = self.net
        # Halve the doubled projection: running the layers sequentially would
        # feed 2 * hidden features into a Linear expecting hidden features.
        gate, value = fc_in(x).chunk(2, dim=-1)
        hidden = hidden_drop(gate_act(gate) * value)
        return out_drop(fc_out(hidden))

# -----------------------------
# Transformer Encoder Block (LLM)
# -----------------------------
class TransformerEncoderBlockLLM(nn.Module):
    """Pre-norm encoder block with LayerScale and DropPath on both residuals.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads.
        dp: Dropout probability for the attention and feed-forward layers.
        drop_path: Probability given to ``DropPath``.
        expansion: Hidden-width multiplier of the feed-forward layer.
        use_swiglu: Forwarded to ``FeedForward``.
    """

    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, use_swiglu=False):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, expansion, dp, use_swiglu)

        self.drop_path = DropPath(drop_path)
        # LayerScale parameters, one per residual branch.
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to self-attention."""
        # Self-Attention.
        # Normalise once and reuse it as query, key and value; evaluating
        # norm1 three times gives the same tensors at three times the cost.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)
        # FeedForward.
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)
        return x

# -----------------------------
# Full Transformer Encoder (LLM)
# -----------------------------
class TransformerEncoderLLM(nn.Module):
    """Token embedding + sinusoidal positions + a stack of LLM encoder blocks.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Model width.
        num_layers: Number of ``TransformerEncoderBlockLLM`` layers.
        num_heads: Attention heads per block.
        dp: Dropout probability forwarded to every block.
        drop_path: DropPath probability forwarded to every block.
        expansion: Feed-forward hidden-width multiplier.
        max_len: Length of the precomputed positional table.
        use_swiglu: Forwarded to every block's feed-forward layer.
    """

    def __init__(self, vocab_size, embed_dim=1024, num_layers=12, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, max_len=5000, use_swiglu=False):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        block_args = (embed_dim, num_heads, dp, drop_path, expansion, use_swiglu)
        self.layers = nn.ModuleList(
            [TransformerEncoderBlockLLM(*block_args) for _ in range(num_layers)]
        )
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """Encode ``src_tokens``; ``src_mask`` is handed unchanged to each block."""
        hidden = self.pos_enc(self.tok_emb(src_tokens))
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)

# Transformer Encoder Block (LLM) Akış Diyagramı (Markdown)
```bash

Input Tokens (`src_tokens`)
│
├─> **Token Embedding**
│    - nn.Embedding(vocab_size, embed_dim)
│
├─> **Positional Encoding**
│    - sin/cos pozisyon vektörleri eklenir
│
└─> **Encoder Layers** (N adet, ör: 12)
     │
     ├─ **LayerNorm 1**
     │    │
     │    └─> **Self-Attention**
     │          - MultiHeadAttention(embed_dim, num_heads)
     │          - Mask optional (src_mask)
     │          - Dropout
     │    │
     │    └─> **DropPath + γ₁ scaling**
     │    │
     │    └─> **Residual Add** (x + scaled attn)
     │
     ├─ **LayerNorm 2**
     │    │
     │    └─> **FeedForward Network (FFN)**
     │          - Linear → GELU/SwiGLU → Linear
     │          - Dropout
     │    │
     │    └─> **DropPath + γ₂ scaling**
     │    │
     │    └─> **Residual Add**
     │
     └─> **Output of Layer** (`x` updated)
│
├─> **Final LayerNorm**
│
└─> **Encoder Output**
     - `x` (embedding boyutu: batch_size × seq_len × embed_dim)


```

## Açıklama:

* Token Embedding + Positional Encoding: Token ID’lerini vektöre çevirir ve pozisyon bilgisi ekler.

* Self-Attention: Sadece encoder’daki tokenler arasında dikkat (mask opsiyonel).

* FeedForward (FFN): Lineer + GELU/SwiGLU + Dropout.

* Residual + DropPath: Stochastic depth ve learnable scaling.

* Final LayerNorm: Katman çıkışını normalize eder.

* Encoder Output: Decoder’a girdi olarak veya başka downstream görevlerde kullanılabilir.