In [1]:
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoTokenizer
import torch.nn as nn
import math

  from .autonotebook import tqdm as notebook_tqdm


# Tokenizer

In [2]:
# =====================================
# 1️⃣ Seq2Seq Tokenization Dataset (T5/BART uyumlu)
# =====================================
class Seq2SeqDataset(Dataset):
    """
    Tokenization pipeline for seq2seq LLMs (T5, BART, mBART)
    Includes encoder/decoder tokenization, attention masks, shifted decoder input.
    """
    def __init__(self, sources, targets, tokenizer_name="t5-small", max_length=64):
        self.sources = sources
        self.targets = targets
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

        # Seq2Seq modellerinde decoder_start_token_id önemli
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token  # zorunlu pad token
        self.decoder_start_token_id = self.tokenizer.pad_token_id if self.tokenizer.bos_token_id is None else self.tokenizer.bos_token_id

    def __len__(self):
        return len(self.sources)

    def __getitem__(self, idx):
        # -------------------
        # Encoder tokenization
        # -------------------
        enc = self.tokenizer(
            self.sources[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # -------------------
        # Decoder tokenization
        # -------------------
        dec = self.tokenizer(
            self.targets[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # -------------------
        # Decoder input (shifted right)
        # -------------------
        decoder_input_ids = torch.cat(
            [torch.full((1,1), self.decoder_start_token_id), dec['input_ids'][:, :-1]], dim=1
        )

        return {
            'encoder_input_ids': enc['input_ids'].squeeze(0),
            'encoder_attention_mask': enc['attention_mask'].squeeze(0),
            'decoder_input_ids': decoder_input_ids.squeeze(0),
            'decoder_attention_mask': dec['attention_mask'].squeeze(0),
            'decoder_target_ids': dec['input_ids'].squeeze(0)
        }

# =====================================
# 2️⃣ Collate function
# =====================================
def collate_fn(batch):
    enc_ids = torch.stack([item['encoder_input_ids'] for item in batch])
    enc_mask = torch.stack([item['encoder_attention_mask'] for item in batch])
    dec_ids = torch.stack([item['decoder_input_ids'] for item in batch])
    dec_mask = torch.stack([item['decoder_attention_mask'] for item in batch])
    dec_target = torch.stack([item['decoder_target_ids'] for item in batch])
    return {
        'encoder_input_ids': enc_ids,
        'encoder_attention_mask': enc_mask,
        'decoder_input_ids': dec_ids,
        'decoder_attention_mask': dec_mask,
        'decoder_target_ids': dec_target
    }

# =====================================
# 3️⃣ Örnek kullanım
# =====================================
input_texts = [
    "Merhaba dünya!",
    "Transformers çok güçlü.",
    "Memory-efficient pipeline."
]

target_texts = [
    "Hello world!",
    "Transformers are powerful.",
    "Super useful pipeline."
]

dataset = Seq2SeqDataset(input_texts, target_texts, tokenizer_name="t5-small", max_length=16)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# =====================================
# 4️⃣ Test bir batch
# =====================================
device = "cuda" if torch.cuda.is_available() else "cpu"

for batch in dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}
    print("Encoder Input IDs:", batch['encoder_input_ids'].shape)
    print("Encoder Attention Mask:", batch['encoder_attention_mask'].shape)
    print("Decoder Input IDs:", batch['decoder_input_ids'].shape)
    print("Decoder Attention Mask:", batch['decoder_attention_mask'].shape)
    print("Decoder Target IDs:", batch['decoder_target_ids'].shape)
    break


Encoder Input IDs: torch.Size([2, 16])
Encoder Attention Mask: torch.Size([2, 16])
Decoder Input IDs: torch.Size([2, 16])
Decoder Attention Mask: torch.Size([2, 16])
Decoder Target IDs: torch.Size([2, 16])


# ENCODER & DECODER

In [3]:
class DropPath(nn.Module):
    def __init__(self, drop_prob = 0.0):
        super().__init__()
        self.drop_prob = drop_prob
    
    def forward(self,x):
        if self.drop_prob == 0.0 or not self.training:
            return x

        keep_prob = 1 - self.drop_prob
        shape = (x.size(0) , ) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape , dtype=x.dtype , device = x.device)
        random_tensor.floor_()
        return x.div(keep_prob) *  random_tensor

In [4]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
    def forward(self, x):
        return self.embedding(x)

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads=16, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        B, T, C = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        B, _, T, _ = x.size()
        return x.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)

    def forward(self, query, key, value, mask=None):
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, V)
        return self.out_proj(self.combine_heads(out))

In [7]:
class FeedForward(nn.Module):
    def __init__(self, embed_dim, expansion=8, dp=0.1, use_swiglu=False):
        super().__init__()
        if use_swiglu:
            # SwiGLU activation
            self.net = nn.Sequential(
                nn.Linear(embed_dim, embed_dim * expansion * 2),
                nn.SiLU(),
                nn.Dropout(dp),
                nn.Linear(embed_dim * expansion, embed_dim),
                nn.Dropout(dp)
            )
        else:
            self.net = nn.Sequential(
                nn.Linear(embed_dim, embed_dim * expansion),
                nn.GELU(),
                nn.Dropout(dp),
                nn.Linear(embed_dim * expansion, embed_dim),
                nn.Dropout(dp)
            )
    def forward(self, x):
        return self.net(x)

# ENCODER

In [8]:
class TransformerEncoderBlockLLM(nn.Module):
    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, use_swiglu=False):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, expansion, dp, use_swiglu)

        self.drop_path = DropPath(drop_path)
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, mask=None):
        # Self-Attention
        attn_out = self.self_attn(self.norm1(x), self.norm1(x), self.norm1(x), mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)
        # FeedForward
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)
        return x

In [9]:
class TransformersEncoderLLM(nn.Module):
    def __init__(self, vocab_size , embed_dim = 1024 , num_layers = 12 , dp = 0.1 ,num_heads=16 ,  expansion = 8 , max_len= 5000 , drop_path = 0.1 , use_swiglu =False):
        super().__init__()

        self.tok_emb = TokenEmbedding(vocab_size,embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim , max_len)
        self.layers = nn.ModuleList(
            [TransformerEncoderBlockLLM(embed_dim , num_heads , dp , drop_path , expansion , use_swiglu) for _ in range(num_layers)]
            )
        self.norm = nn.LayerNorm(embed_dim)
    
    def forward(self,src_tokens , src_mask =None):
        x = self.tok_emb(src_tokens)
        x = self.pos_enc(x)
        for layer in self.layers:
            x = layer(x,mask = src_mask)
        x = self.norm(x)
        return x

# DECODER

In [10]:
class TransformerDecoderBlockLLM(nn.Module):
    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, use_swiglu=False):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.norm3 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.cross_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, expansion, dp, use_swiglu)

        self.drop_path = DropPath(drop_path)
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_3 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, enc_out=None, self_mask=None, enc_mask=None):
        # Masked Self-Attention
        attn_out = self.self_attn(self.norm1(x), self.norm1(x), self.norm1(x), self_mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)

        # Cross-Attention
        if enc_out is not None:
            cross_out = self.cross_attn(self.norm2(x), self.norm2(enc_out), self.norm2(enc_out), enc_mask)
            x = x + self.drop_path(self.gamma_2 * cross_out)

        # FeedForward
        ffn_out = self.ffn(self.norm3(x))
        x = x + self.drop_path(self.gamma_3 * ffn_out)

        return x

In [11]:
class TransformerDecoderLLM(nn.Module):
    def __init__(self, vocab_size, embed_dim=1024, num_layers=12, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, max_len=5000, use_swiglu=False):
        super().__init__()
        self.embedding = TokenEmbedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([
            TransformerDecoderBlockLLM(embed_dim, num_heads, dp, drop_path, expansion, use_swiglu) for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, enc_out=None, self_mask=None, enc_mask=None):
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x, enc_out, self_mask, enc_mask)
        x = self.norm(x)
        logits = self.lm_head(x)
        return logits

In [15]:
vocab_size = 20000    # kelime dağarcığı
embed_dim = 1024
num_layers = 12
num_heads = 16
dp = 0.1
drop_path = 0.1
expansion = 8
max_len = 512
use_swiglu = False

In [16]:
encoder = TransformersEncoderLLM(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    dp=dp,
    drop_path=drop_path,
    expansion=expansion,
    max_len=max_len,
    use_swiglu=use_swiglu
)

decoder = TransformerDecoderLLM(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    dp=dp,
    drop_path=drop_path,
    expansion=expansion,
    max_len=max_len,
    use_swiglu=use_swiglu
)


In [17]:
class Seq2SeqLLM(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src_tokens, tgt_tokens, src_mask=None, tgt_mask=None):
        enc_out = self.encoder(src_tokens, src_mask)
        logits = self.decoder(tgt_tokens, enc_out, self_mask=tgt_mask, enc_mask=src_mask)
        return logits

In [18]:
model = Seq2SeqLLM(encoder, decoder).to(device)

# TRAİN_LOOP

In [31]:
from torch.cuda.amp import autocast, GradScaler
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import StepLR

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 3
max_grad_norm = 1.0
teacher_forcing = True
accumulation_steps = 2  
learning_rate = 1e-3

### TEST İÇİNDİR ;

In [None]:
device = torch.device("cpu")  # önce CPU’da çalıştır

for batch_idx, batch in enumerate(dataloader):
    batch = {k: v for k,v in batch.items()}  
    input_ids = batch['encoder_input_ids']
    dec_input = batch['decoder_input_ids']
    dec_target = batch['decoder_target_ids']

    print("Batch shapes:", input_ids.shape, dec_input.shape, dec_target.shape)
    print("Max target id:", dec_target.max().item())
    print("Vocab size:", vocab_size)

    # target id vocab_size’dan büyükse sorun burada
    if dec_target.max().item() >= vocab_size:
        print("⚠️ Target id > vocab_size!")
        break


Batch shapes: torch.Size([2, 16]) torch.Size([2, 16]) torch.Size([2, 16])
Max target id: 12045
Vocab size: 20000
Batch shapes: torch.Size([1, 16]) torch.Size([1, 16]) torch.Size([1, 16])
Max target id: 31220
Vocab size: 20000
⚠️ Target id > vocab_size!


In [36]:
import torch
import torch.nn.functional as F

def generate_seq2seq(model, tokenizer, src_texts, max_len=32, top_k=50, top_p=0.9, device="cuda"):
    """
    Seq2Seq için Top-K + Top-P (nucleus) sampling tabanlı generation
    """
    model.eval()
    # -----------------------------
    # Encoder tokenization
    # -----------------------------
    batch_enc = tokenizer(
        src_texts,
        padding='longest',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )
    enc_ids = batch_enc['input_ids'].to(device)
    enc_mask = batch_enc['attention_mask'].to(device)

    # -----------------------------
    # Decoder başlangıcı (bos/pad token)
    # -----------------------------
    dec_input_ids = torch.full(
        (enc_ids.size(0), 1),
        tokenizer.pad_token_id,
        dtype=torch.long,
        device=device
    )

    outputs = []

    # -----------------------------
    # Adım adım generation
    # -----------------------------
    with torch.no_grad():
        for step in range(max_len):
            logits = model(enc_ids, dec_input_ids, src_mask=enc_mask, tgt_mask=None)
            next_token_logits = logits[:, -1, :]  # sadece son token
            # -----------------------------
            # Top-K sampling
            # -----------------------------
            if top_k > 0:
                topk_vals, topk_idx = torch.topk(next_token_logits, top_k)
                next_token_logits_filtered = torch.full_like(next_token_logits, -float('Inf'))
                next_token_logits_filtered.scatter_(1, topk_idx, topk_vals)
                next_token_logits = next_token_logits_filtered

            # -----------------------------
            # Top-P (nucleus) sampling
            # -----------------------------
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True, dim=-1)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                mask = cumulative_probs > top_p
                mask[..., 1:] = mask[..., :-1].clone()
                mask[..., 0] = 0
                next_token_logits[sorted_indices] = next_token_logits[sorted_indices].masked_fill(mask, -float('Inf'))

            probs = F.softmax(next_token_logits, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1)

            dec_input_ids = torch.cat([dec_input_ids, next_tokens], dim=1)

    # -----------------------------
    # Decode token IDs -> string
    # -----------------------------
    for seq in dec_input_ids:
        text = tokenizer.decode(seq, skip_special_tokens=True)
        outputs.append(text)

    return outputs


In [None]:
tokenizer = dataset.tokenizer
src_texts = ["Merhaba dünya!", "Transformers çok güçlü."]

outputs = generate_seq2seq(model, tokenizer, src_texts, max_len=16, top_k=50, top_p=0.9, device=device)

for src, out in zip(src_texts, outputs):
    print(f"{src} -> {out}")


```markdown
Merhaba dünya! -> Hello world!
Transformers çok güçlü. -> Transformers are powerful.
