In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import math
import os, json, math, time, random
from dataclasses import dataclass
from typing import List, Dict
from torch.utils.data import Dataset, DataLoader


class DropPath(nn.Module):
    """Stochastic depth: randomly zero a residual branch for whole samples.

    While training, each sample along dim 0 is kept with probability
    ``1 - drop_prob`` and the surviving samples are rescaled by
    ``1 / (1 - drop_prob)``. In eval mode, or when ``drop_prob`` is 0, the
    input is returned unchanged.
    """

    def __init__(self, drop_prob = 0.2):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        active = self.training and self.drop_prob != 0.0
        if not active:
            return x

        keep_prob = 1 - self.drop_prob
        # One draw per sample; the trailing singleton dims broadcast over x.
        per_sample = (x.size(0),) + (1,) * (x.ndim - 1)
        keep = torch.rand(per_sample, dtype=x.dtype, device=x.device)
        # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0.
        keep.add_(keep_prob).floor_()
        return x.div(keep_prob) * keep
    

class TokenEmbed(nn.Module):
    """Thin wrapper around ``nn.Embedding`` mapping token ids to vectors."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Kept under the attribute name ``embedding`` (state_dict key prefix).
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, x):
        """Look up the embedding vector of every id in ``x``."""
        vectors = self.embedding(x)
        return vectors
    

class PositionelEncod(nn.Module):
    """Fixed sinusoidal positional encoding added to a ``[B, T, D]`` input.

    The table is precomputed for ``max_len`` positions and stored in the
    buffer ``pe`` with shape ``[1, max_len, embed_dim]``. Even channels hold
    sines and odd channels hold cosines.

    Args:
        embed_dim: size of the last dimension of the inputs (may be odd).
        max_len: number of positions in the precomputed table; inputs longer
            than this cannot be encoded.
    """

    def __init__(self, embed_dim , max_len = 5000):
        super().__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim)
        )
        angles = position * div_term  # [max_len, ceil(embed_dim / 2)]

        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd embed_dim there is one fewer odd-index channel than
        # frequencies, so drop the last frequency for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]
    

class MultiHeadAttention(nn.Module):
    """Scaled dot-product multi-head attention with separate Q/K/V projections.

    Args:
        embed_dim: model width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        dp: dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads=16, dp=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dp)

    def split_heads(self, x):
        """Reshape ``[B, T, C]`` into ``[B, num_heads, T, head_dim]``."""
        B, T, C = x.size()
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: ``[B, H, T, D]`` back to ``[B, T, H * D]``."""
        B, H, T, D = x.size()
        return x.transpose(1, 2).contiguous().view(B, T, H * D)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: ``[B, T, embed_dim]`` tensors (``key`` and
                ``value`` share their length, which may differ from the
                query length).
            mask: optional keep-mask; positions where ``mask == 0`` are not
                attended to. A 2-D mask is treated as ``[B, T_k]``, a 3-D mask
                as ``[B, T_q, T_k]``; any other rank is used as given and must
                broadcast against ``[B, H, T_q, T_k]``.

        Returns:
            ``[B, T_q, embed_dim]`` tensor after the output projection.
        """
        Q = self.split_heads(self.q_proj(query))
        K = self.split_heads(self.k_proj(key))
        V = self.split_heads(self.v_proj(value))

        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale  # [B, H, T_q, T_k]

        if mask is not None:
            if mask.dim() == 2:
                mask = mask[:, None, None, :]  # [B, 1, 1, T_k]
            elif mask.dim() == 3:
                mask = mask[:, None, :, :]  # [B, 1, T_q, T_k]
            # Use the dtype's most negative finite value rather than -inf:
            # a row whose keys are all masked would otherwise softmax to NaN
            # and poison both the output and the gradients. Rows with at least
            # one visible key get the same weights as with -inf.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, V)
        return self.out_proj(self.combine_heads(out))

class _SwiGLUGate(nn.Module):
    """Split the last dimension in two halves and return ``silu(a) * b``."""

    def forward(self, x):
        gate, value = x.chunk(2, dim=-1)
        return F.silu(gate) * value


class FeedForward(nn.Module):
    """Position-wise feed-forward network with a GELU or SwiGLU hidden layer.

    Args:
        embed_dim: input and output width.
        expansion: hidden width multiplier (hidden = ``embed_dim * expansion``).
        dp: dropout probability after the activation and after the output
            projection.
        use_swiglu: when true, the first projection produces ``2 * hidden``
            features that are combined as ``silu(a) * b`` into ``hidden``
            features; otherwise a plain GELU MLP is used.
    """

    def __init__(self, embed_dim, expansion=8, dp=0.1, use_swiglu=False):
        super().__init__()
        hidden = embed_dim * expansion
        if use_swiglu:
            # The up-projection is twice as wide because the gate halves it
            # again; a plain SiLU here would leave 2 * hidden features and the
            # down-projection below would fail with a shape mismatch.
            self.net = nn.Sequential(
                nn.Linear(embed_dim, hidden * 2),
                _SwiGLUGate(),
                nn.Dropout(dp),
                nn.Linear(hidden, embed_dim),
                nn.Dropout(dp)
            )
        else:
            self.net = nn.Sequential(
                nn.Linear(embed_dim, hidden),
                nn.GELU(),
                nn.Dropout(dp),
                nn.Linear(hidden, embed_dim),
                nn.Dropout(dp)
            )

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        return self.net(x)
    
# ENCODER
class TransformerEncoderBlockLLM(nn.Module):
    """Pre-norm encoder block: self-attention then feed-forward, both residual.

    Each residual branch is multiplied by a learnable per-channel scale
    (``gamma_1`` / ``gamma_2``, initialised to 1e-2) and passed through
    ``DropPath`` before being added back to the input.

    Args:
        embed_dim: model width.
        num_heads: attention heads.
        dp: dropout probability inside attention and the feed-forward net.
        drop_path: drop probability of the ``DropPath`` on each branch.
        expansion: hidden width multiplier of the feed-forward net.
        use_swiglu: forwarded to ``FeedForward``.
    """

    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, use_swiglu=False):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, expansion, dp, use_swiglu)

        self.drop_path = DropPath(drop_path)
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is handed to the self-attention."""
        # Self-attention: normalise once and reuse the result for Q, K and V
        # instead of running the same LayerNorm three times.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)
        # Feed-forward
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.drop_path(self.gamma_2 * ffn_out)
        return x
    
class TransformersEncoderLLM(nn.Module):
    """Encoder stack: token embedding, positional encoding, N blocks, final norm."""

    def __init__(self, vocab_size, embed_dim=1024, num_layers=12, dp=0.1, num_heads=16,
                 expansion=8, max_len=5000, drop_path=0.1, use_swiglu=False):
        super().__init__()

        self.tok_emb = TokenEmbed(vocab_size, embed_dim)
        self.pos_enc = PositionelEncod(embed_dim, max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                TransformerEncoderBlockLLM(
                    embed_dim=embed_dim,
                    num_heads=num_heads,
                    dp=dp,
                    drop_path=drop_path,
                    expansion=expansion,
                    use_swiglu=use_swiglu,
                )
            )
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, src_tokens, src_mask=None):
        """Encode ``src_tokens``; ``src_mask`` is passed to every block."""
        hidden = self.pos_enc(self.tok_emb(src_tokens))
        for block in self.layers:
            hidden = block(hidden, mask=src_mask)
        return self.norm(hidden)
    
# DECODER
class TransformerDecoderBlockLLM(nn.Module):
    """Pre-norm decoder block: causal self-attention, cross-attention, feed-forward.

    Every residual branch is multiplied by a learnable per-channel scale
    (``gamma_1`` .. ``gamma_3``, initialised to 1e-2) and passed through
    ``DropPath`` before being added back.

    Args:
        embed_dim: model width.
        num_heads: attention heads (self- and cross-attention).
        dp: dropout probability inside attention and the feed-forward net.
        drop_path: drop probability of the ``DropPath`` on each branch.
        expansion: hidden width multiplier of the feed-forward net.
        use_swiglu: forwarded to ``FeedForward``.
    """

    def __init__(self, embed_dim=1024, num_heads=16, dp=0.1, drop_path=0.1, expansion=8, use_swiglu=False):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.norm3 = nn.LayerNorm(embed_dim)

        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.cross_attn = MultiHeadAttention(embed_dim, num_heads, dp)
        self.ffn = FeedForward(embed_dim, expansion, dp, use_swiglu)

        self.drop_path = DropPath(drop_path)
        self.gamma_1 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_2 = nn.Parameter(torch.ones(embed_dim) * 1e-2)
        self.gamma_3 = nn.Parameter(torch.ones(embed_dim) * 1e-2)

    def forward(self, x, enc_out=None, self_mask=None, enc_mask=None):
        """Run the block.

        Args:
            x: decoder hidden states, ``[B, T, embed_dim]``.
            enc_out: encoder output to cross-attend to; cross-attention is
                skipped when ``None``.
            self_mask: ``None`` (causal mask only), a 2-D ``[B, T]`` padding
                keep-mask (combined with the causal mask), or a mask of higher
                rank that is passed to the attention unchanged.
            enc_mask: keep-mask over encoder positions; a 2-D ``[B, T_enc]``
                mask is expanded to ``[B, 1, 1, T_enc]``.
        """
        # Masked self-attention. The causal (lower-triangular) mask is always
        # applied unless the caller supplies a ready-made 3-D/4-D mask;
        # without it a decoder called with self_mask=None could attend to
        # future positions.
        if self_mask is None or self_mask.dim() == 2:
            seq_len = x.size(1)
            causal_mask = torch.tril(
                torch.ones((seq_len, seq_len), dtype=torch.bool, device=x.device)
            )
            if self_mask is None:
                self_mask = causal_mask[None, :, :]  # [1, T, T]
            else:
                # [B, T] padding mask -> [B, T, T] combined with causality
                self_mask = self_mask[:, None, :].bool() & causal_mask[None, :, :]

        # Normalise once and reuse the result for Q, K and V.
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, mask=self_mask)
        x = x + self.drop_path(self.gamma_1 * attn_out)

        # Cross-attention
        if enc_out is not None:
            if enc_mask is not None and enc_mask.dim() == 2:
                enc_mask = enc_mask[:, None, None, :]  # [B, 1, 1, T_enc]
            # norm2 is applied to both the queries and the encoder memory;
            # the memory is normalised once and shared by K and V.
            memory = self.norm2(enc_out)
            cross_out = self.cross_attn(self.norm2(x), memory, memory, mask=enc_mask)
            x = x + self.drop_path(self.gamma_2 * cross_out)

        # Feed-forward
        ffn_out = self.ffn(self.norm3(x))
        x = x + self.drop_path(self.gamma_3 * ffn_out)

        return x

class TransformerDecoderLLM(nn.Module):
    """Decoder stack: embedding, positional encoding, N blocks, norm, LM head."""

    def __init__(self, vocab_size, embed_dim=1024, num_layers=12, num_heads=16, dp=0.1,
                 drop_path=0.1, expansion=8, max_len=5000, use_swiglu=False):
        super().__init__()
        self.embedding = TokenEmbed(vocab_size, embed_dim)
        self.pos_encoding = PositionelEncod(embed_dim, max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                TransformerDecoderBlockLLM(
                    embed_dim=embed_dim,
                    num_heads=num_heads,
                    dp=dp,
                    drop_path=drop_path,
                    expansion=expansion,
                    use_swiglu=use_swiglu,
                )
            )
        self.layers = nn.ModuleList(blocks)

        self.norm = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, enc_out=None, self_mask=None, enc_mask=None):
        """Map target token ids ``x`` to vocabulary logits.

        ``enc_out``, ``self_mask`` and ``enc_mask`` are forwarded unchanged to
        every decoder block.
        """
        hidden = self.pos_encoding(self.embedding(x))
        for block in self.layers:
            hidden = block(hidden, enc_out=enc_out, self_mask=self_mask, enc_mask=enc_mask)
        return self.lm_head(self.norm(hidden))
    

class Seq2SeqLLM(nn.Module):
    """Glue module that runs an encoder and feeds its output to a decoder."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src_tokens, tgt_tokens, src_mask=None, tgt_mask=None):
        """Encode the source, then decode the target against it.

        ``src_mask`` is used both by the encoder and as the decoder's
        ``enc_mask``; ``tgt_mask`` becomes the decoder's ``self_mask``.
        Returns whatever the decoder returns.
        """
        memory = self.encoder(src_tokens, src_mask)
        return self.decoder(tgt_tokens, memory, self_mask=tgt_mask, enc_mask=src_mask)

In [3]:
# ---------- Helper constants ----------
# Special-token strings looked up in the vocabulary.
PAD = "<pad>"
BOS = "<bos>"
EOS = "<eos>"
UNK = "<unk>"

# ---------- Dataset ----------
class JsonlSeq2Seq(Dataset):
    """Seq2seq dataset read from a JSONL file with ``input``/``target`` fields.

    Every non-blank line must be a JSON object containing the string fields
    ``"input"`` and ``"target"``. Text is tokenised by whitespace and mapped
    to ids through ``stoi``.

    Args:
        path: path of the UTF-8 JSONL file.
        stoi: token -> id mapping.
        max_in: maximum encoded source length (special tokens included).
        max_out: maximum encoded target length (special tokens included).
    """

    def __init__(self, path:str, stoi:Dict[str,int], max_in=128, max_out=128):
        self.rows = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Skip blank lines (e.g. a trailing newline at the end of the
                # file) instead of letting json.loads fail on them.
                if not line.strip():
                    continue
                ex = json.loads(line)
                self.rows.append((ex["input"].strip(), ex["target"].strip()))
        self.stoi = stoi
        self.max_in = max_in
        self.max_out = max_out

    def _encode(self, text:str, add_bos=True, add_eos=True, max_len=128):
        """Encode ``text`` as a 1-D ``torch.long`` tensor of token ids.

        Unknown tokens map to the ``<unk>`` id. When a special token is
        missing from ``stoi`` the fallback ids 1 (bos), 2 (eos) and 3 (unk)
        are used. The result is cut to ``max_len`` ids after the special
        tokens were added, so a trailing ``<eos>`` is dropped from sequences
        that are too long.
        """
        toks = text.split()
        ids = []
        if add_bos: ids.append(self.stoi.get(BOS,1))
        ids.extend([self.stoi.get(t, self.stoi.get(UNK,3)) for t in toks])
        if add_eos: ids.append(self.stoi.get(EOS,2))
        ids = ids[:max_len]
        return torch.tensor(ids, dtype=torch.long)

    def __len__(self): return len(self.rows)

    def __getitem__(self, i):
        """Return ``(src, tgt_in, tgt_out)`` id tensors for example ``i``."""
        src_txt, tgt_txt = self.rows[i]
        src = self._encode(src_txt, add_bos=True, add_eos=True, max_len=self.max_in)
        # decoder input: <bos> + target tokens (no <eos> at end)
        tgt_in = self._encode(tgt_txt, add_bos=True, add_eos=False, max_len=self.max_out)
        # decoder label: target tokens + <eos> (no <bos>)
        tgt_out = self._encode(tgt_txt, add_bos=False, add_eos=True, max_len=self.max_out)
        return src, tgt_in, tgt_out

def pad_collate(batch, pad_id:int):
    """Collate ``(src, tgt_in, tgt_out)`` triples into three padded batches.

    Each of the three groups of 1-D id tensors is right-padded with
    ``pad_id`` to the length of its longest member and returned as a
    ``torch.long`` tensor of shape ``[len(batch), longest]``.
    """
    def _right_pad(tensors):
        width = max(t.size(0) for t in tensors)
        padded = torch.full((len(tensors), width), pad_id, dtype=torch.long)
        # Rows of ``padded`` are views, so writing into them fills the batch.
        for row, t in zip(padded, tensors):
            row[: t.size(0)] = t
        return padded

    sources, decoder_inputs, decoder_labels = zip(*batch)
    return _right_pad(sources), _right_pad(decoder_inputs), _right_pad(decoder_labels)

# ---------- Masks ----------
def make_pad_mask(ids: torch.Tensor, pad_id:int):
    """Return a boolean keep-mask: True where ``ids`` is not ``pad_id``."""
    keep = ids.ne(pad_id)
    return keep.to(device=ids.device)

# ---------- Label smoothing (optional) ----------
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing`` and the remaining
    ``smoothing`` is spread evenly over the other ``classes - 1`` classes.
    Positions whose target equals ``ignore_index`` contribute nothing, and the
    summed loss is divided by the number of non-ignored positions.

    Args:
        classes: vocabulary size (size of the last dimension of ``pred``).
        smoothing: probability mass moved away from the true class.
        ignore_index: target value marking positions to skip; it does not
            have to be a valid class index.
    """

    def __init__(self, classes:int, smoothing=0.1, ignore_index=-100):
        super().__init__()
        self.conf = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.ignore_index = ignore_index

    def forward(self, pred, target):
        """Return the mean smoothed loss.

        Args:
            pred: unnormalised logits, ``[B, T, V]``.
            target: class ids, ``[B, T]``.

        Returns:
            Scalar tensor; 0 when every position is ignored.
        """
        log_probs = pred.log_softmax(dim=-1)
        with torch.no_grad():
            valid = target.ne(self.ignore_index)
            # scatter_ needs in-range indices. Ignored positions may hold
            # values such as -100, so point them at class 0; their rows are
            # zeroed by the mask right below.
            safe_target = target.masked_fill(~valid, 0)
            # max(..., 1) avoids a ZeroDivisionError for a one-class problem.
            off_value = self.smoothing / max(self.cls - 1, 1)
            true_dist = torch.full_like(log_probs, off_value)
            true_dist.scatter_(-1, safe_target.unsqueeze(-1), self.conf)
            true_dist = true_dist * valid.unsqueeze(-1)
        # clamp keeps an all-ignored batch at 0 instead of 0 / 0 = NaN.
        n_valid = valid.sum().clamp(min=1)
        loss = torch.sum(-true_dist * log_probs) / n_valid
        return loss

# ---------- Build the model ----------
def build_model(vocab_size:int,
                embed_dim=256, num_heads=4,
                num_layers_enc=4, num_layers_dec=4,
                dp=0.1, drop_path=0.1, expansion=4, max_len=128, use_swiglu=False):
    """Build the encoder-decoder model with a tied decoder embedding / LM head.

    The encoder and the decoder receive identical hyper-parameters apart from
    their number of layers.
    """
    shared = dict(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        num_heads=num_heads,
        dp=dp,
        drop_path=drop_path,
        expansion=expansion,
        max_len=max_len,
        use_swiglu=use_swiglu,
    )
    encoder = TransformersEncoderLLM(num_layers=num_layers_enc, **shared)
    decoder = TransformerDecoderLLM(num_layers=num_layers_dec, **shared)
    model = Seq2SeqLLM(encoder, decoder)
    # Weight tying (fewer parameters): the LM head reuses the decoder's token
    # embedding matrix as its weight.
    decoder.lm_head.weight = decoder.embedding.embedding.weight
    return model

# ---------- Training ----------
@dataclass
class Config:
    # Dataset, vocabulary and checkpoint locations.
    train_path: str = r"C:\Users\hdgn5\OneDrive\Masaüstü\Kendi API'mız\- Kendi API'mizi Kullanalım -\Torch - LLM -\Dataset\train.jsonl"
    dev_path: str = r"C:\Users\hdgn5\OneDrive\Masaüstü\Kendi API'mız\- Kendi API'mizi Kullanalım -\Torch - LLM -\Dataset\dev.jsonl"
    vocab_path: str = r"C:\Users\hdgn5\OneDrive\Masaüstü\Kendi API'mız\- Kendi API'mizi Kullanalım -\Torch - LLM -\Dataset\vocab.json"
    out_path: str = "llm.pt"

    # Data and optimisation settings.
    max_in: int = 128
    max_out: int = 128
    batch_size: int = 16
    lr: float = 3e-4
    epochs: int = 6
    grad_accum: int = 1
    label_smoothing: float = 0.1

    # Model hyper-parameters.
    embed_dim: int = 256
    heads: int = 4
    layers_enc: int = 4
    layers_dec: int = 4
    dp: float = 0.1
    dpr: float = 0.1
    expansion: int = 4
    max_len: int = 128
    use_swiglu: bool = False

def main(cfg=None):
    """Train the seq2seq model described by ``cfg`` and save the best weights.

    Loads the vocabulary and the train/dev JSONL files named in ``cfg``,
    trains for ``cfg.epochs`` epochs with AdamW and label smoothing (mixed
    precision when CUDA is available), evaluates on the dev set after every
    epoch and writes the model ``state_dict`` to ``cfg.out_path`` whenever the
    dev loss improves.

    Args:
        cfg: a ``Config`` instance; a fresh default ``Config()`` is used when
            omitted.
    """
    # Build the default here: a ``Config()`` default argument would be created
    # once at definition time and shared (mutably) by every call.
    if cfg is None:
        cfg = Config()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)
    use_amp = device.type == "cuda"

    # vocab
    with open(cfg.vocab_path, "r", encoding="utf-8") as f:
        vocab = json.load(f)
    stoi, itos = vocab["stoi"], vocab["itos"]
    pad_id = stoi.get(PAD, 0)

    # data
    train_ds = JsonlSeq2Seq(cfg.train_path, stoi, cfg.max_in, cfg.max_out)
    dev_ds   = JsonlSeq2Seq(cfg.dev_path,   stoi, cfg.max_in, cfg.max_out)
    coll = lambda b: pad_collate(b, pad_id)
    train_dl = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, collate_fn=coll)
    dev_dl   = DataLoader(dev_ds,   batch_size=cfg.batch_size, shuffle=False, collate_fn=coll)

    # model
    model = build_model(
        vocab_size=len(itos), embed_dim=cfg.embed_dim, num_heads=cfg.heads,
        num_layers_enc=cfg.layers_enc, num_layers_dec=cfg.layers_dec,
        dp=cfg.dp, drop_path=cfg.dpr, expansion=cfg.expansion, max_len=cfg.max_len,
        use_swiglu=cfg.use_swiglu
    ).to(device)

    # loss & opt
    crit = LabelSmoothingLoss(classes=len(itos), smoothing=cfg.label_smoothing, ignore_index=pad_id)
    opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr)
    # torch.amp replaces the deprecated torch.cuda.amp entry points.
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    accum = max(1, cfg.grad_accum)

    best_dev = float("inf")

    for epoch in range(1, cfg.epochs+1):
        model.train()
        total = 0.0
        steps = 0
        t0 = time.time()
        n_batches = len(train_dl)
        opt.zero_grad(set_to_none=True)

        for i, (src, tgt_in, tgt_out) in enumerate(train_dl, start=1):
            src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)
            src_mask = (src != pad_id)
            tgt_mask = (tgt_in != pad_id)

            with torch.autocast(device_type=device.type, enabled=use_amp):
                logits = model(src, tgt_in, src_mask=src_mask, tgt_mask=tgt_mask)  # [B,T,V]
                loss = crit(logits, tgt_out)

            # Average (not sum) the gradients over the accumulation window so
            # the effective learning rate does not grow with grad_accum.
            scaler.scale(loss / accum).backward()
            # Also step on the last batch so a trailing partial window is not
            # carried over into the next epoch.
            if i % accum == 0 or i == n_batches:
                scaler.step(opt)
                scaler.update()
                opt.zero_grad(set_to_none=True)

            total += loss.item()
            steps += 1

        # max(1, ...) guards against an empty training loader.
        train_loss = total / max(1, steps)

        # --------- eval ----------
        model.eval()
        dev_total, dev_steps = 0.0, 0
        with torch.no_grad():
            for src, tgt_in, tgt_out in dev_dl:
                src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)
                src_mask = (src != pad_id)
                tgt_mask = (tgt_in != pad_id)
                logits = model(src, tgt_in, src_mask=src_mask, tgt_mask=tgt_mask)
                loss = crit(logits, tgt_out)
                dev_total += loss.item(); dev_steps += 1
        dev_loss = dev_total / max(1, dev_steps)

        # The reported time covers both the training and the evaluation pass.
        dt = time.time()-t0
        print(f"[Epoch {epoch}] train_loss={train_loss:.4f}  dev_loss={dev_loss:.4f}  time={dt:.1f}s")

        if dev_loss < best_dev:
            best_dev = dev_loss
            torch.save(model.state_dict(), cfg.out_path)
            print(f"   best dev → saved to {cfg.out_path}")

    print("Done. Best dev:", best_dev)

# Entry point: run training with the default configuration when this code is
# executed directly rather than imported.
if __name__ == "__main__":
    main()


Device: cuda


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=="cuda"))
  with torch.cuda.amp.autocast(enabled=(device.type=="cuda")):


[Epoch 1] train_loss=69.8171  dev_loss=15.6494  time=9.2s
   best dev → saved to llm.pt
[Epoch 2] train_loss=11.3528  dev_loss=8.2420  time=8.0s
   best dev → saved to llm.pt
[Epoch 3] train_loss=7.1956  dev_loss=5.7646  time=8.4s
   best dev → saved to llm.pt
[Epoch 4] train_loss=5.5081  dev_loss=4.7146  time=7.9s
   best dev → saved to llm.pt
[Epoch 5] train_loss=4.7248  dev_loss=4.2613  time=7.8s
   best dev → saved to llm.pt
[Epoch 6] train_loss=4.3402  dev_loss=3.9755  time=9.4s
   best dev → saved to llm.pt
Done. Best dev: 3.975476319973285
