In [None]:
pip install torch



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
#Hyperparameters
VOCAB_SIZE=5000
EMBED_DIM=256
NUM_HEADS=8
NUM_LAYERS=6
MAX_SEQ_LEN=128
FF_DIM=1024
DROPOUT=0.1

In [None]:
#Self-Attention
class SelfAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.qkv = nn.Linear(EMBED_DIM, EMBED_DIM * 3)
        self.out = nn.Linear(EMBED_DIM, EMBED_DIM)
        self.head_dim = EMBED_DIM // NUM_HEADS

    def forward(self, x):
        B, T, C = x.shape

        qkv = self.qkv(x)
        q, k, v = qkv.chunk(3, dim=-1)

        q = q.view(B, T, NUM_HEADS, self.head_dim).transpose(1, 2)
        k = k.view(B, T, NUM_HEADS, self.head_dim).transpose(1, 2)
        v = v.view(B, T, NUM_HEADS, self.head_dim).transpose(1, 2)

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        mask = torch.tril(torch.ones(T, T)).to(x.device)
        scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = F.softmax(scores, dim=-1)
        out = attn @ v

        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.out(out)

In [None]:
#Transformer-Block
class TransformerBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.attn = SelfAttention()
        self.ff = nn.Sequential(
            nn.Linear(EMBED_DIM, FF_DIM),
            nn.ReLU(),
            nn.Linear(FF_DIM, EMBED_DIM)
        )
        self.ln1 = nn.LayerNorm(EMBED_DIM)
        self.ln2 = nn.LayerNorm(EMBED_DIM)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        x = x + self.dropout(self.attn(self.ln1(x)))
        x = x + self.dropout(self.ff(self.ln2(x)))
        return x

In [None]:
#GPT Style LLM
class MiniGPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
        self.pos_emb = nn.Embedding(MAX_SEQ_LEN, EMBED_DIM)

        self.blocks = nn.ModuleList([
            TransformerBlock() for _ in range(NUM_LAYERS)
        ])

        self.ln_f = nn.LayerNorm(EMBED_DIM)
        self.head = nn.Linear(EMBED_DIM, VOCAB_SIZE)

    def forward(self, idx):
        B, T = idx.shape
        positions = torch.arange(T, device=idx.device)

        x = self.token_emb(idx) + self.pos_emb(positions)

        for block in self.blocks:
            x = block(x)

        x = self.ln_f(x)
        logits = self.head(x)
        return logits

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -MAX_SEQ_LEN:]
            logits = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, 1)
            idx = torch.cat([idx, next_token], dim=1)
        return idx

In [None]:
#Training loop
model = MiniGPT()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

for step in range(1000):
    x = torch.randint(0, VOCAB_SIZE, (32, 64))
    y = x.clone()

    logits = model(x)
    loss = F.cross_entropy(
        logits.view(-1, VOCAB_SIZE),
        y.view(-1)
    )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print("Loss:", loss.item())

Loss: 8.661161422729492
Loss: 4.571882247924805
Loss: 0.9757809042930603
Loss: 0.17330001294612885
Loss: 0.07771746814250946
Loss: 0.04615394026041031
Loss: 0.03115539811551571
Loss: 0.023439552634954453
Loss: 0.01808004267513752
Loss: 0.014514915645122528
