In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle


device= 'cuda' if torch.cuda.is_available() else 'cpu'
print (device)
block_size=256
batch_size=64

max_iters= 50000
#eval_interval=2500
learning_rate= 1e-4
eval_iters=100
eval_interval=500
n_embd=512
n_head=8
n_layer=6
dropout = 0.1
torch.cuda.is_available(), torch.cuda.device_count(), torch.cuda.get_device_name(0)

cuda


(True, 1, 'NVIDIA GeForce RTX 3050 6GB Laptop GPU')

In [2]:
# BYTE-LEVEL VOCAB (FIXED)
vocab_size = 256
print("Using byte-level vocabulary of size", vocab_size)


Using byte-level vocabulary of size 256


In [3]:
# BYTE-LEVEL ENCODE / DECODE (FIXED)

def encode(text: str):
    # UTF-8 bytes → integers [0,255]
    return list(text.encode("utf-8"))

def decode(tokens):
    # integers [0,255] → UTF-8 text
    return bytes(tokens).decode("utf-8", errors="ignore")


In [4]:
train_file = open("output_train.txt", "rb")
val_file   = open("output_val.txt", "rb")

train_mm = mmap.mmap(train_file.fileno(), 0, access=mmap.ACCESS_READ)
val_mm   = mmap.mmap(val_file.fileno(),   0, access=mmap.ACCESS_READ)


def get_batch(split):
    mm = train_mm if split == "train" else val_mm
    file_size = len(mm)

    ix = torch.randint(
        0,
        file_size - block_size - 1,
        (batch_size,)
    )

    x = torch.stack([
        torch.frombuffer(
            mm[i : i + block_size],
            dtype=torch.uint8
        ).long()
        for i in ix
    ])

    y = torch.stack([
        torch.frombuffer(
            mm[i + 1 : i + block_size + 1],
            dtype=torch.uint8
        ).long().clamp(0, vocab_size - 1)
        for i in ix
    ])


    return x.to(device), y.to(device)


In [5]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out
    

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x
    
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(256, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        #logits=self.token_embedding_table(index)
        B,T= index.shape
        
        
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            index_cond = index[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = GPTLanguageModel(vocab_size)
print("Loading model parameters..")
#with open('model-01.pkl','rb') as f:
 #   model = pickle.load(f)
#print('loaded successfully!')

m = model.to(device)

Loading model parameters..


In [6]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [7]:
#pytorch optimizer

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,                
    betas=(0.9, 0.95),
    weight_decay=0.1
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=max_iters          
)


scaler = torch.amp.GradScaler("cuda", enabled=(device == "cuda"))



for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(
            f"step: {iter}, "
            f"train loss: {losses['train']:.3f}, "
            f"val loss: {losses['val']:.3f}"
        )

    xb, yb = get_batch("train")

    optimizer.zero_grad(set_to_none=True)

    # AMP forward + loss
    with torch.cuda.amp.autocast(enabled=(device == "cuda")):
        logits, loss = model(xb, yb)

    scaler.scale(loss).backward()

    # stabilize gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # optimizer step
    scaler.step(optimizer)
    scaler.update()

    scheduler.step()

print(loss.item())



  torch.frombuffer(


step: 0, train loss: 5.673, val loss: 5.675


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


step: 500, train loss: 2.259, val loss: 2.264
step: 1000, train loss: 2.240, val loss: 2.251
step: 1500, train loss: 2.239, val loss: 2.216
step: 2000, train loss: 2.214, val loss: 2.210
step: 2500, train loss: 2.198, val loss: 2.194
step: 3000, train loss: 2.178, val loss: 2.186
step: 3500, train loss: 2.147, val loss: 2.149
step: 4000, train loss: 2.087, val loss: 2.079
step: 4500, train loss: 2.047, val loss: 2.040
step: 5000, train loss: 2.016, val loss: 2.006
step: 5500, train loss: 1.987, val loss: 1.987
step: 6000, train loss: 1.965, val loss: 1.953
step: 6500, train loss: 1.949, val loss: 1.952
step: 7000, train loss: 1.937, val loss: 1.934
step: 7500, train loss: 1.950, val loss: 1.916
step: 8000, train loss: 1.917, val loss: 1.926
step: 8500, train loss: 1.931, val loss: 1.912
step: 9000, train loss: 1.916, val loss: 1.920
step: 9500, train loss: 1.922, val loss: 1.925
step: 10000, train loss: 1.903, val loss: 1.903
step: 10500, train loss: 1.884, val loss: 1.909
step: 11000,

In [8]:
with open('model-01-v2.pkl', 'wb') as f:
    pickle.dump(model, f)
print('model saved')

model saved


In [9]:
context =  torch.zeros((1,1),dtype=torch.long, device=device)
generated_chars = decode(m.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)

                                                 0991954e90fe0f38ct                                             00000000 0000000957 007000000000 000000001  IChes Oar Exubar Polic (DeabouPs by 17, xpicerdctations and sturalics nout agrourater the clicasor chaps in 296t aloan narery Freay prentorica and numinuttion Shative ain, and vouading, may the seely machions Sireater aplopos. The Irdme as Chatirnic-pecrcons,” Un: Chustial #

the sment to I partsolians that not, Moding inty modising 
