In [1]:
import torch
import os,sys
import ipdb
from tqdm import tqdm
from datetime import datetime
import platform, shutil
import requests, zipfile, io

# pytorch
import torch.nn as nn
import torch.nn.functional as F

# tokenizer
import sentencepiece as spm

# Performance: allow TF32 for CUDA matmuls and for cuDNN operations
for _tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _tf32_backend.allow_tf32 = True

# Start from a clean slate: drop any GPU memory cached by a previous run
torch.cuda.empty_cache()

In [2]:
# file_url = "https://ideami.com/llm_train"
# print("Wait bro file is downloading")
# response = requests.get(file_url)
# zipfile.ZipFile(io.BytesIO(response.content)).extractall(".")


In [None]:
# Architecture parameters

batch_size = 8  # number of sequences sampled per batch by get_batch
context = 512  # sequence length per sample; also the size of the position-embedding table
embed_size = 384  # width of the token / position embeddings
n_layers = 7  # number of transformer blocks stacked in GPT
n_heads = 7  # attention heads per block; head_size = embed_size // n_heads
BIAS = True  # passed as bias= to every nn.Linear in the model

# Optimisation hyperparameters
lr = 3e-4  # peak learning rate given to AdamW
dropout = 0.05 # dropout probability; regularization to reduce overfitting
weight_decay = 0.01 # AdamW weight decay, applied only to parameters with dim >= 2
grad_clip = 1.0 # max gradient norm; gradients are clipped to this before each optimizer step

# Training hyperparameters

train_iters = 100000 # total number of training iterations (optimizer steps), not epochs
eval_interval = 50 # run evaluation every this many iterations; a falling train loss with a flat/rising val loss signals overfitting
eval_iters = 10 # number of batches averaged per split during evaluation
compile = False # when True the model is wrapped with torch.compile (NOTE: this name shadows the built-in compile())
checkpoint_dir = "new-models/" # directory where checkpoints are written
checkpoint_fn = 'latest.pt'  # file name used when saving a checkpoint
checkpoint_load_fn = 'latest.pt'  # file name used when resuming from a checkpoint
dtype = torch.bfloat16  # the whole model is cast to this dtype
load_pretrained = False  # set True to resume from checkpoint_dir/checkpoint_load_fn if it exists

# MODE
inference = False # False while training; set True to enter the interactive generation loop instead

# Device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print (f"devie you'll be using {device}")





In [None]:
# Logging: optional Weights & Biases experiment tracking

wandb_log = True
wandb_project = "first_llm"
# Timestamp suffix keeps run names unique: year_month_day_hour_minute_second.
# The minute field must be %M; %m is the month and would simply repeat it.
wandb_run_name = "llm1" + datetime.now().strftime("%y_%m_%d_%H_%M_%S")

if wandb_log:
    # Imported lazily so the notebook still runs without wandb when logging is off
    import wandb
    wandb.init(project = wandb_project, name= wandb_run_name)


In [None]:
# Read the whole training corpus into memory as a single string

with open('wiki.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Preview the first 100 characters
text[:100]

In [None]:
# Load the previously trained SentencePiece tokenizer from disk

sp = spm.SentencePieceProcessor(model_file='wiki_tokenizer.model')
# Number of pieces in the tokenizer; used as the model's vocabulary size
vocab_size = sp.get_piece_size()
print(f"Tokenizer vocab_size is {vocab_size}")

In [7]:
def encode(x):
    """Tokenize text `x` with the SentencePiece processor and return the result."""
    return sp.Encode(x)


def decode(y):
    """Turn token ids `y` back into text with the SentencePiece processor."""
    return sp.Decode(y)

In [None]:
# Sanity check: encode a sentence, then decode the ids back to text
sample_ids = encode("I love you baby")
print(sample_ids)
print(decode(sample_ids))

In [None]:
# Tokenize the corpus once and cache the ids on disk; later runs reuse the cache
if not os.path.exists('encoded_data.pt'):
    data = torch.tensor(encode(text),dtype=torch.long)
    torch.save(data,'encoded_data.pt')
else:
    print('Loading encoding')
    data = torch.load('encoded_data.pt')

In [None]:
# Hold out the last 10% of the token stream for validation
data_size = len(data)
spl = int(0.9*data_size)
train_data, val_data = data[:spl], data[spl:]

print(f"Total Data is {data_size/1e6:.2f} Million | Train Data is {len(train_data)/1e6:.2f} Million | Val Data is {len(val_data)/1e6:.2f} Million")

In [11]:
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    `split == "train"` draws from `train_data`; any other value draws from
    `val_data`. Returns `(x, y)`, each of shape (batch_size, context) and moved
    to `device`, where `y` is `x` shifted one token to the right.
    """
    source = train_data if split == "train" else val_data
    # Random window starts, chosen so that start + context + 1 stays in range
    starts = torch.randint(len(source)-context, (batch_size,))
    inputs = torch.stack([source[s:s+context] for s in starts])
    targets = torch.stack([source[s+1:s+context+1] for s in starts])
    return inputs.to(device), targets.to(device)

In [None]:
# Draw one training batch and eyeball it: targets should be inputs shifted by one
x, y = get_batch("train")
print(x.shape, y.shape)
for first_row in (x[0], y[0]):
    print(first_row[:10])

In [13]:
class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    `n_layers` transformer blocks and a final LayerNorm, then projected to
    vocabulary-sized logits.
    """

    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size) # vocab_size x embed_size
        self.positions = nn.Embedding(context, embed_size) # context x embed_size
        self.blocks = nn.Sequential(*[Blocks(n_heads) for _ in range(n_layers)])
        self.ln = nn.LayerNorm(embed_size)
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, target=None):
        """Compute next-token logits and, if `target` is given, the loss.

        Args:
            input: (BS, SL) tensor of token ids, with SL <= context.
            target: optional (BS, SL) tensor of next-token ids.

        Returns:
            (logits, loss). Without `target`, logits has shape (BS, SL, VS)
            and loss is None. With `target`, logits is flattened to
            (BS*SL, VS) and loss is the mean cross-entropy.
        """
        loss = None
        BS, SL = input.shape
        emb = self.embeddings(input) # BS x SL x embed_size
        # Build the position ids on the input's own device so the model does
        # not depend on the notebook-level `device` variable.
        pos = self.positions(torch.arange(SL, device=input.device)) # SL x embed_size
        x = emb + pos # BS x SL x embed_size (pos broadcasts over the batch)
        x = self.blocks(x)
        x = self.ln(x)
        # Logits are unnormalised scores over the vocabulary; softmax (applied
        # inside cross_entropy / in generate) turns them into probabilities.
        logits = self.final_linear(x) # BS x SL x vocab_size

        if target is not None:
            BS, SL, VS = logits.shape
            logits = logits.view(BS*SL, VS)
            target = target.view(BS*SL)
            # cross_entropy == mean of -log softmax(logits)[target]; no need to
            # recompute it by hand with an unstabilised exp().
            loss = F.cross_entropy(logits, target)

        return logits, loss

    def generate(self, input, max = 500):
        """Autoregressively sample `max` new tokens after `input`.

        Args:
            input: (BS, SL) tensor of prompt token ids.
            max: number of tokens to sample.

        Returns:
            (BS, SL + max) tensor: the full prompt followed by the sampled tokens.
        """
        for _ in range(max):
            # The model can only attend to `context` tokens, so feed it the
            # most recent window, but keep the full sequence in `input` so
            # earlier tokens are not dropped from the returned result.
            window = input[:, -context:]
            logits, _ = self(window)
            logits = logits[:, -1, :] # BS x vocab_size: scores for the token after the last one
            probs = F.softmax(logits, dim=-1) # BS x vocab_size probabilities
            # Sample one token id per sequence from the predicted distribution
            next_token = torch.multinomial(probs, num_samples=1)
            input = torch.cat((input, next_token), dim=1)

        return input

In [14]:
class Blocks(nn.Module):
    """One pre-norm transformer block: attention then feed-forward, each with a residual add."""

    def __init__(self,n_heads):
        super().__init__()
        # Each head gets an equal integer share of the embedding width
        self.ma = Multihead(n_heads, embed_size // n_heads)
        self.feed_forward = ForwardLayer(embed_size)
        self.ln1, self.ln2 = nn.LayerNorm(embed_size), nn.LayerNorm(embed_size)

    def forward(self,x):
        """Apply both sub-layers to `x`; the output has the same shape as the input."""
        # Residual (skip) connections: each sub-layer's output is added to its input
        attended = x + self.ma(self.ln1(x))
        return attended + self.feed_forward(self.ln2(attended))

In [15]:
class Multihead(nn.Module):
    """Run `n_heads` attention heads side by side and project their concatenation back to `embed_size`."""

    def __init__(self,n_heads,head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        # The concatenated width is n_heads * head_size, which can be smaller than
        # embed_size when it does not divide evenly (e.g. 384 // 7 = 54, 7 * 54 = 378);
        # this projection maps it back to embed_size.
        self.combine = nn.Linear(n_heads * head_size, embed_size, bias=BIAS)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        """Map (BS, SL, embed_size) to (BS, SL, embed_size)."""
        # Every head yields (BS, SL, head_size); join them along the feature axis
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim = -1)
        return self.dropout(self.combine(merged))


In [16]:
class ForwardLayer(nn.Module):
    """Position-wise feed-forward network: expand 6x, GELU, project back, dropout."""

    def __init__(self,embed_size):
        super().__init__()
        hidden_size = 6*embed_size
        layers = [
            nn.Linear(embed_size, hidden_size, bias=BIAS),
            nn.GELU(),
            nn.Linear(hidden_size, embed_size, bias=BIAS),
            nn.Dropout(dropout),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self,x):
        """Apply the network to the last dimension of `x`; the shape is preserved."""
        return self.network(x)

In [17]:
class Head(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention."""

    def __init__(self,head_size):
        super().__init__()
        self.queries = nn.Linear(embed_size,head_size,bias=BIAS)
        self.keys = nn.Linear(embed_size,head_size,bias=BIAS)
        self.values = nn.Linear(embed_size,head_size,bias=BIAS)

        # Lower-triangular matrix of ones: position i may attend to positions <= i.
        # Registered as a buffer so it follows the module across .to(...) calls.
        self.register_buffer('tril', torch.tril(torch.ones(context,context)))
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        """Attend over the sequence.

        Args:
            x: (BS, SL, embed_size) tensor with SL <= context.

        Returns:
            (BS, SL, head_size) tensor of attention-weighted values.
        """
        BS, SL, _ = x.shape
        q = self.queries(x) # BS, SL, head_size
        k = self.keys(x) # BS, SL, head_size
        v = self.values(x) # BS, SL, head_size

        # Scaled dot-product scores. The scores are DIVIDED by sqrt(head_size)
        # (i.e. multiplied by head_size**-0.5) so their variance does not grow
        # with the head size and saturate the softmax.
        attn_w = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # BS, SL, SL

        # Causal mask: future positions get -inf so softmax assigns them zero
        # weight. The result must be assigned back to attn_w to take effect.
        attn_w = attn_w.masked_fill(self.tril[:SL,:SL] == 0, float('-inf'))
        attn_w = F.softmax(attn_w, dim = -1) # BS, SL, SL; each row sums to 1
        # Regularise the attention pattern (no-op in eval mode)
        attn_w = self.dropout(attn_w)

        return attn_w @ v # (BS, SL, SL) @ (BS, SL, head_size) -> BS, SL, head_size

        

In [18]:
# x,y = get_batch('train')
# model = GPT()
# model = model.to(dtype)
# model = model.to(device)

# logits, loss = model(x,y)
# print(loss.item())

In [None]:
# Training Setup: build the model, cast it to the training dtype, move it to the device

model = GPT().to(dtype).to(device)

if compile:
    print("TORCH :: Compiling Model")
    model = torch.compile(model)

# Report the model size in millions of parameters
n_params = sum(p.numel() for p in model.parameters())
print(n_params / 1e6, "Milion Parameters")

In [None]:
# Average loss over a few batches of each split
@torch.no_grad()
def calculate_loss():
    """Estimate the mean loss on the train and validation data.

    Switches the model to eval mode, averages the loss of `eval_iters` random
    batches per split, then switches back to train mode.

    Returns:
        dict with float entries under the keys 'train' and 'eval'.
    """
    model.eval()
    averages = {}
    for split in ('train', 'eval'):
        # get_batch treats anything other than "train" as the validation split
        batch_losses = torch.tensor(
            [model(*get_batch(split))[1].item() for _ in range(eval_iters)]
        )
        averages[split] = batch_losses.mean().item()
    model.train()
    return averages

print(calculate_loss())

In [21]:
# Setting up the optimizer

# Map parameter name -> tensor for every parameter that requires gradients
p_dict = dict((name, param) for name, param in model.named_parameters() if param.requires_grad)

# Weight decay shrinks weights a little on every step, which discourages very
# large values. It is applied only to tensors with 2 or more dimensions (weight
# matrices, embedding tables); 1-D tensors (biases, LayerNorm parameters) are
# left undecayed.
weight_decay_p = [param for param in p_dict.values() if param.dim() >= 2]
no_weight_decay_p = [param for param in p_dict.values() if param.dim() < 2]

# Two parameter groups: one decayed with `weight_decay`, one with no decay
optimizer_groups = [
    dict(params=weight_decay_p, weight_decay=weight_decay),
    dict(params=no_weight_decay_p, weight_decay=0.0),
]

# AdamW with explicit betas (running-average coefficients for the gradient and its square)
optimizer = torch.optim.AdamW(optimizer_groups, lr=lr, betas=(0.9,0.99))

# Cosine schedule: the learning rate decays smoothly from `lr` over `train_iters`
# steps and never drops below eta_min = lr/10.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=train_iters, eta_min=lr/10)

start_iteration = 0
best_val_loss = float('inf') # lowest validation loss seen so far


In [22]:
# Loading checkpoints

def load_checkpoint(path):
    """Restore model and optimizer state from the checkpoint file at `path`.

    Args:
        path: path to a file written by the training loop's torch.save call.

    Returns:
        (iteration, loss): the iteration and validation loss stored in the checkpoint.

    Raises:
        KeyError: if the checkpoint holds no model weights under a known key.
    """
    print("LLM Loading Checkpoint")
    # NOTE(review): torch.load unpickles data; only load checkpoints you trust.
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
    checkpoint = torch.load(path, map_location=device)
    # The training loop historically saved the weights under the misspelled key
    # 'model_state_dic'; accept both spellings so existing checkpoints still load.
    model_state = checkpoint.get('model_state_dict', checkpoint.get('model_state_dic'))
    if model_state is None:
        raise KeyError("checkpoint has no 'model_state_dict' entry")
    model.load_state_dict(model_state)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    iteration = checkpoint['iteration']
    loss = checkpoint['loss']
    print(f"Loaded iter {iteration} with loss {loss}")
    return iteration,loss

# Build the path once so the existence check and the load use the same file.
checkpoint_load_path = os.path.join(checkpoint_dir, checkpoint_load_fn)

# NOTE: the checkpoint stores no scheduler state, so after resuming the
# learning-rate schedule starts again from its initial value.
if load_pretrained and os.path.exists(checkpoint_load_path):
    start_iteration, loss = load_checkpoint(checkpoint_load_path)
    best_val_loss = loss

In [23]:
# Generate a new sample
@torch.no_grad()
def generate_sample(input):
    """Continue the prompt string `input` by 64 sampled tokens and print the decoded text."""
    # Tokenize the prompt and add a leading batch dimension: (1, n_tokens)
    prompt_ids = torch.tensor(encode(input), dtype=torch.long, device=device).unsqueeze(0)
    # Sample the continuation, then take the only sequence in the batch as a list of ids
    generated_ids = model.generate(prompt_ids, max=64)[0].tolist()
    # Turn the ids back into text with the tokenizer
    result = decode(generated_ids)
    print(f"{result}")

In [None]:
# Inference: interactive prompt loop, active only when `inference` is True

if inference:
    model.eval()
    while True:
        qs = input("Enter you text here (q to quit): ")
        if qs == "q":
            break
        # Empty input is ignored and the prompt is shown again
        if qs != "":
            generate_sample(qs)
    #sys.exit()

In [None]:
!nvidia-smi

In [None]:
# Training Loop
#
# Each iteration: sample a batch, run the forward pass, periodically evaluate
# (saving a checkpoint when the validation loss improves, and logging to
# wandb), then backpropagate, clip gradients, and step the optimizer and the
# learning-rate scheduler.

# torch.save does not create missing directories, so make sure it exists.
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, checkpoint_fn)

try:
    for i in tqdm(range(start_iteration,train_iters)):
        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)

        # Evaluate every `eval_interval` iterations and on the final iteration
        if i % eval_interval == 0 or i == train_iters - 1:
            l = calculate_loss()
            print(f"\n{i} train loss: {l['train']} / val loss: {l['eval']}")
            # generate_sample("once upon a time")

            # Keep only the best model seen so far (lowest validation loss)
            if l['eval'] < best_val_loss:
                best_val_loss = l['eval']
                print(f"[Checkpoint]: Saving with loss: {best_val_loss}")
                torch.save({
                    # Key must match what load_checkpoint reads
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss' : best_val_loss,
                    'iteration' : i,
                }, checkpoint_path)

            if wandb_log:
                wandb.log({
                    "loss/train": l['train'],
                    "loss/eval" : l['eval'],
                    "lr" : scheduler.get_last_lr()[0],
                },
                step = i)

        optimizer.zero_grad(set_to_none=True)

        loss.backward()
        # Cap the global gradient norm to keep updates stable
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)

        optimizer.step()
        scheduler.step()

except KeyboardInterrupt:
    print("Traning Interupted... Cleaning up!!")

finally:
    # Close the wandb run whether training finished, failed, or was interrupted
    if wandb_log:
        wandb.finish()

    # Release GPU memory
    torch.cuda.empty_cache()
    print("GPU memory released")
    #sys.exit(0)