In [1]:
!wget -O input.txt https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt

with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()


--2025-03-29 19:30:57--  https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4573338 (4.4M) [text/plain]
Saving to: ‘input.txt’


2025-03-29 19:30:57 (17.8 MB/s) - ‘input.txt’ saved [4573338/4573338]



In [2]:
print("length of dataset in characters: ", len(text))

print(text[1000:2000])

length of dataset in characters:  4573338
Second Citizen:
Would you proceed especially against Caius Marcius?

All:
Against him first: he's a very dog to the commonalty.

Second Citizen:
Consider you what services he has done for his country?

First Citizen:
Very well; and could be content to give him good
report fort, but that he pays himself with being proud.

Second Citizen:
Nay, but speak not maliciously.

First Citizen:
I say unto you, what he hath done famously, he did
it to that end: though soft-conscienced men can be
content to say it was for his country he did it to
please his mother and to be partly proud; which he
is, even till the altitude of his virtue.

Second Citizen:
What he cannot help in his nature, you account a
vice in him. You must in no way say he is covetous.

First Citizen:
If I must not, I need not be barren of accusations;
he hath faults, with surplus, to tire in repetition.
What shouts are these? The other side o' the city
is risen: why stay we prating here? 

In [3]:
!pip install tiktoken

import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
import time
import math



In [4]:
class DataLoader:
    def __init__(self, B, T):
        self.B = B
        self.T = T

        enc = tiktoken.get_encoding("gpt2")
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f"Loaded {len(tokens)} tokens")
        print(f"1 epoch = {len(tokens) // (B * T)} batches")

        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buffer = self.tokens[self.current_position : self.current_position + B * T + 1]
        x = (buffer[:-1]).view(B, T) # inputs
        y = (buffer[1:]).view(B, T) # targets

        self.current_position += B * T

        if self.current_position + B * T > len(self.tokens):
            self.current_position = 0

        return x, y



In [11]:
# Model hyperparameters
n_embed = 768          # Size of embeddings
n_head = 12            # Number of attention heads
n_layer = 12           # Number of transformer layers
batch_size = 4         # Batch size
block_size = 1024      # Context length
max_iters = 1005       # Training iterations
eval_interval = 500    # Evaluation interval
learning_rate = 5e-4   # Learning rate
eval_iters = 200       # Evaluation steps for validation
dropout = 0.1          # Dropout probability

vocab_size = 50304     # Vocabulary size :: Originally = 50257, update to nice number 50304 -> divisible by 128


device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = "mps"

print(f"Using device {device}")


Using device cuda


In [12]:
class CasualSelfAttention(nn.Module):
    def __init__(self):
        super().__init__()
        assert n_embed % n_head == 0, "Embedding dimension must be divisible by number of heads"

        self.n_head = n_head
        self.n_embed = n_embed

        self.c_attn = nn.Linear(n_embed, 3 * n_embed) # for all 3 (key, query, value)
        self.c_proj = nn.Linear(n_embed, n_embed)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embed, dim = 2) # 3 -> (B, T, n_embed)

        head_size = C // self.n_head
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)

        # Flash Attention
        y = F.scaled_dot_product_attention(q, k, v, is_causal = True)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # Reshape back to (B, T, n_embed)
        y = self.c_proj(y)

        return y


class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.c_fc = nn.Linear(n_embed, 4 * n_embed)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * n_embed, n_embed)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_head
        # self.attention = MultiHeadAttention(n_head, head_size) # 4 heads of 8-dimensional self-attention
        # self.fforward = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.attention = CasualSelfAttention()
        self.ln2 = nn.LayerNorm(n_embed)
        self.mlp = MLP()

    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        # x = x + self.fforward(self.ln2(x))
        x = x + self.mlp(self.ln2(x))
        return x



class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        # self.blocks = nn.Sequential(*[Block(n_embed, n_head = n_head) for _ in range(n_layer)])
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.language_model_head = nn.Linear(n_embed, vocab_size, bias = False)

        self.language_model_head.weight = self.token_embedding_table.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std = std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)

    def forward(self, idx, targets = None):
        B, T = idx.size()
        assert T <= block_size, f"Sequence length {T} exceeds block size {block_size}"

        token_embeddings = self.token_embedding_table(idx) #(Batch, Time, Channel)
        position_embeddings = self.position_embedding_table(torch.arange(0, T, dtype = torch.long, device = idx.device)) #(Time, Channel)
        x = token_embeddings + position_embeddings

        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.language_model_head(x) # (B, T, vocab_size)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx -> (B, T)
        with torch.no_grad():
            for i in range(max_new_tokens):
                idx_cond = idx[:, -block_size:] # (B, T)
                logits, loss = self(idx_cond) # (B, T, C)
                logits = logits[:, -1, :] # last time step only | becomes (B, C)
                prob = F.softmax(logits, dim = -1)
                idx_next = torch.multinomial(prob, num_samples = 1) # predicted | (B, 1)
                idx = torch.cat((idx, idx_next), dim = 1) # (B, T + 1)
        return idx

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)

        if master_process:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"

        if master_process:
            print(f"using fused AdamW: {use_fused}")
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer



model = GPT(vocab_size)
model = model.to(device = device)
# model = torch.compile(model) # Works with A100, not much help for T4

# logits, loss = model(x_batch, y_batch)

# print(logits.shape)
# print(loss)

# idx = torch.zeros((1, 1), dtype = torch.long).to(device)
# print(decode(model.generate(idx, max_new_tokens = 100)[0].tolist()))

In [13]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")


Total number of parameters: 124475904


In [8]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

num_params = count_parameters(model)
print(f"The model has {num_params} trainable parameters.")

def count_parameters_by_layer(model):
    total_params = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            num_params = param.numel()
            print(f"Layer: {name}, Parameters: {num_params}")
            total_params += num_params
    print(f"Total Trainable Parameters: {total_params}")

count_parameters_by_layer(model)

The model has 124439808 trainable parameters.
Layer: token_embedding_table.weight, Parameters: 38597376
Layer: position_embedding_table.weight, Parameters: 786432
Layer: blocks.0.ln1.weight, Parameters: 768
Layer: blocks.0.ln1.bias, Parameters: 768
Layer: blocks.0.attention.c_attn.weight, Parameters: 1769472
Layer: blocks.0.attention.c_attn.bias, Parameters: 2304
Layer: blocks.0.attention.c_proj.weight, Parameters: 589824
Layer: blocks.0.attention.c_proj.bias, Parameters: 768
Layer: blocks.0.ln2.weight, Parameters: 768
Layer: blocks.0.ln2.bias, Parameters: 768
Layer: blocks.0.mlp.c_fc.weight, Parameters: 2359296
Layer: blocks.0.mlp.c_fc.bias, Parameters: 3072
Layer: blocks.0.mlp.c_proj.weight, Parameters: 2359296
Layer: blocks.0.mlp.c_proj.bias, Parameters: 768
Layer: blocks.1.ln1.weight, Parameters: 768
Layer: blocks.1.ln1.bias, Parameters: 768
Layer: blocks.1.attention.c_attn.weight, Parameters: 1769472
Layer: blocks.1.attention.c_attn.bias, Parameters: 2304
Layer: blocks.1.attention

In [17]:
warmup_steps = 10      # learning rate increase upto this step
max_learning_rate = 6e-4
min_learning_rate = 0.1 * max_learning_rate
max_steps = 50

def get_learning_rate(it):
    if it < warmup_steps:
        return max_learning_rate * (it + 1) / warmup_steps

    if it > max_steps:
        return min_learning_rate

    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_learning_rate + coeff * (max_learning_rate - min_learning_rate)

# optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4, betas = (0.9, 0.95), eps = 1e-8)
optimizer = model.configure_optimizers(weight_decay = 0.1, learning_rate = 6e-4, device = device)


# In order to simulate GPT-3 0.5M batch size
total_batch_size = 524288
assert total_batch_size % (B * T) == 0, "check total_batch_size portion again"
grad_accumulation_steps = total_batch_size // (B * T)
print(f"Gradient accumulation steps: {grad_accumulation_steps}")
print(f"=> calculated gradient accumulation steps = {grad_accumulation_steps}")



# try to overfit model over single batch
train_loader = DataLoader(B = batch_size, T = block_size)

torch.set_float32_matmul_precision('high')


x, y = train_loader.next_batch()
x, y = x.to(device), y.to(device)


for step in range(50):
#     if (step % 25 == 0):
#         losses = estimate_loss()
#         print(f"step: {step}, train loss: {losses['train']}, val loss: {losses['val']:.4f}")


    t0 = time.time()
    optimizer.zero_grad()

    loss_accum = 0.0
    for micro_step in range(grad_accumulation_steps):
        x_batch, y_batch = x, y
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)


        logits, loss = model(x_batch, y_batch)
        loss = loss / grad_accumulation_steps
        loss_accum += loss.detach()
        loss.backward()

    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    learning_rate = get_learning_rate(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    optimizer.step()


    torch.cuda.synchronize()
    T = (time.time() - t0) * 1000
    tokens_per_second = (train_loader.B * train_loader.T) / T * 1000
    print(f"Step {step}, Loss: {loss.item()} | Learning Rate = {learning_rate:.4f} | Norm = {norm:.4f} | Time: {T:.2f}ms | tok/sec = {tokens_per_second}")

print(loss.item())

Loaded 1395481 tokens
1 epoch = 340 batches
Step 0, Loss: 7.190181255340576, Time: 1172.14ms, tok/sec = 3494.4513840410973
Step 1, Loss: 8.258306503295898, Time: 1000.24ms, tok/sec = 4095.021718082066
Step 2, Loss: 7.2768354415893555, Time: 992.76ms, tok/sec = 4125.855691454731
Step 3, Loss: 6.7461113929748535, Time: 993.98ms, tok/sec = 4120.821384299503
Step 4, Loss: 10.635149002075195, Time: 1000.34ms, tok/sec = 4094.602039785153
Step 5, Loss: 6.351304054260254, Time: 1003.89ms, tok/sec = 4080.1252413241464
Step 6, Loss: 6.2443413734436035, Time: 1007.30ms, tok/sec = 4066.3249595908837
Step 7, Loss: 6.077468395233154, Time: 1003.48ms, tok/sec = 4081.8061925605543
Step 8, Loss: 5.961909294128418, Time: 1013.84ms, tok/sec = 4040.100580927617


KeyboardInterrupt: 

In [None]:
train_loader = DataLoader(B = batch_size, T = block_size, split = 'train')


for iter in range(100):
    # if (iter % eval_interval == 0):
    #     losses = estimate_loss()
    #     print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']:.4f}")

    t0 = time.time()
    x_batch, y_batch = train_loader.next_batch()
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)

    optimizer.zero_grad()
    logits, loss = model(x_batch, y_batch)
    loss.backward()
    optimizer.step()

    torch.cuda.synchronize()
    T = (time.time() - t0) * 1000
    print(f"Step {iter}, Loss: {loss.item()}, Time: {T:.2f}ms")

print(loss.item())

6.053647518157959


In [None]:
torch.save(model.state_dict(), 'gpt2_model.pth')


In [None]:
model = GPT(vocab_size)  # Initialize model architecture

# If using CPU
# model.load_state_dict(torch.load('ngram_language_model.pth', map_location=torch.device('cpu')))

# If using GPU
model.load_state_dict(torch.load('gpt2_model.pth'))  # Load saved parameters
model = model.to(device)  # Move to GPU if available

# **FINAL CODE**

In [None]:
# Download and load dataset
!wget -O input.txt https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt

with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()



# Imports
!pip install tiktoken

import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
import time
import math



# Custom Dataloader
class DataLoader:
    def __init__(self, B, T):
        self.B = B
        self.T = T

        enc = tiktoken.get_encoding("gpt2")
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f"Loaded {len(tokens)} tokens")
        print(f"1 epoch = {len(tokens) // (B * T)} batches")

        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buffer = self.tokens[self.current_position : self.current_position + B * T + 1]
        x = (buffer[:-1]).view(B, T) # inputs
        y = (buffer[1:]).view(B, T) # targets

        self.current_position += B * T

        if self.current_position + B * T > len(self.tokens):
            self.current_position = 0

        return x, y



# Hyperparameters and device selection
n_embed = 768          # Size of embeddings
n_head = 12            # Number of attention heads
n_layer = 12           # Number of transformer layers
batch_size = 4         # Batch size
block_size = 1024      # Context length
max_iters = 1005       # Training iterations
eval_interval = 500    # Evaluation interval
learning_rate = 5e-4   # Learning rate
eval_iters = 200       # Evaluation steps for validation
dropout = 0.1          # Dropout probability

vocab_size = 50304     # Vocabulary size :: Originally = 50257, update to nice number 50304 -> divisible by 128


device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = "mps"

print(f"Using device {device}")



# GPT Model
class CasualSelfAttention(nn.Module):
    def __init__(self):
        super().__init__()
        assert n_embed % n_head == 0, "Embedding dimension must be divisible by number of heads"

        self.n_head = n_head
        self.n_embed = n_embed

        self.c_attn = nn.Linear(n_embed, 3 * n_embed) # for all 3 (key, query, value)
        self.c_proj = nn.Linear(n_embed, n_embed)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embed, dim = 2) # 3 -> (B, T, n_embed)

        head_size = C // self.n_head
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)

        # Flash Attention
        y = F.scaled_dot_product_attention(q, k, v, is_causal = True)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # Reshape back to (B, T, n_embed)
        y = self.c_proj(y)

        return y


class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.c_fc = nn.Linear(n_embed, 4 * n_embed)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * n_embed, n_embed)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_head
        # self.attention = MultiHeadAttention(n_head, head_size) # 4 heads of 8-dimensional self-attention
        # self.fforward = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.attention = CasualSelfAttention()
        self.ln2 = nn.LayerNorm(n_embed)
        self.mlp = MLP()

    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        # x = x + self.fforward(self.ln2(x))
        x = x + self.mlp(self.ln2(x))
        return x



class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        # self.blocks = nn.Sequential(*[Block(n_embed, n_head = n_head) for _ in range(n_layer)])
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.language_model_head = nn.Linear(n_embed, vocab_size, bias = False)

        self.language_model_head.weight = self.token_embedding_table.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std = std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)

    def forward(self, idx, targets = None):
        B, T = idx.size()
        assert T <= block_size, f"Sequence length {T} exceeds block size {block_size}"

        token_embeddings = self.token_embedding_table(idx) #(Batch, Time, Channel)
        position_embeddings = self.position_embedding_table(torch.arange(0, T, dtype = torch.long, device = idx.device)) #(Time, Channel)
        x = token_embeddings + position_embeddings

        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.language_model_head(x) # (B, T, vocab_size)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx -> (B, T)
        with torch.no_grad():
            for i in range(max_new_tokens):
                idx_cond = idx[:, -block_size:] # (B, T)
                logits, loss = self(idx_cond) # (B, T, C)
                logits = logits[:, -1, :] # last time step only | becomes (B, C)
                prob = F.softmax(logits, dim = -1)
                idx_next = torch.multinomial(prob, num_samples = 1) # predicted | (B, 1)
                idx = torch.cat((idx, idx_next), dim = 1) # (B, T + 1)
        return idx

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)

        if master_process:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"

        if master_process:
            print(f"using fused AdamW: {use_fused}")
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer


model = GPT(vocab_size)
model = model.to(device = device)
model = torch.compile(model) # Works with A100, not much help for T4

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")



# Training loop
warmup_steps = 10      # learning rate increase upto this step
max_learning_rate = 6e-4
min_learning_rate = 0.1 * max_learning_rate
max_steps = 50

def get_learning_rate(it):
    if it < warmup_steps:
        return max_learning_rate * (it + 1) / warmup_steps

    if it > max_steps:
        return min_learning_rate

    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_learning_rate + coeff * (max_learning_rate - min_learning_rate)

# optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4, betas = (0.9, 0.95), eps = 1e-8)
optimizer = model.configure_optimizers(weight_decay = 0.1, learning_rate = 6e-4, device = device)


# In order to simulate GPT-3 0.5M batch size
total_batch_size = 524288
assert total_batch_size % (B * T) == 0, "check total_batch_size portion again"
grad_accumulation_steps = total_batch_size // (B * T)
print(f"Gradient accumulation steps: {grad_accumulation_steps}")
print(f"=> calculated gradient accumulation steps = {grad_accumulation_steps}")



# try to overfit model over single batch
train_loader = DataLoader(B = batch_size, T = block_size)

torch.set_float32_matmul_precision('high')


for step in range(50):
    t0 = time.time()
    optimizer.zero_grad()

    loss_accum = 0.0
    for micro_step in range(grad_accumulation_steps):
        x_batch, y_batch = train_loader.next_batch()
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)


        logits, loss = model(x_batch, y_batch)
        loss = loss / grad_accumulation_steps
        loss_accum += loss.detach()
        loss.backward()

    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    learning_rate = get_learning_rate(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    optimizer.step()


    torch.cuda.synchronize()
    T = (time.time() - t0) * 1000
    tokens_per_second = (train_loader.B * train_loader.T) / T * 1000
    print(f"Step {step}, Loss: {loss.item()} | Learning Rate = {learning_rate:.4f} | Norm = {norm:.4f} | Time: {T:.2f}ms | tok/sec = {tokens_per_second}")

print(loss.item())
