In [5]:
# Complete tinyGPT
# Then build and train gpt2
# Papers - Attention is all you need
# GPT 2/3 paper

# Llamac/llama2.c
# Cuda 

#### commands
""""
To see the gpu stats
nvidia-smi 
# To see al the cpu stats


"""

'"\nTo see the gpu stats\nnvidia-smi \n# To see al the cpu stats\n\n\n'

In [6]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))
print("Supports BF16:", torch.cuda.is_bf16_supported())
print("Supports TF32:", torch.backends.cuda.matmul.allow_tf32)
print("Supports FP16:", torch.cuda.get_device_capability(0)[0] >= 7)

if torch.cuda.is_available():
    gpu_id = 0  # Change if you have multiple GPUs
    total_memory = torch.cuda.get_device_properties(gpu_id).total_memory
    print(f"Total GPU memory: {total_memory / (1024**3):.2f} GB")
    # print(f"Allocated memory: {torch.cuda.memory_allocated(gpu_id) / (1024**2):.2f} MB")
    # print(f"Reserved memory : {torch.cuda.memory_reserved(gpu_id) / (1024**2):.2f} MB")
else:
    print("CUDA not available")

CUDA available: True
GPU name: NVIDIA GeForce GTX 1650
Supports BF16: True
Supports TF32: False
Supports FP16: True
Total GPU memory: 4.00 GB


In [7]:
# Calculation of Time and money needed to run the model
# Calculate the number of parmameters that model contains


In [8]:
# import torch
# how many neurons and number of values needed to represent that neuron
# GPT-> Token embedding, position embedding, Module List, lmhead
# Module List -> n_layer number of blocks
# Block -> Casual self attention + Feed forward network
# FeedForward network -> 2 linear model with gelu in between
# Self attention block -> casual attention + projection
vocab_size = 50304 
block_size = 1024
n_embd = 768
n_layer = 12
n_head = 12
batch_size = 12



In [10]:
# calculate in a single loop run how much memory is required
# model_size = (total_parameters * 2) / (2 ** 20)
# Size of the model if it is stored in FP16
# print(f"{model_size:.2f} MB")

In [13]:
# input_size = (batch_size * block_size * n_embd) * 2
# for activation input size becomes
# input_size = input_size * 3
# model_weight_size = total_parameters * 2
# optimizer (AdamW) stores 3 copies(weights + momentum + velocity)
# optimizer_size = model_weight_size * 3

# total_memory = input_size + model_weight_size + optimizer_size
# print(f"total size: {total_memory//(2**20):.2f}")

In [27]:
# model config
vocab_size = 50257
block_size = 1024
n_embd = 768
n_layer = 12
n_head = 12
batch_size = 12

In [28]:
# with Bias calculation
def params():
    out = dict()
    # print(out)
    qkv_proj_param = (n_embd * 3 * n_embd) + (3 * n_embd)
    out_proj_param = (n_embd * n_embd) + (n_embd)
    attn_param = qkv_proj_param + out_proj_param

    feed_ln1_param = (n_embd * 4 * n_embd) + (4 * n_embd)
    feed_ln2_param = (4 * n_embd * n_embd) + n_embd
    feed_param = feed_ln1_param + feed_ln2_param

    layer_norm_param = 2 * 2 * n_embd
    block_param =  layer_norm_param + attn_param + feed_param # a layer norm contains 2 param
    wte = vocab_size * n_embd # token embeddings
    wpe = block_size * n_embd # position embeddings
    # decay_param = (3 * n_embd) + (n_embd) + (4 * n_embd) + n_embd + 4 * n_embd
    non_decay_param = n_layer * (13 * n_embd)
    total_parameters = (wte + wpe + 2 + n_layer * block_param)
    print(f"wte: {wte}, wpe: {wpe}")
    print(f"total parameters: {total_parameters}")
    print(f"non decay parameters: {non_decay_param}")
    print(f"non decay parameters: {non_decay_param//n_layer}")
    print(f"decay parameters: {total_parameters - non_decay_param}")
    # find out decayed and non decay params of the model
    return total_parameters

total_parameters = params()

wte: 38597376, wpe: 786432
total parameters: 124438274
non decay parameters: 119808
non decay parameters: 9984
decay parameters: 124318466


In [29]:
# suppose model saved in FP32 - 4 bytes
param_bytes = total_parameters * 4
params_buffer_bytes = param_bytes + 2 * param_bytes
print(f"checkpoint size: {params_buffer_bytes/1e9:.2f} GB")
# param_bytes

checkpoint size: 1.49 GB


In [30]:
# my laptop gpu memory
total_gpu_memory = torch.cuda.get_device_properties(gpu_id).total_memory
print(f"Total GPU memory: {total_memory / (1024**3):.2f} GB")
# 4 GB

print(f"memory taken up just for parameters: {params_buffer_bytes / total_gpu_memory * 100:.2f}")



Total GPU memory: 4.00 GB
memory taken up just for parameters: 34.77


In [31]:
# estimating flops for single forward pass
def flops():
    out = {}
    head_size = n_embd // n_head

    # attention block
    # 1 projections of input token
    # B, T, C
    # (T, C) (c, 3*c) -> 2 * T * C  3*C
    kqv = 2 * block_size * (n_embd * 3 * n_embd)
    # 2 attention socre calculation    
    # (B, T, C), (B, C, T)= (B, nh, T, ns) @ (B, nh, ns, T) -> (B, nh, T, T) 
    attn_sc = 2 * block_size * block_size * n_embd
    # value calculation
    val = 2 * n_head * (block_size * block_size * head_size)
    proj = 2 * block_size * (n_embd * n_embd)
    attn_flops = kqv + attn_sc + val + proj

    # MLP block / Feed forward block
    # ignoring non linear and dropout because they have very less impact
    ffw_size = 4 * n_embd
    ffw1 = 2 * block_size * (n_embd * ffw_size)
    ffw2 = 2 * block_size * (ffw_size * n_embd)
    ffw_flops = ffw1 + ffw2

    # transformer block and other operatoins
    block = attn_flops + ffw_flops
    transformer = n_layer * block
    dense = 2 * block_size * (n_embd * vocab_size)

    # forward, backward total
    forward_total = transformer + dense
    backward_total = 2 * forward_total
    total = forward_total + backward_total

    return (forward_total, backward_total, total)
f = flops()
print(f)

(291648307200, 583296614400, 874944921600)


In [37]:
# calculating flops using PaLM paper
N = total_parameters - (786432) # positional token
L, H, Q, T = n_layer, n_head, n_embd//n_head, block_size
mf_per_token = 6*N + 12*L*H*Q*T
mf = mf_per_token * block_size
print(mf)
print(f"ratio: {mf/f[2]}")

875681034240
ratio: 1.0008413245472114


In [40]:
# My GPU rate performance fp16: 6 TFLOps
# fp32: 3 TFlops
batch_size = 20 * 5 # 100 batch size
measured_time = 0.755 # seconds per iteration
measured_throughput = batch_size / measured_time
flops_achieved = f[2] * measured_throughput


nvidia_1650_rtx_flops = 3e12

print(f"fraction of my pc gpu used: {flops_achieved / nvidia_1650_rtx_flops * 100:.2f}")

fraction of my pc gpu used: 3862.89


In [None]:
model_size = total_parameters
token_num = 300e9 # 300B token
nvidia_1650 = 6e12
assumed_mfu = 1
flops_throughput = nvidia_1650 * 1 * assumed_mfu
flops_needed = 6 * model_size * token_num
time_needed_s = flops_needed / flops_throughput
print(f"time needed to train the model: {time_needed_s/3600/24:.2f} days")

time needed to train the model: 432.08 days


In [47]:
# how to calculate gpu flops

import torch
import torch.nn as nn
from torch.profiler import profile, record_function, ProfilerActivity

class MySimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(64 * 32 * 32, 10) # Assuming 32x32 input after conv

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        x = x.view(x.size(0), -1) # Flatten
        x = self.fc(x)
        return x

model = MySimpleModel().cuda()
# Dummy input: batch_size=1, 3 channels, 32x32 image
inputs = torch.randn(1, 3, 32, 32).cuda()

with profile(activities=[
    ProfilerActivity.CPU, ProfilerActivity.CUDA], with_flops=True) as prof:
    model(inputs)

# Print a summary table sorted by FLOPs
print(prof.key_averages().table(sort_by="flops", row_limit=10))

# To get the total FLOPs, you might need to sum them up or look at the prof.total_average() if available for flops.
# Note: ReLU and other element-wise operations might be counted as 0 FLOPs by some profilers,
# or a very small constant per element.

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         0.62%       2.706ms        97.74%     426.997ms     426.997ms             1         3.539  
                                            aten::addmm         1.33%       5.825ms         1.37%       6.000ms       6.000ms             1         1.311  
                                      aten::convolution         0.32%       1.377ms        97.12%     424.291ms     424.291ms             1            --  
                                     aten::_convolution         