In [None]:
import torch
import torch.nn as nn
import timeit
from typing import Iterable
from jaxtyping import Float

from einops import rearrange, einsum, reduce

In [None]:
# Peak throughput figures used for the back-of-the-envelope estimates below.

# A100 datasheet:
# https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
a100_flop_per_sec = 312e12  # 312 TFLOP/s

# H100 datasheet:
# https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet
# The quoted 1979 TFLOP/s (BF16 tensor core) is the with-sparsity figure,
# so it is halved here.
h100_sparse_flop_per_sec = 1979e12
h100_flop_per_sec = h100_sparse_flop_per_sec / 2

In [None]:
# Estimate how many days a training run takes on a cluster of H100s.
# NOTE(review): this reads like the 6 * parameters * tokens rule of thumb with
# 70e9 parameters, 15e12 tokens and 1024 GPUs — confirm these are the intended figures.
model_size = 70e9
token_count = 15e12
total_flops = 6 * model_size * token_count  # @inspect total_flops
assert h100_flop_per_sec == 1979e12 / 2
mfu = 0.5  # fraction of peak throughput assumed to be achieved
gpu_count = 1024
# per-GPU rate * utilization * GPUs * seconds in a day (60 * 60 * 24)
flops_per_day = h100_flop_per_sec * mfu * gpu_count * 60 * 60 * 24  # @inspect flops_per_day
days = total_flops / flops_per_day  # @inspect days
print(f"days: {days}")

In [None]:
# How many parameters fit in one H100's memory given a fixed per-parameter cost.
h100_bytes = 80e9
# NOTE(review): presumably 4 bytes each for the parameter and its gradient plus
# 4 + 4 bytes of optimizer state — confirm the intended breakdown.
per_param_byte_costs = (4, 4, 4 + 4)
bytes_per_parameter = sum(per_param_byte_costs)
num_params = h100_bytes / bytes_per_parameter
print(f"num_params: {num_params}")

In [None]:
def get_memory_usage(x: torch.Tensor) -> int:
    """Return the number of bytes taken up by the elements of ``x``.

    Args:
        x: Any tensor.

    Returns:
        ``x.numel() * x.element_size()``, i.e. element count times bytes per
        element. An empty tensor yields 0.
    """
    return x.numel() * x.element_size()

In [None]:
# A 4x8 tensor of zeros: show its dtype, element count and size in bytes.
x = torch.zeros(4, 8)
x.dtype, x.numel(), get_memory_usage(x)

In [None]:
# Uninitialized (12288 * 4) x 12288 tensor, i.e. 603,979,776 elements.
# torch.empty allocates without filling, so the contents are arbitrary.
a = torch.empty(12288 * 4, 12288)
a.element_size(), a.numel()

In [None]:
# Size of `a` in GB (bytes divided by 1e9).
get_memory_usage(a) / (1e9)

In [None]:
# Half-precision tensor: show how many bytes one float16 element takes.
x = torch.zeros(4, 8, dtype=torch.float16)
x.element_size()

In [None]:
# 1e-8 is smaller than float16's smallest subnormal (~6e-8), so it underflows to 0.
x = torch.tensor([1e-8], dtype=torch.float16)
x

In [None]:
# The same value in bfloat16, whose wider exponent range keeps 1e-8 non-zero.
x = torch.tensor([1e-8], dtype=torch.bfloat16)
x


In [None]:
# Numeric limits of bfloat16 as reported by torch.finfo.
torch.finfo(torch.bfloat16)

In [None]:
# Device that x currently lives on (x was created above without a device argument).
x.device

In [None]:
# Number of CUDA devices visible to this process.
num_gpus = torch.cuda.device_count()
num_gpus

In [None]:
# Collect the name of every visible CUDA device.
# A list keeps all devices instead of only the last one visited, and it is
# still defined (as an empty list) when num_gpus is 0, so later cells that
# read `properties` do not hit a NameError on a CPU-only machine.
properties = [torch.cuda.get_device_name(i) for i in range(num_gpus)]

In [None]:
# Show the device info gathered in the previous cell.
properties

In [None]:
# Baseline: bytes currently allocated on the GPU by tensors, before moving anything there.
memory_allocated = torch.cuda.memory_allocated()
memory_allocated

In [None]:
# Copy x onto the first GPU; this needs a CUDA device and raises without one.
y = x.to('cuda:0')
# y.device
# assert y.device == torch.device("cuda", 0)


In [None]:
# Bytes occupied by x's elements (numel * element_size).
get_memory_usage(x)

In [None]:
# GPU bytes allocated now, and the change since the baseline reading.
new_memory_usage = torch.cuda.memory_allocated()
memory_used = new_memory_usage - memory_allocated
memory_used

In [None]:
# Absolute GPU allocation recorded in the previous cell.
new_memory_usage

In [None]:
# Show the current value of x.
x

In [None]:
# Re-read the current GPU allocation in bytes.
torch.cuda.memory_allocated()

In [None]:
# Measure how much GPU memory two 32x32 tensors take.
memory_allocated = torch.cuda.memory_allocated()  # @inspect memory_allocated
x = torch.zeros(32, 32)  # created without a device argument
y = x.to("cuda:0")  # copy onto GPU 0
assert y.device == torch.device("cuda", 0)

z = torch.zeros(32, 32, device="cuda:0")  # created directly on GPU 0

new_memory_allocated = torch.cuda.memory_allocated()  # @inspect new_memory_allocated
memory_used = new_memory_allocated - memory_allocated  # @inspect memory_used
# NOTE(review): the check below is disabled. Rebinding y here drops the GPU
# tensor created in an earlier cell, which presumably skews the delta when the
# notebook is run top to bottom — confirm before re-enabling.
# assert memory_used == 2 * (32 * 32 * 4)  # 2 32x32 matrices of 4-byte floats


In [None]:
# GPU bytes attributed to the two tensors created in the previous cell.
memory_used

In [None]:
# Batched matmul: each trailing (16, 32) slice of x is multiplied by the
# (32, 2) matrix w, leaving the leading (4, 8) dimensions untouched.
x = torch.ones(4, 8, 16, 32)
w = torch.ones(32, 2)
y = torch.matmul(x, w)
y.shape

In [None]:
# MFU (model FLOPs utilization) = achieved FLOP/s / peak FLOP/s promised by the hardware

In [None]:
# Dot product of an input vector with a weight vector that tracks gradients.
x = torch.tensor([1., 2., 3.])
w = torch.tensor([1., 1., 1.], requires_grad=True)
pred_y = torch.matmul(x, w)
pred_y

In [None]:
# Draw a weight matrix and an input vector from a standard normal.
# w is sampled before x so the random stream is consumed in a fixed order.
input_dim, output_dim = 16384, 32
w = nn.Parameter(torch.randn(input_dim, output_dim))
x = nn.Parameter(torch.randn(input_dim))
(w.shape, x.shape)

In [None]:
# Vector-matrix product: (input_dim,) @ (input_dim, output_dim) -> (output_dim,).
output = x @ w
output.shape

In [None]:
# Each entry sums input_dim products of unit-variance values, so its typical
# magnitude is on the order of sqrt(input_dim).
output[0]

In [None]:
import numpy as np

# Rescale the weights by 1 / sqrt(input_dim) so each output entry has roughly
# unit variance instead of growing with input_dim.
w = nn.Parameter(torch.randn(input_dim, output_dim) / np.sqrt(input_dim))
output = x @ w  # @inspect output
output[0]

In [None]:
# Reset w to a small all-ones leaf tensor that tracks gradients.
w = torch.ones(3, requires_grad=True)

In [None]:
class SGD(torch.optim.Optimizer):
    """Minimal stochastic gradient descent: ``p <- p - lr * p.grad``.

    Args:
        params: Parameters (or parameter-group dicts) to optimize.
        lr: Learning rate used for every group that does not set its own.
    """

    def __init__(self, params: Iterable[nn.Parameter], lr: float = 0.01):
        super().__init__(params, dict(lr=lr))

    @torch.no_grad()
    def step(self, closure=None):
        """Apply one update to every parameter that has a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss. It is run with gradients enabled before the update.

        Returns:
            Whatever ``closure`` returned, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            # The update itself runs under no_grad; the closure needs autograd.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            for p in group['params']:
                # Parameters that took no part in the backward pass have no
                # gradient; leave them untouched instead of crashing.
                if p.grad is None:
                    continue
                p.sub_(p.grad, alpha=lr)  # in place: p = p - lr * grad
        return loss

In [None]:
# Build the optimizer over w with the default learning rate and inspect its parameter groups.
sgd = SGD([w])
sgd.param_groups