In [1]:
!nvidia-smi

Tue Jul 30 14:16:04 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0  On |                  Off |
| 30%   44C    P5              33W / 450W |    430MiB / 24564MiB |     24%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import math
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader
from dataset import TextFileDataset
from gpt import GPT2
from transformers import pipeline, set_seed



In [3]:
hf_model = GPT2LMHeadModel.from_pretrained('gpt2', resume_download=None).cuda()
hf_model.eval()

gpt2_model = GPT2.from_pretrained().cuda()
gpt2_model.eval()

print("Loaded models")


Loaded models


## Sanity Checks

In [4]:
# for every power of 2 token size up to 1024, let's compare the output of the two models

def compare_models(hf_model, gpt2_model, tokenizer, max_token_size=1024):
    for token_size in [2 ** i for i in range(int(math.log2(max_token_size)) + 1)]:
        input_ids = torch.randint(0, tokenizer.vocab_size, (1, token_size)).cuda()
        print(f"Token size: {token_size}, input_ids size {input_ids.size()}")
        with torch.no_grad():
            hf_output = hf_model(input_ids)
            gpt2_output = gpt2_model(input_ids)
        assert torch.allclose(hf_output.logits, gpt2_output, atol=1e-4)

In [5]:
compare_models(hf_model, gpt2_model, GPT2Tokenizer.from_pretrained('gpt2'))

Token size: 1, input_ids size torch.Size([1, 1])




Token size: 2, input_ids size torch.Size([1, 2])
Token size: 4, input_ids size torch.Size([1, 4])
Token size: 8, input_ids size torch.Size([1, 8])
Token size: 16, input_ids size torch.Size([1, 16])
Token size: 32, input_ids size torch.Size([1, 32])
Token size: 64, input_ids size torch.Size([1, 64])
Token size: 128, input_ids size torch.Size([1, 128])
Token size: 256, input_ids size torch.Size([1, 256])
Token size: 512, input_ids size torch.Size([1, 512])
Token size: 1024, input_ids size torch.Size([1, 1024])


## torch.compile

In [6]:
gpt2_model = gpt2_model.cuda()
gpt2_model_compiled = torch.compile(gpt2_model.cuda(), mode="max-autotune")
gpt2_model_compiled

OptimizedModule(
  (_orig_mod): GPT2(
    (token_embedding): Embedding(50257, 768)
    (positional_embedding): Embedding(1024, 768)
    (embedding_dropout): Dropout(p=0.1, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPT2Layer(
        (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attention): CausalMultiHeadAttention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (out): Linear(in_features=768, out_features=768, bias=True)
          (residual_dropout): Dropout(p=0.1, inplace=False)
        )
        (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (activation): GELU(approximate='tanh')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_

## Timeit

In [7]:
torch.cuda.empty_cache()

In [8]:
from dataset import TextFileDataset
dataset = TextFileDataset("data/1984.txt", sequence_length=1024)
dl = iter(DataLoader(dataset, batch_size=8, shuffle=False))

Token indices sequence length is longer than the specified maximum sequence length for this model (141170 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
input_tokens, next_token = next(dl)

In [10]:
input_tokens.size()

torch.Size([8, 1024])

In [14]:
for i in range(100):
    time_start = time.time()
    input_tokens, next_token = next(dl)
    _ = gpt2_model_compiled(input_tokens.cuda())
    tokens_processed = input_tokens.size(1) * input_tokens.size(0) / (time.time() - time_start)
    print(f"Tokens processed per second: {tokens_processed:.0f}")


Tokens processed per second: 35621
Tokens processed per second: 27847
Tokens processed per second: 36239
Tokens processed per second: 33874
Tokens processed per second: 32038
Tokens processed per second: 30608
Tokens processed per second: 28986
Tokens processed per second: 27800
Tokens processed per second: 26262
Tokens processed per second: 25368
Tokens processed per second: 24330
Tokens processed per second: 23415
Tokens processed per second: 22534
Tokens processed per second: 21721
Tokens processed per second: 20923
Tokens processed per second: 20150
Tokens processed per second: 19465
Tokens processed per second: 18924
Tokens processed per second: 18316
Tokens processed per second: 17814
Tokens processed per second: 17088
Tokens processed per second: 16792
Tokens processed per second: 15795
Tokens processed per second: 15798
Tokens processed per second: 15237
Tokens processed per second: 14894
Tokens processed per second: 14581
Tokens processed per second: 14279
Tokens processed per

KeyboardInterrupt: 

## Pytorch Profiler

In [None]:
# generate sample inputs for 1024 tokens and batch size 1
input_ids_list = [torch.randint(0, 50256, (16, 1024)).cuda() for _ in range(10)]

In [None]:
def get_profiler() -> torch.profiler.profile:
    return torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],

        # In this example with wait=1, warmup=1, active=2, repeat=1,
        # profiler will skip the first step/iteration,
        # start warming up on the second, record
        # the third and the forth iterations,
        # after which the trace will become available
        # and on_trace_ready (when set) is called;
        # the cycle repeats starting with the next step

        schedule=torch.profiler.schedule(
            wait=1,
            warmup=1,
            active=3,
            repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler("./performance_trace"),
        # record_shapes=True,
        profile_memory=True,
        with_stack=True
        # used when outputting for tensorboard
    )

### Profile Eager Model

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="./performance_trace", flush_secs=30)

prof = get_profiler()
prof.start()
for input_ids in input_ids_list:
    with torch.no_grad():
        _ = gpt2_model(input_ids)
        prof.step()
prof.stop()
writer.flush()

### Profile the compiled model

In [None]:
writer = SummaryWriter(log_dir="./performance_trace", flush_secs=30)

prof = get_profiler()
prof.start()
for input_ids in input_ids_list:
    with torch.no_grad():
        _ = gpt2_model_compiled(input_ids)
        prof.step()
prof.stop()
writer.flush()