In [1]:
import math
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from gpt import GPT2
from transformers import pipeline, set_seed

In [2]:
# Reference model: the Hugging Face 'gpt2' checkpoint, moved to the GPU and
# put into eval mode.
hf_model = GPT2LMHeadModel.from_pretrained('gpt2', resume_download=None)
hf_model = hf_model.cuda()
hf_model.eval()

# Model under test: the project-local GPT2 class (imported from `gpt`),
# prepared the same way so the two can be compared like for like.
gpt2_model = GPT2.from_pretrained()
gpt2_model = gpt2_model.cuda()
gpt2_model.eval()

print("Loaded models")


Loaded models


## Sanity Checks

In [3]:
# for every power of 2 token size up to 1024, let's compare the output of the two models

def compare_models(hf_model, gpt2_model, tokenizer, max_token_size=1024, atol=1e-4):
    """Check that two GPT-2 implementations produce matching logits.

    For every power-of-two sequence length from 1 up to ``max_token_size``
    a random batch holding one sequence of token ids drawn from
    ``[0, tokenizer.vocab_size)`` is fed through both models under
    ``torch.no_grad()`` and the results are compared with ``torch.allclose``.

    Args:
        hf_model: Model whose call result exposes a ``.logits`` tensor.
        gpt2_model: Model whose call result is compared directly against
            ``hf_model(...).logits``.
        tokenizer: Object exposing ``vocab_size``.
        max_token_size: Largest sequence length to test; must be >= 1.
        atol: Absolute tolerance handed to ``torch.allclose``.

    Raises:
        ValueError: If ``max_token_size`` is smaller than 1.
        AssertionError: If the two outputs differ beyond the tolerance for
            any tested sequence length.
    """
    if max_token_size < 1:
        raise ValueError(f"max_token_size must be >= 1, got {max_token_size}")

    # Create the inputs on the device the reference model already lives on
    # instead of hard-coding CUDA; fall back to CPU for parameter-free models.
    first_param = next(hf_model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Doubling an integer keeps the sequence lengths exact (no float log2).
    token_size = 1
    while token_size <= max_token_size:
        input_ids = torch.randint(
            0, tokenizer.vocab_size, (1, token_size), device=device
        )
        print(f"Token size: {token_size}, input_ids size {input_ids.size()}")
        with torch.no_grad():
            hf_logits = hf_model(input_ids).logits
            gpt2_logits = gpt2_model(input_ids)

        # Explicit raise: survives `python -O` and reports how far off we are.
        if not torch.allclose(hf_logits, gpt2_logits, atol=atol):
            max_abs_diff = (hf_logits - gpt2_logits).abs().max().item()
            raise AssertionError(
                f"Model outputs differ at token size {token_size}: "
                f"max abs diff {max_abs_diff:.3e} exceeds atol={atol}"
            )
        token_size *= 2

In [4]:
# Run the logits comparison for every power-of-two sequence length up to the
# function's default maximum, using the 'gpt2' tokenizer for the vocab size.
compare_models(
    hf_model=hf_model,
    gpt2_model=gpt2_model,
    tokenizer=GPT2Tokenizer.from_pretrained('gpt2'),
)



Token size: 1, input_ids size torch.Size([1, 1])
Token size: 2, input_ids size torch.Size([1, 2])
Token size: 4, input_ids size torch.Size([1, 4])
Token size: 8, input_ids size torch.Size([1, 8])
Token size: 16, input_ids size torch.Size([1, 16])
Token size: 32, input_ids size torch.Size([1, 32])
Token size: 64, input_ids size torch.Size([1, 64])
Token size: 128, input_ids size torch.Size([1, 128])
Token size: 256, input_ids size torch.Size([1, 256])
Token size: 512, input_ids size torch.Size([1, 512])
Token size: 1024, input_ids size torch.Size([1, 1024])


## PyTorch Profiler

In [5]:
# Random token-id batches used as profiler inputs: each batch holds one
# sequence of 1024 tokens, and one batch is consumed per profiled iteration.
VOCAB_SIZE = 50257  # GPT-2 vocabulary size; torch.randint's upper bound is exclusive
BATCH_SIZE = 1
SEQ_LEN = 1024
NUM_PROFILE_BATCHES = 10

# Sample directly on the GPU instead of creating on the host and copying.
input_ids_list = [
    torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN), device="cuda")
    for _ in range(NUM_PROFILE_BATCHES)
]

In [6]:
# `gpt2_model` was already moved with `.cuda()` when it was loaded above, so
# this reassignment is expected to leave it where it is.
# NOTE(review): presumably kept so the profiling section can be re-run on its
# own; confirm, or drop the cell if it is not needed.
gpt2_model = gpt2_model.cuda()

In [7]:
from torch.profiler import profile, record_function, ProfilerActivity
from torch.utils.tensorboard import SummaryWriter

# Single source of truth for where the writer and the trace handler write.
TRACE_DIR = "./performance_trace"

# NOTE(review): nothing in this cell logs through `writer`; it is only created
# and flushed. It is kept in case later cells reference it.
writer = SummaryWriter(log_dir=TRACE_DIR, flush_secs=30)

# With wait=1, warmup=1, active=3, repeat=1 the profiler skips the first
# step, warms up on the second and records the third to fifth. After the last
# active step the trace becomes available and on_trace_ready is called.
# repeat=1 limits this to a single cycle, so any further steps are not
# recorded.
profiler_schedule = torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1)

# The context manager starts the profiler on entry and always stops it on
# exit, so a failing forward pass cannot leave the profiler running.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=profiler_schedule,
    # Writes each finished trace into TRACE_DIR in the format TensorBoard reads.
    on_trace_ready=torch.profiler.tensorboard_trace_handler(TRACE_DIR),
    # record_shapes=True,  # enable to also record operator input shapes
    profile_memory=True,
    with_stack=True,
) as prof:
    for input_ids in input_ids_list:
        with torch.no_grad():
            _ = gpt2_model(input_ids)
            # Tell the profiler that one iteration has finished so the
            # schedule can advance.
            prof.step()
writer.flush()

STAGE:2024-07-27 07:48:07 127521:127521 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-07-27 07:48:07 127521:127521 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-07-27 07:48:07 127521:127521 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
