In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from pprint import pprint
import time
import gc

In [None]:
# Hugging Face checkpoint identifier; later cells read the resulting global `model`.
model_base = "gpt2-xl"
# Instantiate the GPT-2 language-model head class from that checkpoint via `from_pretrained`.
model = GPT2LMHeadModel.from_pretrained(model_base)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class DummyContext:
    """Context manager that does nothing on entry or exit.

    Used as a stand-in where a ``with`` block is required but no behaviour
    (such as disabling autograd) is wanted.
    """

    def __enter__(self):
        # Nothing is bound by ``with DummyContext() as x``.
        return None

    def __exit__(self, exc_type, exc_val, exc_tb):
        # A falsy return lets any exception raised in the block propagate.
        return None

def get_memory_usage(device):
    """Return ``(allocated, reserved)`` CUDA memory readings for ``device``.

    The two values come from ``torch.cuda.memory_allocated`` and
    ``torch.cuda.memory_reserved`` respectively, queried in that order.
    """
    return (
        torch.cuda.memory_allocated(device),
        torch.cuda.memory_reserved(device),
    )

def memory_footprint_gpu(model, batch_size, data_type=torch.float32, device=torch.device('cuda'), mode="train"):
    """Estimate the GPU memory footprint of ``model`` for a given batch size.

    The estimate has two parts:

    * parameter memory: number of parameters times the element size of
      ``data_type`` (the dtype the caller intends to use, not necessarily the
      dtype the model currently holds);
    * activation memory: the growth in allocated CUDA memory across one
      forward pass on a single short sample sentence, multiplied by
      ``batch_size`` (i.e. a linear extrapolation from batch size 1).

    Args:
        model: the model to measure; it is moved to ``device`` and left there.
        batch_size (int): batch size to extrapolate the activation memory to.
        data_type (torch.dtype): dtype used to size the parameters.
        device (torch.device): CUDA device on which to run the measurement.
        mode (str): ``"train"`` runs the forward pass with autograd enabled so
            tensors saved for backward are counted; ``"inference"`` runs it
            under ``torch.no_grad()``.

    Returns:
        tuple: ``(num_parameters, total_memory, infos)`` where ``total_memory``
        is in bytes and ``infos`` holds the ``"parameter_memory"`` and
        ``"activation_memory"`` components.

    Raises:
        AssertionError: if ``mode`` is not ``"train"`` or ``"inference"``.
    """
    assert mode in ("train", "inference")
    model.to(device)
    # Calculate the size of the model parameters
    num_parameters = sum(p.numel() for p in model.parameters())
    dtype_size = torch.tensor(1, dtype=data_type).element_size()
    parameter_memory = num_parameters * dtype_size

    # Perform a forward pass to estimate activations size
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    input_text = "This is a sample text."
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    print(f"number of tokens: {input_ids.shape[1]}")

    # Only inference disables autograd; previously no_grad() was applied
    # unconditionally, which made both modes report the same number.
    context = torch.no_grad() if mode == "inference" else DummyContext()
    with context:
        # The probe batch has size 1; the result is scaled by batch_size below.
        input_ids_batch = input_ids

        # Drop unreachable Python objects first so the allocator can actually
        # release their blocks when the cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()

        # Monitor memory usage before and after running the model
        torch.cuda.synchronize()
        mem_before, _ = get_memory_usage(device)
        # Keep a reference to the outputs: if they were discarded immediately,
        # the outputs (and, in train mode, the autograd graph they hold) would
        # already be freed when the second reading is taken.
        outputs = model(input_ids_batch)
        torch.cuda.synchronize()
        mem_after, _ = get_memory_usage(device)

        print("mem_before")
        print(mem_before)
        print("mem_Afeter")
        print(mem_after)

        # Calculate the size of activations
        activations_memory = (mem_after - mem_before) * batch_size

        # The measurement is done; release the retained outputs.
        del outputs

    # Calculate the total memory footprint
    total_memory = parameter_memory + activations_memory
    infos = {"parameter_memory": parameter_memory, "activation_memory": activations_memory}
    return num_parameters, total_memory, infos

In [None]:
# Settings shared by both measurements below.
batch_size = 128
dtype = torch.float32

# Training-mode estimate (autograd enabled during the probe forward pass).
print("[Mode: Train]")
nparams, mem_ftp, infos = memory_footprint_gpu(model, batch_size, data_type=dtype, mode="train")
# The label previously said "inference" here even though this is the train-mode run.
print(f"Num parameters: {nparams / 1e9:.2f}B, Mem footprint for training with bsize = {batch_size}: { mem_ftp / 2**30 :.2f} GB")

# Inference-mode estimate; `infos` keeps the breakdown of this last run.
print("[Mode: Inference]")
nparams, mem_ftp, infos = memory_footprint_gpu(model, batch_size, data_type=dtype, mode="inference")
print(f"Num parameters: {nparams / 1e9:.2f}B, Mem footprint for inference with bsize = {batch_size}: { mem_ftp / 2**30 :.2f} GB")

In [None]:
# Pretty-print the parameter/activation breakdown held in `infos`
# (the inference-mode run when the cells are executed in order).
pprint(infos)

In [None]:
# Move the model back to the CPU before the standalone GPU experiments below.
model.to("cpu")

In [None]:
# Experiment: compare reserved CUDA memory before and after deleting a large tensor.
# NOTE(review): hard-codes the second GPU ("cuda:1"); this needs at least two CUDA devices.
device = torch.device("cuda:1")
# 1024**3 elements: created as float32 on the CPU (4 GiB), cast to float16 (2 GiB), then copied to the GPU.
tensor = torch.randn((1024, 1024, 1024)).to(torch.float16).to(device)
# print(torch.cuda.memory_allocated(device))
# Reserved memory while the tensor is alive.
print(f"cache: {torch.cuda.memory_reserved(device)}")
del tensor
gc.collect()
# empty_cache() is deliberately left disabled, so the second reading shows what the
# allocator still reserves after the tensor is gone (presumably unchanged; compare the two prints).
# torch.cuda.empty_cache()
print(torch.cuda.memory_reserved(device))

In [None]:
# Experiment: measure how much reserved CUDA memory a forward pass through a small MLP adds.
# NOTE(review): hard-codes the second GPU ("cuda:1"); this needs at least two CUDA devices.
device = torch.device("cuda:1")
# Two-layer MLP (1024 -> 1024 -> 1) converted to float16 and moved to the GPU.
layer = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 1)
).to(torch.float16).to(device)
# Input of shape (1024, 1024, 1024): 2 GiB in float16 once on the GPU.
tensor = torch.randn((1024, 1024, 1024)).to(torch.float16).to(device)
torch.cuda.synchronize()  # Force synchronization
# get_memory_usage returns (allocated, reserved); only the reserved reading is kept here.
_, mem_before = get_memory_usage(device)
# Autograd is enabled, and `loss` keeps the result (and whatever it references) alive for the second reading.
loss = layer(tensor)
torch.cuda.synchronize()  # Force synchronization
_, mem_after = get_memory_usage(device)
print(mem_before)
print(mem_after)

# Growth in reserved memory across the forward pass, in bytes.
print(f"mem_usage: {mem_after - mem_before}")

In [None]:
# Drop the notebook-level references to the experiment's input tensor and MLP.
del tensor, layer

In [None]:
import torch
from torch import nn

def estimate_memory_training(model, sample_input, optimizer_type=torch.optim.Adam, batch_size=1, use_amp=False, device=0):
    """Predict the maximum memory usage (in bytes) of training ``model``.

    The estimate is the sum of four terms:

    * model memory: growth in allocated CUDA memory when the model is moved
      from the CPU to ``device``;
    * forward-pass memory: growth in allocated CUDA memory across one forward
      pass on a batch of ``batch_size`` copies of ``sample_input`` (halved
      when ``use_amp`` is true);
    * gradient memory: assumed equal to the model memory;
    * optimizer-moment memory: gradient memory times the number of moment
      buffers the optimizer keeps per parameter.

    The model is left on ``device`` when the function returns.

    Args:
        model (nn.Module): the neural network model
        sample_input (torch.Tensor): A sample input to the network. It should be
            a single item, not a batch, and it will be replicated batch_size
            times along a new leading dimension. Any number of dimensions is
            accepted.
        optimizer_type (Type): the class name of the optimizer to instantiate.
            Adam, RMSprop, SGD and Adagrad are recognised.
        batch_size (int): the batch size
        use_amp (bool): whether to estimate based on using mixed precision
        device (torch.device or int): the CUDA device to use

    Returns:
        The estimated number of bytes (a float when ``use_amp`` is true).

    Raises:
        ValueError: if the optimizer is not one of the recognised types.
    """
    # Reset model and optimizer
    model.cpu()
    optimizer = optimizer_type(model.parameters(), lr=.001)

    # Resolve the per-parameter moment count first so an unsupported optimizer
    # fails before any GPU work is done.
    if isinstance(optimizer, torch.optim.Adam):
        moments_per_param = 2
    elif isinstance(optimizer, torch.optim.RMSprop):
        moments_per_param = 1
    elif isinstance(optimizer, torch.optim.SGD):
        moments_per_param = 0
    elif isinstance(optimizer, torch.optim.Adagrad):
        moments_per_param = 1
    else:
        raise ValueError("Unsupported optimizer. Look up how many moments are " +
            "stored by your optimizer and add a case to the optimizer checker.")

    # Weights: allocation delta caused by moving the model onto the device.
    mem_start = torch.cuda.memory_allocated(device)
    model.to(device)
    mem_with_model = torch.cuda.memory_allocated(device)
    model_memory = mem_with_model - mem_start

    # Build the batch by tiling the sample along a new leading axis; the
    # trailing 1s keep every original dimension unchanged, whatever the rank.
    model_input = sample_input.unsqueeze(0).repeat(batch_size, *([1] * sample_input.dim()))
    # Holding `output` keeps the autograd graph alive for the next reading.
    output = model(model_input.to(device)).sum()
    mem_after_forward = torch.cuda.memory_allocated(device)

    # Heuristic: mixed precision is assumed to halve the forward-pass memory.
    amp_multiplier = .5 if use_amp else 1
    forward_pass_memory = (mem_after_forward - mem_with_model) * amp_multiplier

    # One gradient per parameter, so gradients take as much room as the weights.
    gradient_memory = model_memory
    gradient_moment_memory = moments_per_param * gradient_memory
    total_memory = model_memory + forward_pass_memory + gradient_memory + gradient_moment_memory

    return total_memory

def estimate_memory_inference(model, sample_input, batch_size=1, use_amp=False, device=0):
    """Predict the memory usage (in bytes) of running ``model`` for inference.

    The returned value is the growth in allocated CUDA memory when the model
    is moved from the CPU to ``device``, i.e. the memory taken by the model
    itself. One forward pass on a batch of ``batch_size`` copies of
    ``sample_input`` is also executed, with autograd disabled, but its
    allocations are not added to the estimate.

    The model is left on ``device`` when the function returns.

    Args:
        model (nn.Module): the neural network model
        sample_input (torch.Tensor): A sample input to the network. It should be
            a single item, not a batch, and it will be replicated batch_size
            times along a new leading dimension. Any number of dimensions is
            accepted.
        batch_size (int): the batch size
        use_amp (bool): accepted for signature parity with the training
            estimator; it does not affect the result.
        device (torch.device or int): the CUDA device to use

    Returns:
        int: the estimated number of bytes.
    """
    # Reset model
    model.cpu()
    mem_start = torch.cuda.memory_allocated(device)
    model.to(device)
    mem_with_model = torch.cuda.memory_allocated(device)
    model_memory = mem_with_model - mem_start

    # Tile the sample along a new leading axis; the trailing 1s keep every
    # original dimension unchanged, whatever the rank of the sample.
    model_input = sample_input.unsqueeze(0).repeat(batch_size, *([1] * sample_input.dim()))
    # Inference never needs gradients, so do not build an autograd graph.
    with torch.no_grad():
        model(model_input.to(device)).sum()

    total_memory = model_memory

    return total_memory

def test_memory_training(in_size=100, out_size=10, hidden_size=100, optimizer_type=torch.optim.Adam, batch_size=1, use_amp=False, device=0):
    """Compare the training-memory estimate with readings from real training steps.

    Builds a stack of 202 ``nn.Linear`` layers, prints the value returned by
    ``estimate_memory_training``, then runs three forward/backward/optimizer
    iterations and prints ``torch.cuda.memory_allocated`` after each stage.

    Args:
        in_size (int): input feature size.
        out_size (int): output feature size.
        hidden_size (int): width of the hidden layers.
        optimizer_type (Type): optimizer class to instantiate.
        batch_size (int): number of rows in the random input batch.
        use_amp (bool): whether the forward pass runs under CUDA autocast.
        device (torch.device or int): the CUDA device to use.
    """
    sample_input = torch.randn(batch_size, in_size, dtype=torch.float32)
    model = nn.Sequential(nn.Linear(in_size, hidden_size),
                        *[nn.Linear(hidden_size, hidden_size) for _ in range(200)],
                        nn.Linear(hidden_size, out_size))
    # Forward `device` so the estimate is taken on the device that is measured below.
    max_mem_est = estimate_memory_training(model, sample_input[0], optimizer_type=optimizer_type, batch_size=batch_size, use_amp=use_amp, device=device)
    print("Maximum Memory Estimate", max_mem_est)
    # The estimator leaves the model on `device`; move it back so the two
    # readings below really bracket the cost of loading the weights.
    model.cpu()
    print("Beginning mem:", torch.cuda.memory_allocated(device), "Note - this may be higher than 0, which is due to PyTorch caching. Don't worry too much about this number")
    model.to(device)
    print("After model to device:", torch.cuda.memory_allocated(device))
    # Build the optimizer once the parameters are on their final device.
    optimizer = optimizer_type(model.parameters(), lr=.001)
    for i in range(3):
        optimizer.zero_grad()
        print("Iteration", i)
        with torch.cuda.amp.autocast(enabled=use_amp):
            a = torch.cuda.memory_allocated(device)
            out = model(sample_input.to(device)).sum() # Taking the sum here just to get a scalar output
            b = torch.cuda.memory_allocated(device)
        print("1 - After forward pass", torch.cuda.memory_allocated(device))
        print("2 - Memory consumed by forward pass", b - a)
        out.backward()
        print("3 - After backward pass", torch.cuda.memory_allocated(device))
        optimizer.step()
        print("4 - After optimizer step", torch.cuda.memory_allocated(device))

def test_memory_inference(in_size=100, out_size=10, hidden_size=100, batch_size=1, use_amp=False, device=0):
    """Compare the inference-memory estimate with readings from real forward passes.

    Builds a stack of 202 ``nn.Linear`` layers, prints the value returned by
    ``estimate_memory_inference``, then runs three forward passes under
    ``torch.no_grad()`` and prints ``torch.cuda.memory_allocated`` around each.

    Args:
        in_size (int): input feature size.
        out_size (int): output feature size.
        hidden_size (int): width of the hidden layers.
        batch_size (int): number of rows in the random input batch.
        use_amp (bool): whether the forward pass runs under CUDA autocast.
        device (torch.device or int): the CUDA device to use.
    """
    sample_input = torch.randn(batch_size, in_size, dtype=torch.float32)
    model = nn.Sequential(nn.Linear(in_size, hidden_size),
                        *[nn.Linear(hidden_size, hidden_size) for _ in range(200)],
                        nn.Linear(hidden_size, out_size))
    # Forward `device` so the estimate is taken on the device that is measured below.
    max_mem_est = estimate_memory_inference(model, sample_input[0], batch_size=batch_size, use_amp=use_amp, device=device)
    print("Maximum Memory Estimate", max_mem_est)
    # The estimator leaves the model on `device`; move it back so the two
    # readings below really bracket the cost of loading the weights.
    model.cpu()
    print("Beginning mem:", torch.cuda.memory_allocated(device), "Note - this may be higher than 0, which is due to PyTorch caching. Don't worry too much about this number")
    model.to(device)
    print("After model to device:", torch.cuda.memory_allocated(device))
    with torch.no_grad():
        for i in range(3):
            print("Iteration", i)
            with torch.cuda.amp.autocast(enabled=use_amp):
                a = torch.cuda.memory_allocated(device)
                out = model(sample_input.to(device)).sum() # Taking the sum here just to get a scalar output
                b = torch.cuda.memory_allocated(device)
            print("1 - After forward pass", torch.cuda.memory_allocated(device))
            print("2 - Memory consumed by forward pass", b - a)

In [None]:
# Run the inference-memory comparison with a batch of 64 random inputs (other arguments at their defaults).
test_memory_inference(batch_size=64)
