In [21]:
import torch
import torch.nn.functional as F
import gc

# General Memory Functions

In [22]:
def sizeof_fmt(num, suffix="B"):
    """Format a quantity with binary (1024-based) unit prefixes.

    The value is divided by 1024 until its magnitude is below 1024, then
    rendered with one decimal place followed by the matching prefix and
    ``suffix``. Values too large for the "Zi" prefix are reported in "Yi".

    Args:
        num: The quantity to format (e.g. a byte count). May be negative.
        suffix: Unit label appended after the prefix.

    Returns:
        A string such as ``"3.7GiB"``.
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return f"{num:3.1f}{prefixes[position]}{suffix}"
        num /= 1024.0
        position += 1
    # Every listed prefix was exhausted, so fall through to the largest one.
    return f"{num:.1f}Yi{suffix}"


def mem_size(t: torch.Tensor):
    """Return the size of a tensor's elements as a human-readable string.

    The size is computed as (bytes per element) x (number of elements) and
    formatted with ``sizeof_fmt``.

    Args:
        t: The tensor to measure.

    Returns:
        A formatted size string, e.g. ``"3.7GiB"``.
    """
    bytes_per_element = t.element_size()
    element_count = t.numel()
    return sizeof_fmt(bytes_per_element * element_count)

def mem_summary():
    """Print one line per visible CUDA device with its used memory.

    For each device index reported by ``torch.cuda.device_count()``, the
    free and total byte counts come from ``torch.cuda.mem_get_info``; "used"
    is their difference, printed both as a formatted size and as a
    percentage of the total.
    """
    device_total = torch.cuda.device_count()
    for index in range(device_total):
        free_bytes, capacity = torch.cuda.mem_get_info(index)
        in_use = capacity - free_bytes
        print(f"cuda:{index}: {sizeof_fmt(in_use)} ({(in_use / capacity)*100:0.2f}%)")


In [23]:
# Baseline: print per-device memory use before the allocation experiments below.
mem_summary()

cuda:0: 0.0B (0.00%)
cuda:1: 0.0B (0.00%)


# RAM Use

Resting RAM use: 2.02 GB (Jupyter, the OS, and other background processes)

In [24]:
# Allocate a 100000 x 10000 tensor (no device given); mem_size() reports
# it as 3.7GiB in the output below.
# Observed: process RAM jumps from the 2.02gb resting level to 5.76gb.
t1 = torch.randn((100000, 10000))
mem_size(t1)

'3.7GiB'

In [25]:
# Observed: RAM climbs to ~9.3gb while this cell runs, then settles back
# at 5.76gb once it finishes.
# Python evaluates the right-hand side (allocating the new tensor) before
# rebinding t1, so the previous tensor is still referenced during the
# allocation and both briefly coexist; the old one can only be released
# after t1 has been rebound.
t1 = torch.randn((100000, 10000))

In [26]:
# Rebinding t1 to None drops this name's reference to the tensor.
# Observed: RAM falls back to the 2.02gb resting level.
t1 = None

In [27]:
# Allocate the tensor again; observed RAM returns to 5.76gb.
t1 = torch.randn((100000, 10000))

In [28]:
# Release the old tensor *before* allocating the new one, so the two should
# never coexist. Observed: memory drops first and then rises again.
t1 = None
t1 = torch.randn((100000, 10000))

In [29]:
# A garbage-collection pass can also be forced explicitly after dropping
# the reference. gc.collect() returns the number of unreachable objects it
# found (0 in the recorded output below).
t1 = None
gc.collect()

0

# VRAM Use

Investigating VRAM usage on an AMD GPU, with ROCm 5.4.3.50403-121~20.04

In [30]:
# Run the VRAM experiments on the GPU with index 1. The "cuda" device type
# is used even though this is a ROCm build (see torch.__version__ at the end).
device = "cuda:1"

In [31]:
t1 = torch.randn((100000, 10000), device=device)
mem_summary()

# Observed: VRAM on cuda:1 goes to ~4gb (3.7GiB reported) for the single tensor.

cuda:0: 0.0B (0.00%)
cuda:1: 3.7GiB (23.31%)


In [32]:
t1 = torch.randn((100000, 10000), device=device)
mem_summary()

# Observed: VRAM jumps to ~7.5gb, i.e. roughly two tensors' worth.
# Likely cause (not verified here): the new tensor is allocated while the
# old one is still referenced by t1, and after the rebind the old block is
# kept by PyTorch's caching allocator rather than returned to the driver.

cuda:0: 0.0B (0.00%)
cuda:1: 7.5GiB (46.63%)


In [33]:
t1 = torch.randn((100000, 10000), device=device)
mem_summary()

# Observed: usage stays at 7.5gb instead of growing to three tensors' worth.
# Presumably the new tensor reuses the block released by the previous
# rebind (caching allocator) - consistent with the numbers, not verified here.

cuda:0: 0.0B (0.00%)
cuda:1: 7.5GiB (46.63%)


In [34]:
t1 = None
gc.collect()
mem_summary()
# Observed: dropping the reference and running the Python GC leaves the
# reported usage at 7.5GiB. mem_summary() reads device-level free/total
# memory via torch.cuda.mem_get_info, so memory still held by this process
# (presumably cached by PyTorch) continues to count as used.

cuda:0: 0.0B (0.00%)
cuda:1: 7.5GiB (46.63%)


In [35]:
t1 = None
torch.cuda.ipc_collect()
torch.cuda.empty_cache()

mem_summary()

# Observed: after ipc_collect() + empty_cache() the reported usage drops to
# 0.0B. (The later cells show that empty_cache() alone is sufficient.)

cuda:0: 0.0B (0.00%)
cuda:1: 0.0B (0.00%)


# Avoiding GPU memory leaks

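Assigning the old variable to `None` and running `gc.collect()` doesn't reduce the VRAM usage reported by `mem_summary()`. This is most likely not a true leak: PyTorch's caching allocator keeps freed blocks reserved so that later allocations can reuse them.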

But calling: 
```python
torch.cuda.ipc_collect()
torch.cuda.empty_cache()
```
does

Turns out you only need
```python
torch.cuda.empty_cache()
```

The following cells take a quick look at how this works.

In [36]:
# Allocate on the GPU and immediately rebind t1 to None.
# Observed: mem_summary() still reports 3.7GiB in use on cuda:1.
t1 = torch.randn((100000, 10000), device=device)
t1 = None
mem_summary()

cuda:0: 0.0B (0.00%)
cuda:1: 3.7GiB (23.31%)


In [37]:
torch.cuda.empty_cache()
mem_summary()
# Observed: empty_cache() on its own (no ipc_collect()) brings the reported
# usage back to 0.0B.

cuda:0: 0.0B (0.00%)
cuda:1: 0.0B (0.00%)


In [19]:
# Rebind t1 to a freshly allocated GPU tensor five times in a row, then
# drop the final reference.
for _attempt in range(5):
    t1 = torch.randn((100000, 10000), device=device)
t1 = None

# Usage as reported before the cache is cleared.
mem_summary()

# Clear PyTorch's cache and report again.
torch.cuda.empty_cache()
print("\nafter empty_cache:")
mem_summary()

cuda:0: 0.0B (0.00%)
cuda:1: 7.5GiB (46.63%)

after empty_cache:
cuda:0: 0.0B (0.00%)
cuda:1: 0.0B (0.00%)


In [20]:
# Try creating the tensor on the CPU first and then moving it to the GPU.
t1 = torch.randn((100000, 10000))
t1 = t1.to(device)

# NOTE(review): this label is misleading - empty_cache() has not been called
# yet in this cell; the reading below is taken right after the first
# .to(device). Consider relabeling it when the notebook is next re-run.
print("\nafter empty_cache:")
mem_summary()

# Repeat the CPU -> GPU transfer, rebinding t1 a second time.
# Observed: usage stays at 3.7GiB rather than doubling - presumably because
# rebinding t1 to the new CPU tensor released the GPU tensor before
# .to(device) allocated again (not verified here).
t1 = torch.randn((100000, 10000))
t1 = t1.to(device)

# Drop the reference and report usage before clearing the cache.
t1 = None
mem_summary()

torch.cuda.empty_cache()

print("\nafter empty_cache:")
mem_summary()


after empty_cache:
cuda:0: 0.0B (0.00%)
cuda:1: 3.7GiB (23.31%)
cuda:0: 0.0B (0.00%)
cuda:1: 3.7GiB (23.31%)

after empty_cache:
cuda:0: 0.0B (0.00%)
cuda:1: 0.0B (0.00%)


In [107]:
# Show the PyTorch build of the running kernel (a ROCm build per the output below).
torch.__version__

'2.1.0.dev20230303+rocm5.4.2'