Load HuggingFace `transformers` models over multiple GPUs with a custom `device_map`.
First, explore how `accelerate` calculates its `max_memory` (a mapping between devices and their maximum available memory), following https://github.com/huggingface/accelerate/blob/v1.0.0rc1/src/accelerate/utils/modeling.py#L842C37-L842C63

In [1]:
import torch
from transformers import AutoModelForCausalLM
from dsi import garbage_collect
from dsi import print_gpu_memory

# Baseline snapshot of per-GPU free/total memory before any model is loaded.
print_gpu_memory()

  from .autonotebook import tqdm as notebook_tqdm


The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total


In [2]:
def test_memory_leak(model_name="gpt2"):
    print_gpu_memory()
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="balanced_low_0", cache_dir="/workspace/hf_cache")
    print_gpu_memory()
    print("Collecting garbage...")
    garbage_collect()
    print_gpu_memory()
    print("Deleting model...")
    del model
    garbage_collect()
    print_gpu_memory()

# Smoke-test with a small model. In the recorded output, free memory on every
# GPU is back at 44.09 GB once the model has been deleted.
test_memory_leak(model_name="gpt2")

The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total
Loading model...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 43.90 GB free, 44.35 GB total
GPU 2: 43.90 GB free, 44.35 GB total
GPU 3: 43.92 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total
Collecting garbage...
Collecting garbage...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 43.90 GB free, 44.35 GB total
GPU 2: 43.90 GB free, 44.35 GB total
GPU 3: 43.92 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total
Deleting model...
Collecting garbage...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total


In [3]:
# # login to huggingface using python and getpass
# from getpass import getpass
# import os

# if not os.environ.get("HUGGINGFACE_TOKEN"):
#     huggingface_token = getpass("Enter your HuggingFace token: ")
#     os.environ["HUGGINGFACE_TOKEN"] = huggingface_token
    
# !huggingface-cli login --token $HUGGINGFACE_TOKEN

In [4]:
# test_memory_leak(model_name="meta-llama/Meta-Llama-3.1-70B-Instruct")

# Load on GPU 0 only

In [5]:
# from accelerate import infer_auto_device_map, init_empty_weights


# def get_device_map_with_only_gpu_0(model_name):
#     with init_empty_weights():
#         model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache")
#         max_memory = {i: 0 for i in range(1, torch.cuda.device_count())}
#         max_memory[0] = f"{torch.cuda.mem_get_info(0)[0] / 1024 / 1024 / 1024:.2f} GB"
#         return infer_auto_device_map(model, max_memory=max_memory)


# def test_loading(model_name):
#     garbage_collect()
#     print_gpu_memory()
#     print("Loading model with auto device map...")
#     with init_empty_weights():
#         model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache", device_map="auto")
#         max_memory = infer_auto_device_map(model)
#         print(f"{max_memory=}")
#         try:
#             print(f"{model.device_map=}")
#         except AttributeError:
#             print("model.device_map does not exist")
#         try:
#             print(f"{model.hf_device_map=}")
#         except AttributeError:
#             print("model.hf_device_map does not exist")
#     print_gpu_memory()
#     del model
#     garbage_collect()
#     print_gpu_memory()
#     print("Loading model without specifying device map...")
#     with init_empty_weights():
#         model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache")
#         max_memory = infer_auto_device_map(model)
#         print(f"{max_memory=}")
#         try:
#             print(f"{model.device_map=}")
#         except AttributeError:
#             print("model.device_map does not exist")
#         try:
#             print(f"{model.hf_device_map=}")
#         except AttributeError:
#             print("model.hf_device_map does not exist")
#         print_gpu_memory()
#     max_memory = {i: 0 for i in range(1, torch.cuda.device_count())}
#     max_memory[0] = f"{torch.cuda.mem_get_info(0)[0] / 1024 / 1024 / 1024:.2f} GB"
#     print(f"{max_memory=}")
#     device_map = infer_auto_device_map(model, max_memory=max_memory)
#     print(f"{device_map=}")
#     del model
#     garbage_collect()
#     print_gpu_memory()
#     model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache", device_map=device_map)
#     print_gpu_memory()
#     del model
#     garbage_collect()
#     print_gpu_memory()


# test_loading(model_name="meta-llama/Meta-Llama-3.1-8B-Instruct")

In [6]:
from dsi import get_device_map_without_gpu_0


def test_loading_on_all_gpus_except_0(model_name, dtype, load_in_8bit):
    garbage_collect()
    print_gpu_memory()
    device_map = get_device_map_without_gpu_0(model_name, dtype=dtype, load_in_8bit=load_in_8bit)
    print(f"{device_map=}")
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache", device_map=device_map)
    print_gpu_memory()
    del model
    garbage_collect()
    print_gpu_memory()


# Shared settings for the loading tests; a later cell reads `dtype` and
# `load_in_8bit` again, so these names must stay defined.
dtype = torch.float16
load_in_8bit = True

# NOTE(review): the recorded run of this call stopped with a CUDA out-of-memory
# error reported for GPU 0 while checkpoint shards were being loaded.
test_loading_on_all_gpus_except_0(model_name="meta-llama/Meta-Llama-3.1-70B-Instruct", dtype=dtype, load_in_8bit=load_in_8bit)

Collecting garbage...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards:  63%|██████▎   | 19/30 [00:26<00:15,  1.37s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 44.35 GiB of which 143.38 MiB is free. Process 2046325 has 44.20 GiB memory in use. Of the allocated memory 43.85 GiB is allocated by PyTorch, and 53.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from dsi import get_device_map_with_only_gpu_0


def test_loading_on_gpu_0_only(model_name, dtype, load_in_8bit):
    garbage_collect()
    print_gpu_memory()
    device_map = get_device_map_with_only_gpu_0(model_name, dtype=dtype, load_in_8bit=load_in_8bit)
    print(f"{device_map=}")
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache", device_map=device_map)
    print_gpu_memory()
    del model
    garbage_collect()
    print_gpu_memory()

# Relies on `dtype` and `load_in_8bit` assigned in an earlier cell; run that cell first.
test_loading_on_gpu_0_only(model_name="meta-llama/Meta-Llama-3.1-8B-Instruct", dtype=dtype, load_in_8bit=load_in_8bit)

# Load in 8-bit

In [None]:
from transformers import BitsAndBytesConfig

# Start from a clean state and record memory before the load.
garbage_collect()
print_gpu_memory()
# Passing `load_in_8bit=True` directly is deprecated; the library asks for the
# setting to be wrapped in a BitsAndBytesConfig and passed as `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    cache_dir="/workspace/hf_cache",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
# `model` is intentionally kept alive here; this report shows the loaded footprint.
print_gpu_memory()