Load HuggingFace `transformers` models over multiple GPUs with a custom `device_map`.
First, explore how `accelerate` calculates its `max_memory` (a mapping between devices and their maximum available memory), following https://github.com/huggingface/accelerate/blob/v1.0.0rc1/src/accelerate/utils/modeling.py#L842C37-L842C63

In [1]:
import torch

from dsi import print_gpu_memory

print_gpu_memory()

  from .autonotebook import tqdm as notebook_tqdm


The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total


In [7]:
from transformers import AutoModelForCausalLM
from dsi import garbage_collect


def test_memory_leak(model_name="gpt2"):
    print_gpu_memory()
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="balanced_low_0", cache_dir="/workspace/hf_cache")
    print_gpu_memory()
    print("Collecting garbage...")
    garbage_collect()
    print_gpu_memory()
    print("Deleting model...")
    del model
    garbage_collect()
    print_gpu_memory()

test_memory_leak(model_name="gpt2")

The current device is 0
GPU 0: 39.69 GB free, 44.35 GB total
GPU 1: 37.58 GB free, 44.35 GB total
GPU 2: 37.58 GB free, 44.35 GB total
GPU 3: 37.58 GB free, 44.35 GB total
GPU 4: 38.06 GB free, 44.35 GB total
Loading model...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 43.90 GB free, 44.35 GB total
GPU 2: 43.90 GB free, 44.35 GB total
GPU 3: 43.92 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total
Collecting garbage...
Collecting garbage...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 43.90 GB free, 44.35 GB total
GPU 2: 43.90 GB free, 44.35 GB total
GPU 3: 43.92 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total
Deleting model...
Collecting garbage...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total


In [3]:
# # login to huggingface using python and getpass
# from getpass import getpass
# import os

# if not os.environ.get("HUGGINGFACE_TOKEN"):
#     huggingface_token = getpass("Enter your HuggingFace token: ")
#     os.environ["HUGGINGFACE_TOKEN"] = huggingface_token
    
# !huggingface-cli login --token $HUGGINGFACE_TOKEN

In [4]:
# test_memory_leak(model_name="meta-llama/Meta-Llama-3.1-70B-Instruct")

In [12]:
from accelerate import infer_auto_device_map, init_empty_weights


def test_loading_on_gpu_0_only(model_name):
    garbage_collect()
    print_gpu_memory()
    print("Loading model with auto device map...")
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache", device_map="auto")
        max_memory = infer_auto_device_map(model)
        print(f"{max_memory=}")
        try:
            print(f"{model.device_map=}")
        except AttributeError:
            print("model.device_map does not exist")
        try:
            print(f"{model.hf_device_map=}")
        except AttributeError:
            print("model.hf_device_map does not exist")
    print_gpu_memory()
    del model
    garbage_collect()
    print_gpu_memory()
    print("Loading model without specifying device map...")
    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache")
        max_memory = infer_auto_device_map(model)
        print(f"{max_memory=}")
        try:
            print(f"{model.device_map=}")
        except AttributeError:
            print("model.device_map does not exist")
        try:
            print(f"{model.hf_device_map=}")
        except AttributeError:
            print("model.hf_device_map does not exist")
    print_gpu_memory()
    max_memory = {i: 0 for i in range(1, torch.cuda.device_count())}
    max_memory[0] = f"{torch.cuda.mem_get_info(0)[0] / 1024 / 1024 / 1024:.2f} GB"
    print(f"{max_memory=}")
    device_map = infer_auto_device_map(model, max_memory=max_memory)
    print(f"{device_map=}")
    del model
    garbage_collect()
    print_gpu_memory()
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="/workspace/hf_cache", device_map=device_map)
    print_gpu_memory()
    del model
    garbage_collect()
    print_gpu_memory()


test_loading_on_gpu_0_only(model_name="meta-llama/Meta-Llama-3.1-8B-Instruct")

Collecting garbage...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total
Loading model with auto device map...


Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.59s/it]


max_memory=OrderedDict([('', 0)])
model.device_map does not exist
model.hf_device_map={'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 1, 'model.layers.4': 1, 'model.layers.5': 1, 'model.layers.6': 1, 'model.layers.7': 1, 'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 2, 'model.layers.12': 2, 'model.layers.13': 2, 'model.layers.14': 2, 'model.layers.15': 2, 'model.layers.16': 2, 'model.layers.17': 2, 'model.layers.18': 2, 'model.layers.19': 3, 'model.layers.20': 3, 'model.layers.21': 3, 'model.layers.22': 3, 'model.layers.23': 3, 'model.layers.24': 3, 'model.layers.25': 3, 'model.layers.26': 3, 'model.layers.27': 4, 'model.layers.28': 4, 'model.layers.29': 4, 'model.layers.30': 4, 'model.layers.31': 4, 'model.norm': 4, 'model.rotary_emb': 4, 'lm_head': 4}
The current device is 0
GPU 0: 39.69 GB free, 44.35 GB total
GPU 1: 37.58 GB free, 44.35 GB total
GPU 2: 37.58 GB free, 44.35 GB total
GPU 3

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  5.47it/s]


max_memory=OrderedDict([('', 0)])
model.device_map does not exist
model.hf_device_map does not exist
The current device is 0
GPU 0: 44.08 GB free, 44.35 GB total
GPU 1: 44.08 GB free, 44.35 GB total
GPU 2: 44.08 GB free, 44.35 GB total
GPU 3: 44.08 GB free, 44.35 GB total
GPU 4: 44.08 GB free, 44.35 GB total
max_memory={1: 0, 2: 0, 3: 0, 4: 0, 0: '44.08 GB'}
device_map=OrderedDict([('', 0)])
Collecting garbage...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total


Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.46s/it]


The current device is 0
GPU 0: 14.17 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total
Collecting garbage...
The current device is 0
GPU 0: 44.09 GB free, 44.35 GB total
GPU 1: 44.09 GB free, 44.35 GB total
GPU 2: 44.09 GB free, 44.35 GB total
GPU 3: 44.09 GB free, 44.35 GB total
GPU 4: 44.09 GB free, 44.35 GB total


In [6]:
def get_device_map_with_only_gpu_0(model):
    max_memory = {i: 0 for i in range(1, torch.cuda.device_count())}
    max_memory[0] = f"{torch.cuda.mem_get_info(0)[0] / 1024 / 1024 / 1024:.2f} GB"
    device_map = infer_auto_device_map(model, max_memory=max_memory)
    return device_map

get_device_map_with_only_gpu_0(model)

NameError: name 'model' is not defined