# Balance Mixtral GPU Memory usage

## Goal

Can I have a more balanced GPU memory usage?

A more balanced split would allow using a bigger input size or batch size on submission.

## Imports

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
import gc
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Draw and immediately discard an empty figure before touching the defaults.
# NOTE(review): presumably a workaround so the notebook plotting backend is
# initialised before rcParams are changed — confirm it is still needed.
plt.plot()
plt.close('all')

# Notebook-wide plot defaults: wide figures, thick lines, large text.
# `plt.rcParams` and `mpl.rcParams` are the same object, so one update suffices.
mpl.rcParams.update({
    "figure.figsize": (20, 5),
    'lines.linewidth': 3,
    'font.size': 16,
})

## Code

In [None]:
# Quantization settings handed to `from_pretrained`: 4-bit weights using the
# "nf4" quant type with double quantization, float16 compute, and fp32 CPU
# offload enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Run the garbage collector first so tensors kept alive only by reference
# cycles are freed, and only then ask the CUDA caching allocator to release
# its unused blocks; in the opposite order the memory freed by the collector
# would stay cached.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Explicit module -> GPU placement: the embedding and decoder layers 0-13 on
# GPU 0, decoder layers 14-31 plus the final norm and the LM head on GPU 1.
# NOTE(review): presumably a copy of what `device_map="auto"` produced (the
# recorded runs report identical memory figures for both) — confirm.
auto_device_map = {
    'model.embed_tokens': 0,
    **{f'model.layers.{layer}': int(layer >= 14) for layer in range(32)},
    'model.norm': 1,
    'lm_head': 1,
}

def create_shared_device_map(transition_layer, base_device_map=None):
    """Build a device map that splits the model sequentially across two GPUs.

    Entries are taken in the order of ``base_device_map``; those whose
    position is ``<= transition_layer`` go to GPU 0 and the rest to GPU 1.
    In ``auto_device_map`` position 0 is the embedding, so ``transition_layer``
    equals the number of decoder layers placed on GPU 0 (e.g. 16 puts the
    embedding and layers 0-15 on GPU 0).

    Args:
        transition_layer: Last entry position (0-based) assigned to GPU 0.
        base_device_map: Mapping whose keys (module names) define the entries
            and their order. Its values are ignored. Defaults to the
            module-level ``auto_device_map``.

    Returns:
        A new dict mapping every key of ``base_device_map`` to 0 or 1.
    """
    if base_device_map is None:
        # Resolved at call time so the default always follows the global map.
        base_device_map = auto_device_map
    return {
        key: 0 if idx <= transition_layer else 1
        for idx, key in enumerate(base_device_map)
    }

def create_intertwined_device_map(base_device_map=None):
    """Build a device map that alternates consecutive entries between two GPUs.

    Entries are taken in the order of ``base_device_map``:

    * the first entry (the embedding in ``auto_device_map``) goes to GPU 1;
    * the last two entries (the final norm and LM head in ``auto_device_map``)
      go to GPU 0;
    * every entry in between goes to GPU ``position % 2``, so neighbouring
      decoder layers land on different GPUs.

    For the 35-entry ``auto_device_map`` the tail starts at position 33.

    Args:
        base_device_map: Mapping whose keys (module names) define the entries
            and their order. Its values are ignored. Defaults to the
            module-level ``auto_device_map``.

    Returns:
        A new dict mapping every key of ``base_device_map`` to 0 or 1.
    """
    if base_device_map is None:
        # Resolved at call time so the default always follows the global map.
        base_device_map = auto_device_map
    # Position of the first of the two trailing (non-layer) entries; derived
    # from the map size instead of being hard-coded for a 32-layer model.
    tail_start = len(base_device_map) - 2
    device_map = {}
    for idx, key in enumerate(base_device_map):
        if idx == 0:
            device_map[key] = 1
        elif idx >= tail_start:
            device_map[key] = 0
        else:
            device_map[key] = idx % 2
    return device_map

In [None]:
# Checkpoint location. Only the last assignment takes effect.
# NOTE(review): the first path is presumably the checkpoint location on a
# different machine — confirm before removing it.
model_path = '/mnt/hdd0/Kaggle/llm_prompt_recovery/models/mixtral-8x7b-instruct-v0.1-hf'
model_path = '/home/gbarbadillo/data/mixtral-8x7b-instruct-v0.1-hf/'

# Layer placement under test; swap in one of the commented alternatives to
# run the other configurations.
#selected_device_map = "auto"
#selected_device_map = create_shared_device_map(16)
selected_device_map = create_intertwined_device_map()

# Load the model with the quantization settings from `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=selected_device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

In [None]:
# Tokenizer loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Reuse the EOS token as padding token; this is needed to do batch inference.
tokenizer.pad_token_id = tokenizer.eos_token_id
gc.collect()

In [None]:
# Text-generation pipeline around the already-loaded model and tokenizer;
# `chat_with_mixtral` sends every prompt through it.
pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
)

def chat_with_mixtral(prompt, max_new_tokens=200, verbose=True, do_sample=False, temperature=0.7, top_p=0.95):
    """Generate a completion for ``prompt`` with the module-level ``pipe``.

    A prompt that does not already start with ``<s>[INST]`` is wrapped in the
    ``<s>[INST] ... [/INST]`` template first, and a notice is printed.

    Args:
        prompt: Text to send to the model.
        max_new_tokens: Forwarded to the pipeline call.
        verbose: When true, print the elapsed time and tokens per second.
        do_sample: When true, sample using ``temperature`` and ``top_p``;
            otherwise the pipeline is called with ``do_sample=False``.
        temperature: Only forwarded when ``do_sample`` is true.
        top_p: Only forwarded when ``do_sample`` is true.

    Returns:
        The generated text without the prompt, with every single and double
        quote character removed.
    """
    needs_template = not prompt.startswith('<s>[INST]')
    if needs_template:
        print('Formatting the prompt to Mixtral needs.')
        prompt = f'<s>[INST] {prompt} [/INST]'
    started_at = time.time()

    generation_options = (
        {'do_sample': True, 'temperature': temperature, 'top_p': top_p}
        if do_sample
        else {'do_sample': False}
    )

    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        # EOS is used as the padding token, see:
        # https://www.reddit.com/r/LocalLLaMA/comments/184g120/mistral_fine_tuning_eos_and_padding/
        # https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/discussions/106
        pad_token_id=tokenizer.eos_token_id,
        return_full_text=False,
        **generation_options,
    )
    # Strip every quote character from the first returned sequence.
    reply = re.sub(r'[\'"]', '', outputs[0]['generated_text'])
    if not verbose:
        return reply

    elapsed = time.time() - started_at
    # Throughput is measured on the cleaned reply, re-tokenized.
    token_count = len(tokenizer.tokenize(reply))
    print(f"Execution Time : {elapsed:.1f} s, tokens per second: {token_count/elapsed:.1f}")
    return reply

In [None]:
def print_gpu_memory():
    """Print the current and peak allocated memory of every CUDA device.

    Values come from ``torch.cuda.memory_allocated`` and
    ``torch.cuda.max_memory_allocated`` and are shown in units of
    1024**3 bytes (labelled "GB"), one line per device.
    """
    bytes_per_gb = 1024**3
    for gpu_index in range(torch.cuda.device_count()):
        allocated_gb = torch.cuda.memory_allocated(gpu_index) / bytes_per_gb
        peak_gb = torch.cuda.max_memory_allocated(gpu_index) / bytes_per_gb
        print(f'GPU {gpu_index} memory allocated: {allocated_gb:.1f} GB, max memory allocated: {peak_gb:.1f} GB')

## Experiments

In [None]:
# Generate the same short completion twice, then report GPU memory.
# NOTE(review): the repetition presumably separates warm-up cost from
# steady-state speed — confirm.
for _attempt in range(2):
    poem = chat_with_mixtral('write a poem about real madrid', max_new_tokens=25)
    print(poem)
print_gpu_memory()

```
# device_map='auto'
Execution Time : 2.9 s, tokens per second: 9.1
 In the heart of Spain, where the passion runs deep,
Lies a team called Real Madrid, a treasure to keep
GPU 0 memory allocated: 10.2 GB, max memory allocated: 10.3 GB
GPU 1 memory allocated: 13.1 GB, max memory allocated: 13.2 GB

# device_map=auto_device_map
Execution Time : 2.7 s, tokens per second: 9.5
 In the heart of Spain, where the passion runs deep,
Lies a team called Real Madrid, a treasure to keep
GPU 0 memory allocated: 10.2 GB, max memory allocated: 10.3 GB
GPU 1 memory allocated: 13.1 GB, max memory allocated: 13.2 GB

# device_map=create_shared_device_map(16)
Execution Time : 2.7 s, tokens per second: 9.6
 In the heart of Spain, where the passion runs deep,
Lies a team called Real Madrid, a treasure to keep
GPU 0 memory allocated: 11.7 GB, max memory allocated: 11.8 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 11.8 GB
Execution Time : 10.7 s, tokens per second: 9.4

# device_map=create_intertwined_device_map(),
Execution Time : 3.1 s, tokens per second: 8.5
 In the heart of Spain, where the passion runs deep,
Lies a team called Real Madrid, a treasure to keep
GPU 0 memory allocated: 11.7 GB, max memory allocated: 11.8 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 11.8 GB
Execution Time : 10.2 s, tokens per second: 9.9
```

In [None]:
# Load a long text; prefixes of it are used below as prompts of growing size.
with open('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/bible.txt') as bible_file:
    bible = bible_file.read()
# Show the first 100 characters as a quick sanity check.
print(bible[:100])

In [None]:
# Sweep over prompts of increasing length and record, for each one, the number
# of input tokens, the peak GPU memory summed over all devices (bytes) and the
# wall-clock inference time (seconds).
input_tokens, max_memory, inference_time = [], [], []

for i in range(1, 10):
    # Prompt made of the first i * 4000 characters of the text.
    text = bible[:i * 4000]
    input_tokens.append(len(tokenizer.tokenize(text)))
    print(f'Input tokens: {input_tokens[-1]}')
    # `max_memory_allocated` is a running peak; reset it so each measurement
    # reflects only this prompt and not an earlier, possibly larger, run.
    for device in range(torch.cuda.device_count()):
        torch.cuda.reset_peak_memory_stats(device)
    t0 = time.time()
    chat_with_mixtral(text, max_new_tokens=25)
    inference_time.append(time.time() - t0)
    print_gpu_memory()
    max_memory.append(sum(torch.cuda.max_memory_allocated(device) for device in range(torch.cuda.device_count())))

```
# device_map=create_shared_device_map(16)
Input tokens: 1129
Formatting the prompt to Mixtral needs.
Execution Time : 3.6 s, tokens per second: 7.3
GPU 0 memory allocated: 11.7 GB, max memory allocated: 12.0 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 11.9 GB
Input tokens: 2320
Formatting the prompt to Mixtral needs.
Execution Time : 3.9 s, tokens per second: 6.7
GPU 0 memory allocated: 11.7 GB, max memory allocated: 12.3 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 12.2 GB
Input tokens: 3543
Formatting the prompt to Mixtral needs.
Execution Time : 4.4 s, tokens per second: 5.9
GPU 0 memory allocated: 11.7 GB, max memory allocated: 12.6 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 12.5 GB
Input tokens: 4830
Formatting the prompt to Mixtral needs.
Execution Time : 5.1 s, tokens per second: 5.1
GPU 0 memory allocated: 11.7 GB, max memory allocated: 12.9 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 12.9 GB
Input tokens: 6046
Formatting the prompt to Mixtral needs.
Execution Time : 5.8 s, tokens per second: 4.5
GPU 0 memory allocated: 11.7 GB, max memory allocated: 13.2 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 13.2 GB
Input tokens: 7202
Formatting the prompt to Mixtral needs.
Execution Time : 6.4 s, tokens per second: 4.1
GPU 0 memory allocated: 11.7 GB, max memory allocated: 13.5 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 13.4 GB
Input tokens: 8389
Formatting the prompt to Mixtral needs.
Execution Time : 7.0 s, tokens per second: 3.7
GPU 0 memory allocated: 11.7 GB, max memory allocated: 13.8 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 13.7 GB
Input tokens: 9743
Formatting the prompt to Mixtral needs.
Execution Time : 8.0 s, tokens per second: 3.3
GPU 0 memory allocated: 11.7 GB, max memory allocated: 14.2 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 14.1 GB
Input tokens: 11073
Formatting the prompt to Mixtral needs.
Execution Time : 8.7 s, tokens per second: 3.0
GPU 0 memory allocated: 11.7 GB, max memory allocated: 14.5 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 14.4 GB


# device_map=create_intertwined_device_map(),
Input tokens: 1129
Formatting the prompt to Mixtral needs.
Execution Time : 3.9 s, tokens per second: 6.7
GPU 0 memory allocated: 11.7 GB, max memory allocated: 11.9 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 12.0 GB
Input tokens: 2320
Formatting the prompt to Mixtral needs.
Execution Time : 4.2 s, tokens per second: 6.2
GPU 0 memory allocated: 11.7 GB, max memory allocated: 12.2 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 12.3 GB
Input tokens: 3543
Formatting the prompt to Mixtral needs.
Execution Time : 4.7 s, tokens per second: 5.5
GPU 0 memory allocated: 11.7 GB, max memory allocated: 12.5 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 12.6 GB
Input tokens: 4830
Formatting the prompt to Mixtral needs.
Execution Time : 5.3 s, tokens per second: 4.9
GPU 0 memory allocated: 11.7 GB, max memory allocated: 12.9 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 12.9 GB
Input tokens: 6046
Formatting the prompt to Mixtral needs.
Execution Time : 6.0 s, tokens per second: 4.3
GPU 0 memory allocated: 11.7 GB, max memory allocated: 13.2 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 13.2 GB
Input tokens: 7202
Formatting the prompt to Mixtral needs.
Execution Time : 6.7 s, tokens per second: 3.9
GPU 0 memory allocated: 11.7 GB, max memory allocated: 13.4 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 13.5 GB
Input tokens: 8389
Formatting the prompt to Mixtral needs.
Execution Time : 7.4 s, tokens per second: 3.5
GPU 0 memory allocated: 11.7 GB, max memory allocated: 13.7 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 13.8 GB
Input tokens: 9743
Formatting the prompt to Mixtral needs.
Execution Time : 8.2 s, tokens per second: 3.2
GPU 0 memory allocated: 11.7 GB, max memory allocated: 14.1 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 14.2 GB
Input tokens: 11073
Formatting the prompt to Mixtral needs.
Execution Time : 9.0 s, tokens per second: 2.9
GPU 0 memory allocated: 11.7 GB, max memory allocated: 14.4 GB
GPU 1 memory allocated: 11.7 GB, max memory allocated: 14.5 GB
```

Both methods use the same memory, but the intertwined device map is slower. This suggests that the overhead of passing information between the GPUs outweighs the benefit of more even GPU utilization (and likely lower temperature).

However, on Kaggle the result was the opposite, so maybe I'm just observing random variations in inference time.

## Problem definition

I'm loading a Mixtral model on two gpus using the following code:

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,)
```

The problem is that the use of GPU memory is not balanced:

- GPU 0 memory allocated: 10.2 GB, max memory allocated: 10.3 GB
- GPU 1 memory allocated: 13.1 GB, max memory allocated: 13.2 GB

How could I have a more even use of GPU memory?

## TODO

- [ ] Does using `gradient_checkpointing=True` improve memory usage? I don't believe so, since I'm only doing inference.