# Inference time comparison
I was wondering whether using Hugging Face Transformers' built-in model parallelism feature slows down inference. This notebook compares the inference time needed to process the same prompt with an LLM running on a single GPU versus multiple GPUs in parallel.

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Benchmark configuration shared by both runs.
hf_model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # Hugging Face model id
precision = "16"  # weight precision in bits: "4", "8", "16" or "32"
max_tokens = 4096  # upper bound on the number of tokens generated by the model
temperature = 1e-10  # set near 0 to make sure the same output is generated in both runs

In [3]:
# Tokenizer that matches the benchmarked model.
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)

# Dummy generation task which takes some time.
messages = [
    {"role": "user", "content": "Count from 1 to 200. DO NOT LEAVE OUT ANY NUMBER."},
]

# Render the chat template to a tensor, then move it to the GPU.
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
encodeds = encodeds.to('cuda')

In [4]:
import functools
import time


def measure_time(func):
    """Decorator that prints the wall-clock running time of each call.

    The wrapped function's return value is passed through unchanged. If the
    wrapped function raises, the exception propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same name and docstring as ``func`` that prints
        ``"<name> running time is <seconds> seconds"`` after every call.
    """
    # NOTE(review): only host-side wall-clock time is measured; confirm that
    # the timed GPU calls have finished their device work before returning.
    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def time_it(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        time_started = time.perf_counter()
        result = func(*args, **kwargs)
        seconds = round(time.perf_counter() - time_started, 4)
        print(
            "{execute} running time is {sec} seconds"
            .format(execute=func.__name__, sec=seconds)
        )
        return result
    return time_it

In [5]:
@measure_time
def load_hf_LM_model(model_id, precision, device_map='auto'):
    """Load a causal language model at the requested weight precision.

    Args:
        model_id: Model identifier passed to
            ``AutoModelForCausalLM.from_pretrained``.
        precision: Weight precision in bits, one of "4", "8", "16" or "32".
            Integers (4, 8, 16, 32) are accepted as well.
        device_map: Forwarded unchanged to ``from_pretrained``.

    Returns:
        The model returned by ``AutoModelForCausalLM.from_pretrained``.

    Raises:
        ValueError: If ``precision`` is not one of the supported values.
    """
    # Imported here so this cell also works on a fresh kernel: the notebook
    # only imports torch in a later cell, but the 16-bit option needs it.
    import torch

    # Extra from_pretrained keyword arguments for each supported precision.
    precision_kwargs = {
        "32": {},
        "16": {"torch_dtype": torch.float16},
        "8": {"load_in_8bit": True},
        "4": {"load_in_4bit": True},
    }
    try:
        extra_kwargs = precision_kwargs[str(precision)]
    except KeyError:
        raise ValueError("Invalid precision value") from None
    return AutoModelForCausalLM.from_pretrained(
        model_id, device_map=device_map, **extra_kwargs
    )

In [6]:
# First run: place the whole model on a single GPU.
device_map = 'cuda:0'
model = load_hf_LM_model(model_id=hf_model_id, precision=precision, device_map=device_map)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

load_hf_LM_model running time is 4.8644 seconds


In [7]:
@measure_time
def inference():
    """Generate a reply for the prepared prompt and return the token ids.

    Reads the notebook-level ``model``, ``encodeds``, ``tokenizer`` and
    ``max_tokens``, so it always runs against whichever model is currently
    loaded.

    Greedy decoding (``do_sample=False``) is used so that both runs produce
    the same output. This replaces sampling with a near-zero temperature,
    which still goes through the random sampling path.

    Returns:
        The token ids returned by ``model.generate``.
    """
    return model.generate(
        encodeds,
        max_new_tokens=max_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

generated_ids = inference()

inference running time is 17.2025 seconds


In [8]:
# Turn the generated token ids back into text.
decoded = tokenizer.batch_decode(generated_ids)
# Keep only what follows the last instruction marker, then strip the
# end-of-sequence tag.
reply = decoded[0].rpartition('[/INST]')[2]
reply = reply.replace('</s>', '')
reply

' 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200.'

In [9]:
import gc
import torch

# Drop the notebook's reference to the model, collect garbage, then ask
# PyTorch to empty its CUDA cache before the model is loaded again.
del model
gc.collect()
torch.cuda.empty_cache()

In [10]:
# Second run: device map 'auto', which splits the model across two GPUs.
device_map = 'auto'
model = load_hf_LM_model(model_id=hf_model_id, precision=precision, device_map=device_map)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

load_hf_LM_model running time is 4.4114 seconds


In [11]:
# Run the same timed generation again; inference() reads the notebook-level
# `model`, which now refers to the model loaded with device_map='auto'.
generated_ids = inference()

inference running time is 17.5554 seconds


In [12]:
# Turn the generated token ids back into text.
decoded = tokenizer.batch_decode(generated_ids)
# Keep only what follows the last instruction marker, then strip the
# end-of-sequence tag.
reply = decoded[0].rpartition('[/INST]')[2]
reply = reply.replace('</s>', '')
reply

' 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200.'

Inference time is nearly the same (17.20 s on one GPU vs. 17.56 s with the model split across GPUs), so model parallelism seems to be implemented quite efficiently, at least for this use case.