# Instructlab local - 02 Test inference

## 1. VLLM inference

In [1]:
# Short alias -> Hugging Face repository id of each model that can be tested here.
models = {
    # Mistral 7B v0.3 (base and instruct)
    "mistral": "mistralai/Mistral-7B-v0.3",
    "mistral-instruct": "mistralai/Mistral-7B-Instruct-v0.3",
    # Meta Llama 3 8B (base and instruct)
    "llama3": "meta-llama/Meta-Llama-3-8B",
    "llama3-instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
    # Microsoft Phi-3 instruct models
    "phi3-mini": "microsoft/Phi-3-mini-4k-instruct",
    "phi3-small": "microsoft/Phi-3-small-8k-instruct",
    # Mixtral 8x7B instruct, HQQ variants published by mobiuslabsgmbh
    "mixtral-q3": "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-3bit-metaoffload-HQQ",
    "mixtral-q2": "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-2bit_g16_s128-HQQ",
}

**IMPORTANT: always set the local download cache directory explicitly**

In [2]:
from pathlib import Path

# Local directory passed to the loaders below as the model download cache.
DOWNLOAD_CACHE_DIR = "/models/huggingface/transformers"

# Report whether the directory is present: "OK" when it exists, "KO" otherwise.
cache_dir_status = "OK" if Path(DOWNLOAD_CACHE_DIR).exists() else "KO"
print(f"Download cache dir: {DOWNLOAD_CACHE_DIR} {cache_dir_status}")

Download cache dir: /models/huggingface/transformers OK


In [6]:
# Once the model is loaded, you can check the load/download configuration of vLLM with:
# llm.llm_engine.load_config

### 1.1 Mistral 7B instruct v0.3

***TEMPORARY BUG FIX for Mistral 7B v0.3 with vLLM v0.4.2***: manually apply the changes from the pull request below to the two files listed after it.

https://github.com/vllm-project/vllm/pull/5005

vi /workspace/instructlab-local/.venv/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py

vi /workspace/instructlab-local/.venv/lib/python3.10/site-packages/vllm/model_executor/model_loader/weight_utils.py

In [3]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Repository id of the model under test, looked up in the `models` mapping.
model_name = models["mistral-instruct"]

print(f"Loading model {model_name}")

# Pass the cache directory explicitly so the tokenizer files are stored in the
# same local cache as the model weights rather than in the default
# Hugging Face cache location.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)

# Create the vLLM engine:
# - kv_cache_dtype="fp8" stores the KV cache in fp8 (smaller memory footprint,
#   possible slight accuracy drop, as reported by the vLLM startup log);
# - gpu_memory_utilization=0.99 lets vLLM use almost the whole GPU; decrease it
#   if the cell runs out of memory;
# - download_dir makes vLLM download the weights into the explicit cache.
llm = LLM(
    model_name,
    kv_cache_dtype="fp8",
    gpu_memory_utilization=0.99,
    download_dir=DOWNLOAD_CACHE_DIR,
)



Loading model mistralai/Mistral-7B-Instruct-v0.3
INFO 05-26 15:24:51 config.py:379] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop without scaling factors. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
INFO 05-26 15:24:51 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='mistralai/Mistral-7B-Instruct-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir='/models/huggingface/transformers', load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=fp8, quantization_param_path=None, device



INFO 05-26 15:24:51 utils.py:660] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-26 15:24:51 selector.py:27] Using FlashAttention-2 backend.
INFO 05-26 15:24:52 weight_utils.py:200] Using model weights format ['*.safetensors']
INFO 05-26 15:25:53 model_runner.py:175] Loading model weights took 13.5083 GB
INFO 05-26 15:25:58 gpu_executor.py:114] # GPU blocks: 6129, # CPU blocks: 4096
INFO 05-26 15:25:58 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-26 15:25:58 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-26 15:26:03 model_runner.py:1017] Graph capturing finishe

In [None]:
pip install mistral_common==1.1.0

In [14]:
from transformers.utils.hub import cached_file

# Resolve the local path of the model's "tokenizer.model.v3" file inside the
# explicit download cache.
tokenizer_model_file = cached_file(
    path_or_repo_id=model_name,
    filename="tokenizer.model.v3",
    cache_dir=DOWNLOAD_CACHE_DIR,
)
tokenizer_model_file

tokenizer.model.v3:   0%|          | 0.00/587k [00:00<?, ?B/s]

'/models/huggingface/transformers/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/83e9aa141f2e28c82232fea5325f54edf17c43de/tokenizer.model.v3'

In [15]:
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

# Build a mistral_common tokenizer from the file path held in `tokenizer_model_file`.
# NOTE(review): this rebinds `tokenizer`, which the model-loading cell assigned from
# AutoTokenizer.from_pretrained; every cell executed after this one sees the
# MistralTokenizer instead.
tokenizer = MistralTokenizer.from_file(tokenizer_model_file)

In [18]:
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

# Single-turn conversation: one user message, no system prompt.
user_message = UserMessage(content="quelles sont les principales ouvertures des échecs ?")
completion_request = ChatCompletionRequest(messages=[user_message])

# Encode the request with the Mistral tokenizer and display the resulting prompt text.
tokens = tokenizer.encode_chat_completion(completion_request)
tokens.text

'<s>[INST]▁quelles▁sont▁les▁principales▁ouvertures▁des▁échecs▁?[/INST]'

In [19]:
# Sampling settings; `sampling_params` is reused by the batch generation cell below.
max_tokens = 1024
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,
    max_tokens=max_tokens,
)

# Give vLLM the token ids produced by mistral_common instead of `tokens.text`:
# the text form already contains the "<s>" marker and "▁" word-boundary symbols,
# so passing it as a string prompt would make vLLM tokenize it a second time
# with its own tokenizer.
requests_results = llm.generate(
    prompt_token_ids=[tokens.tokens],
    sampling_params=sampling_params,
)
result = requests_results[0]
output = result.outputs[0]

# Generation time as measured by vLLM's own request metrics.
elapsed = result.metrics.finished_time-result.metrics.arrival_time
print(f"{len(output.token_ids)} tokens generated in {elapsed} sec")
print(output.text)

Processed prompts: 100%|██████████| 1/1 [00:11<00:00, 11.49s/it]

628 tokens generated in 11.488973617553711 sec
 Les principales ouvertures des échecs, également appelées systèmes d'ouverture, sont :

1. Défense française (1. e4 e6)
2. Défense sicilienne (1. e4 c5)
3. Défense indienne (1. d4 Cf6)
4. Défense espagnole (1. e4 e5 2. Cf3 Cc6)
5. Défense Caro-Kann (1. e4 c6)
6. Défense Grünfeld (1. d4 Cf6 2. c4 g6 3. Cc3 d5)
7. Défense russe (1. d4 d5)
8. Défense scandinave (1. e4 d5)
9. Défense benoni (1. d4 Cf6 2. c4 c5)
10. Défense philidorienne (1. e4 e5 2. Cf3 d6 3. d4)
11. Défense du roi (1. d4 d5 2. c4 dxc4 3. a4)
12. Défense sicilienne, variante Najdorf (1. e4 c5 2. d4 cxd4 3. Cc3 dxc3 4. Cxc3 a6)
13. Défense sicilienne, variante Scheveningen (1. e4 c5 2. Nf3 Nc6 3. d4 cxd4 4. Nxd4 Nf6 5. Nc3 a6)
14. Défense sicilienne, variante Dragon (1. e4 c5 2. Nf3 Nc6 3. d4 cxd4 4. Nxd4 Nf6 5. Nc3 g6 6. Be3 Gg7 7. Fd3 O-O 8. O-O d5)
15. Défense sicilienne, variante Najdorf, variante de la Main-Pari (1. e4 c5 2. Nf3 Nc6 3. d4 cxd4 4. Nxd4 Nf6 5. Nc3 a6 6. Be3




In [10]:
# Print the `output` object itself (not only its generated text) for inspection.
print(output)

[RequestOutput(request_id=1, prompt='quelles sont les principales ouvertures des échecs ?', prompt_token_ids=[128000, 447, 37907, 15132, 3625, 82512, 6033, 1653, 1439, 951, 4046, 331, 54817, 949], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=" ( French )\nWhat are the main openings in chess?\nThere are many openings in chess, and it's difficult to give a complete list. However, here are some of the most popular and well-known openings:\n1. Ruy Lopez: This is one of the oldest and most popular openings, named after the 16th-century Spanish priest Ruy Lopez de Segura. It starts with the moves 1.e4 e5 2.Nf3 Nc6 3.Bb5.\n2. Sicilian Defense: This is one of the most aggressive and complex openings, starting with the moves 1.e4 c5. It's a favorite among many top players.\n3. Italian Game: This opening starts with the moves 1.e4 e5 2.Nf3 Nc6 3.Bc4, aiming to quickly develop the bishop and knight.\n4. King's Pawn Opening: This is one of the simplest and most common openings, st

In [7]:
# Print each chunk yielded by `response_generator` back-to-back (no newline added),
# flushing stdout after every chunk.
# NOTE(review): `response_generator` is not defined in any cell visible in this
# notebook — confirm where it comes from; otherwise this cell raises NameError
# on a fresh kernel.
for response_chunk in response_generator:
    print(response_chunk, end="", flush=True)

[0;31mSignature:[0m
[0mLLM[0m[0;34m.[0m[0m__init__[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenizer[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenizer_mode[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mskip_tokenizer_init[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrust_remote_code[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtensor_parallel_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mquantization[0m[0;34m:[0m [0mOptiona

In [None]:
import time

from transformers import AutoTokenizer

# Every conversation shares the same system prompt; only the user question changes.
system_prompt = "Tu es un assistant utile et professionnel qui répond toujours en français."
prompt = "Quels sont les avantages du Crédit Mutuel ?"
user_prompts = [
    prompt,
    "Quels sont les avantages du Crédit Agricole ?",
    "Quels sont les avantages de la Société Générale ?",
    "Quels sont les avantages de la BNP ?",
    "Quels sont les avantages de la Banque populaire ?",
    "Quels sont les avantages de la Caisse d'épargne ?",
]

# One [system, user] conversation per question.
# NOTE(review): confirm that the chat template of the loaded model accepts a
# "system" role.
messages = [
    [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    for user_prompt in user_prompts
]

# `tokenizer` was rebound to a mistral_common MistralTokenizer earlier in the
# notebook, so load a dedicated Hugging Face tokenizer to render the chat
# template (with the explicit download cache directory).
chat_tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)

# Render each conversation separately: this yields one prompt string per
# conversation without depending on batched chat-template support.
text = [
    chat_tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    for conversation in messages
]

# Time the whole batch with a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()
outputs = llm.generate(text, sampling_params)
end_time = time.perf_counter()

# Print every prompt/completion pair and count the generated tokens.
# Dedicated loop names keep `prompt` and the earlier `output` variable intact.
tokenscount = 0
for request_output in outputs:
    completion = request_output.outputs[0]
    tokenscount += len(completion.token_ids)
    print(f"Prompt: {request_output.prompt!r}, Generated text: {completion.text!r}")

# Aggregate throughput over the whole batch.
print(f"Performance: {int(tokenscount/(end_time-start_time))} tokens/sec")