# Instructlab local - 02 Test inference

## 1. vLLM inference

In [1]:
# Short alias -> Hugging Face Hub repository id for every checkpoint
# that this notebook can load.
models = dict(
    [
        # Mistral 7B v0.3 (base and instruct)
        ("mistral", "mistralai/Mistral-7B-v0.3"),
        ("mistral-instruct", "mistralai/Mistral-7B-Instruct-v0.3"),
        # Meta Llama 3 8B (base and instruct)
        ("llama3", "meta-llama/Meta-Llama-3-8B"),
        ("llama3-instruct", "meta-llama/Meta-Llama-3-8B-Instruct"),
        # Microsoft Phi-3 instruct models
        ("phi3-mini", "microsoft/Phi-3-mini-4k-instruct"),
        ("phi3-small", "microsoft/Phi-3-small-8k-instruct"),
        # Mixtral 8x7B Instruct repos (HQQ variants, per the repository names)
        ("mixtral-q3", "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-3bit-metaoffload-HQQ"),
        ("mixtral-q2", "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-2bit_g16_s128-HQQ"),
    ]
)

### 1.1 Mistral 7B instruct v0.3

In [2]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Checkpoint to load and the maximum sequence length handed to the engine.
# To try Llama 3 instead, use models["llama3-instruct"] with max_model_len = 8192.
model_name = models["mistral-instruct"]
max_model_len = 32768

print(f"Loading model {model_name}")

# Hugging Face tokenizer for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# vLLM engine: fp8 KV cache, 99% of GPU memory made available to vLLM,
# and the context length capped at max_model_len.
llm = LLM(
    model_name,
    kv_cache_dtype="fp8",
    gpu_memory_utilization=0.99,
    max_model_len=max_model_len,
)



Loading model mistralai/Mistral-7B-Instruct-v0.3
INFO 05-25 21:13:56 config.py:379] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop without scaling factors. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
INFO 05-25 21:13:56 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='mistralai/Mistral-7B-Instruct-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=fp8, quantization_param_path=None, device_config=cuda, decoding_config=



INFO 05-25 21:13:57 utils.py:660] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-25 21:13:57 selector.py:27] Using FlashAttention-2 backend.
INFO 05-25 21:13:58 weight_utils.py:199] Using model weights format ['*.safetensors']


model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

consolidated.safetensors:   0%|          | 0.00/14.5G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
pip install mistral_common==1.1.0

In [13]:
from transformers.utils.hub import cached_file 

# Resolve the path of "tokenizer.model.v3" for the currently selected
# `model_name`. With local_files_only=True only already-cached files are
# considered, so this raises OSError when the file is not in the local cache.
tokenizer_model_file = cached_file(
    model_name,
    "tokenizer.model.v3",
    local_files_only=True,
)

# Display the resolved path as the cell result.
tokenizer_model_file

OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like meta-llama/Meta-Llama-3-8B-Instruct is not the path to a directory containing a file named tokenizer.model.v3.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [3]:
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

# Build the mistral_common tokenizer from the "tokenizer.model.v3" file whose
# path is held in `tokenizer_model_file` (resolved by the cached_file cell).
# The previous code read `mistral_models_path`, which no cell defines.
#
# The result is bound to `mistral_tokenizer` rather than `tokenizer` so that
# the Hugging Face tokenizer created with AutoTokenizer stays available for
# the later `tokenizer.apply_chat_template(...)` call.
mistral_tokenizer = MistralTokenizer.from_file(tokenizer_model_file)

"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

In [15]:
# Sampling configuration shared by the generation cells of this notebook.
max_tokens = 1024
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,
    max_tokens=max_tokens,
)

# Send a single raw-text prompt (no chat template is applied here).
requests_results = llm.generate(
    "quelles sont les principales ouvertures des échecs ?",
    sampling_params=sampling_params,
)

# Only one prompt was submitted: keep its first result and first completion.
result = requests_results[0]
output = result.outputs[0]

# Report the generated token count with the arrival-to-finish time taken
# from the request metrics, then the generated text itself.
print(f"{len(output.token_ids)} tokens generated in {result.metrics.finished_time-result.metrics.arrival_time} sec")
print(output.text)

Processed prompts: 100%|██████████| 1/1 [00:19<00:00, 19.45s/it]

1024 tokens generated in 19.449058294296265 sec
 (1)
Les échecs sont des pièces majeures de l'ensemble des pièces de l'échiquier. Les principales ouvertures des échecs sont les suivantes :

1. Ouverture d'Écossaise : 1.e4 e5 2.Ff3 Fc5
2. Ouverture italienne : 1.e4 e5 2.e4 Fc5
3. Ouverture sicilienne : 1.e4 c5
4. Ouverture française : 1.e4 e6
5. Ouverture espagnole : 1.e4 e5 2.Nf3 Nc6
6. Ouverture anglaise : 1.c4 e5
7. Ouverture royale : 1.d4 d5 2.c4

Il est important de noter que ces ouvertures sont considérées comme des ouvertures "classiques" et qu'il existe de nombreuses autres ouvertures qui peuvent être utilisées. Il est également important de comprendre les principes de base de l'ouverture, tels que la maîtrise du centre, la protection des pièces et la création de possibilités d'attaque.

Il est également important de noter que les ouvertures des échecs ne sont pas figées et qu'il est possible de jouer des parties très différentes avec la même ouverture, en fonction des choix fai




In [10]:
# Print the whole `output` object (bound by an earlier cell) rather than only
# its `.text`, to inspect every field it carries.
print(output)

[RequestOutput(request_id=1, prompt='quelles sont les principales ouvertures des échecs ?', prompt_token_ids=[128000, 447, 37907, 15132, 3625, 82512, 6033, 1653, 1439, 951, 4046, 331, 54817, 949], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=" ( French )\nWhat are the main openings in chess?\nThere are many openings in chess, and it's difficult to give a complete list. However, here are some of the most popular and well-known openings:\n1. Ruy Lopez: This is one of the oldest and most popular openings, named after the 16th-century Spanish priest Ruy Lopez de Segura. It starts with the moves 1.e4 e5 2.Nf3 Nc6 3.Bb5.\n2. Sicilian Defense: This is one of the most aggressive and complex openings, starting with the moves 1.e4 c5. It's a favorite among many top players.\n3. Italian Game: This opening starts with the moves 1.e4 e5 2.Nf3 Nc6 3.Bc4, aiming to quickly develop the bishop and knight.\n4. King's Pawn Opening: This is one of the simplest and most common openings, st

In [7]:
# Print each chunk without a trailing newline, flushing stdout every time so
# the chunks appear as one continuous text.
# NOTE(review): `response_generator` is not defined in any cell visible in this
# notebook — confirm where it is supposed to come from before running this cell.
for response_chunk in response_generator:
    print(response_chunk, end="", flush=True)

[0;31mSignature:[0m
[0mLLM[0m[0;34m.[0m[0m__init__[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenizer[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenizer_mode[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mskip_tokenizer_init[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrust_remote_code[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtensor_parallel_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mquantization[0m[0;34m:[0m [0mOptiona

In [None]:
import time
prompt = "Quels sont les avantages du Crédit Mutuel ?"

# Every conversation shares the same system instruction; only the user
# question changes, so the batch is built from a list of questions instead of
# six copy-pasted message lists.
system_prompt = "Tu es un assistant utile et professionnel qui répond toujours en français."
questions = [
    prompt,
    "Quels sont les avantages du Crédit Agricole ?",
    "Quels sont les avantages de la Société Générale ?",
    "Quels sont les avantages de la BNP ?",
    "Quels sont les avantages de la Banque populaire ?",
    "Quels sont les avantages de la Caisse d'épargne ?",  # spelling fixed (was "Caise")
]

# One [system, user] conversation per question.
messages = [
    [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    for question in questions
]

# Render every conversation to a prompt string with the model's chat template.
# NOTE(review): this assumes `tokenizer` is the Hugging Face tokenizer loaded
# with AutoTokenizer and that the selected model's chat template accepts a
# "system" role — confirm for the chosen checkpoint.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Time the whole batch with a monotonic clock: perf_counter() is meant for
# measuring intervals and is not affected by system clock adjustments.
start_time = time.perf_counter()
outputs = llm.generate(text, sampling_params)
end_time = time.perf_counter()

# Print each prompt with its first completion and add up the generated tokens.
# A dedicated loop variable is used so the `prompt` and `output` names bound
# by earlier code are not overwritten.
tokenscount = 0
for request_output in outputs:
    completion = request_output.outputs[0]
    tokenscount += len(completion.token_ids)
    print(f"Prompt: {request_output.prompt!r}, Generated text: {completion.text!r}")

# Throughput over the whole batch: generated tokens per wall-clock second.
print(f"Performance: {int(tokenscount/(end_time-start_time))} tokens/sec")