# Instructlab local - 02 Test inference

In [1]:
models = {
    "mistral":"mistralai/Mistral-7B-v0.3",
    "mistral-instruct":"mistralai/Mistral-7B-Instruct-v0.3",
    "llama3":"meta-llama/Meta-Llama-3-8B",
    "llama3-instruct":"meta-llama/Meta-Llama-3-8B-Instruct",
    "phi3-mini":"microsoft/Phi-3-mini-4k-instruct",
    "phi3-small":"microsoft/Phi-3-small-8k-instruct",
    
    "mistral-instruct-gptq-marlin":"neuralmagic/Mistral-7B-Instruct-v0.3-GPTQ-4bit",
    "llama3-instruct-gptq":"astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit",
    "llama3-instruct-fp8":"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
    "phi3-medium-gptq":"Phi-3-medium-4k-instruct-gptq-4bit",
    
    # Big ones
    "qwen-awq":"Qwen/Qwen1.5-32B-Chat-AWQ",
    # Too SLOW
    "mixtral-q3":"mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-3bit-metaoffload-HQQ",
    "mixtral-q2":"mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-2bit-HQQ",    
}

**IMPORTANT: always set the local download cache directory explicitly**

In [2]:
DOWNLOAD_CACHE_DIR = "/models/huggingface/transformers"

from pathlib import Path
print(f"Download cache dir: {DOWNLOAD_CACHE_DIR} {('OK' if Path(DOWNLOAD_CACHE_DIR).exists() else 'KO')}")

Download cache dir: /models/huggingface/transformers OK


## 1. VLLM inference

### 1.1 Mistral 7B instruct v0.3

In [4]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_name = models["mistral-instruct"]

print(f"Loading model {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = LLM(model_name, kv_cache_dtype="fp8", gpu_memory_utilization=0.99, download_dir=DOWNLOAD_CACHE_DIR)



Loading model mistralai/Mistral-7B-Instruct-v0.3
INFO 06-01 21:46:47 config.py:390] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor
INFO 06-01 21:46:47 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='mistralai/Mistral-7B-Instruct-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir='/models/huggingface/transformers', load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=fp8, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=mistralai/Mistral-7B-Instr



INFO 06-01 21:46:48 selector.py:130] Cannot use FlashAttention-2 backend for FP8 KV cache.
INFO 06-01 21:46:48 selector.py:51] Using XFormers backend.




INFO 06-01 21:46:49 selector.py:130] Cannot use FlashAttention-2 backend for FP8 KV cache.
INFO 06-01 21:46:49 selector.py:51] Using XFormers backend.
INFO 06-01 21:46:49 weight_utils.py:207] Using model weights format ['*.safetensors']
INFO 06-01 21:46:52 model_runner.py:146] Loading model weights took 13.5083 GB
INFO 06-01 21:46:55 gpu_executor.py:83] # GPU blocks: 6082, # CPU blocks: 4096
INFO 06-01 21:46:55 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-01 21:46:55 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-01 21:46:59 model_runner.py:924] Graph capturing finished in 4 secs.


In [5]:
from transformers.utils.hub import cached_file 

tokenizer_model_file = cached_file(model_name, "tokenizer.model.v3", cache_dir=DOWNLOAD_CACHE_DIR)
tokenizer_model_file

'/models/huggingface/transformers/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/83e9aa141f2e28c82232fea5325f54edf17c43de/tokenizer.model.v3'

In [6]:
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

tokenizer = MistralTokenizer.from_file(tokenizer_model_file)

In [7]:
from mistral_common.protocol.instruct.messages import SystemMessage, UserMessage, AssistantMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

completion_request = ChatCompletionRequest(messages=[UserMessage(content="quelles sont les principales ouvertures des échecs ?")])

tokens = tokenizer.encode_chat_completion(completion_request)
tokens.text

'<s>[INST]▁quelles▁sont▁les▁principales▁ouvertures▁des▁échecs▁?[/INST]'

In [8]:
from typing import Optional
from vllm.lora.request import LoRARequest

def vllm_stream_output(
    llm: LLM,
    prompt: str,
    sampling_params: Optional[SamplingParams] = None,
    lora_request: Optional[LoRARequest] = None
    ):

    if sampling_params is None:
        # Use default sampling params.
        sampling_params = SamplingParams()

    llm._validate_and_add_requests(
        inputs=prompt,
        params=sampling_params,
        lora_request=lora_request,
    )

    while llm.llm_engine.has_unfinished_requests():
        step_outputs = llm.llm_engine.step()
        output = step_outputs[0]
        yield output.outputs[0].text
        if output.finished:
            yield output

def vllm_display_output(
    llm: LLM,
    prompt: str,
    sampling_params: Optional[SamplingParams] = None,
    lora_request: Optional[LoRARequest] = None
    ):
          
    last_length = 0
    for result in vllm_stream_output(llm, prompt, sampling_params=sampling_params, lora_request=lora_request):
        if isinstance(result,str):
            print(result[last_length:], end="", flush=True)
            last_length = len(result)
        else:
            print()
            print("--------------------------")
            input_tokens = len(result.prompt_token_ids)
            output_tokens = len(result.outputs[0].token_ids)
            total_time = result.metrics.finished_time-result.metrics.arrival_time
            time_to_first_token = result.metrics.first_token_time-result.metrics.arrival_time
            tokens_per_sec = (output_tokens-1)/(total_time-time_to_first_token)
            print(f"{output_tokens} tokens generated in {total_time:.2f} sec | prompt: {input_tokens:.2f} tokens | time to first token: {time_to_first_token:.2f} sec | throughput: {tokens_per_sec:.2f} tokens/sec")

In [9]:
max_tokens = 1024
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=max_tokens)

vllm_display_output(llm, tokens.text,sampling_params=sampling_params)

 Les principales ouvertures d'échecs, classées par ordre de popularité, sont :

1. Ouverture française (ECO C00) : 1.e4 e6 2.d4 d5
2. Défense sicilienne (ECO B00-B99) : 1.e4 c5
3. Ouverture du roi (ECO A00-A99) : 1.d4
4. Défense indienne (ECO E00-E99) : 1.d4 Cf6
5. Défense gréco (ECO D00-D99) : 1.e4 e5
6. Défense caro-kann (ECO B10-B99) : 1.e4 c6
7. Ouverture scandinave (ECO A10-A99) : 1.e4 d5 2.exd5 Cf6
8. Ouverture espagnole (ECO C80-C99) : 1.e4 e5 2.Cf3 Cc6
9. Défense sicilienne variante Najdorf (ECO B97) : 1.e4 c5 2.d4 cxd4 3.Cc3 a6 4.Cf3 Cf6 5.dxc5
10. Défense sicilienne variante Scheveningen (ECO B84) : 1.e4 c5 2.Nf3 d6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 a6 6.f3.
--------------------------
403 tokens generated in 7.75 sec | prompt: 18.00 tokens | time to first token: 0.27 sec | throughput: 53.72 tokens/sec


In [11]:
requests_results = llm.generate(tokens.text, sampling_params=sampling_params)
result = requests_results[0]
output = result.outputs[0]
print(output.text)
print("-------------------------")
print(f"{len(output.token_ids)} tokens generated in {result.metrics.finished_time-result.metrics.arrival_time:.2f} sec")

Processed prompts: 100%|██████████| 1/1 [00:18<00:00, 18.70s/it, Generation Speed: 54.77 toks/s]

 Les principales ouvertures d'échecs sont les suivantes :

1. Ouverture française (ECO C00-C99) : 1. e4 e6
2. Ouverture anglaise (ECO A00-A99) : 1. c4
3. Défense sicilienne (ECO B00-B99) : 1. e4 c5
4. Défense indienne (ECO E00-E99) : 1. d4
5. Défense sicilienne, variante Najdorf (ECO B97) : 1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Cf6 5. Nc3 a6 6. Rd1
6. Défense sicilienne, variante Scheveningen (ECO B84) : 1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. Nc3 a6 6. Be2
7. Ouverture du roi (ECO D00-D99) : 1. d4 d5
8. Défense russe (ECO E10-E99) : 1. d4 d5 2. c4 c6
9. Défense gréco (ECO D05-D09) : 1. d4 d5 2. c4 c6 3. Cf3
10. Défense caro-kann (ECO B10-B99) : 1. e4 c6 2. d4 d5
11. Défense benoni (ECO A60-A69) : 1. d4 c5 2. d5 e6 3. c4
12. Défense scandinave (ECO B03) : 1. e4 d5 2. exd5 Nf6 3. Nc3
13. Défense danoise (ECO B04) : 1. e4 e5 2. d4 exd4 3. Qxd4 Nc6
14. Défense sicilienne, variante Taimanov (ECO B88) : 1. e4 c5 2. Nf3 d6 3. d4 cxd4 4. Nxd4 Nf6 5. Nc3 a6 6. Bd3
15. Défense sicilienne, 




In [10]:
import os

gradio_server_port = int(os.environ.get('GRADIO_PORT'))
gradio_root_path = os.environ.get('GRADIO_BASE_URL')

print(f"This port used by gradio apps in the container: {gradio_server_port}")
print(f"Will be exposed at the following url on the virtual machine: {gradio_root_path}")

from IPython.display import display, HTML

display(HTML(f'<a href="{gradio_root_path}/" target="_blank">Click here to access the gradio app</a>'))

This port used by gradio apps in the container: 7860
Will be exposed at the following url on the virtual machine: /notebooks/gradio


In [11]:
import gradio as gr

system_prompt = "Tu es MonIA, un assistant expert qui essaie toujours d'être concis, exact, utile et poli."

def apply_chat_template(system_prompt, history, message):
    messages = [SystemMessage(content=system_prompt)]
    for turn in history:
        messages.append(UserMessage(content=turn[0]))
        messages.append(AssistantMessage(content=turn[1]))
    messages.append(UserMessage(content=message))
    completion_request = ChatCompletionRequest(messages=messages)
    tokens = tokenizer.encode_chat_completion(completion_request)
    return tokens.text

def process_message(message, history, system_prompt):
    prompt = apply_chat_template(system_prompt, history, message)
    for result in vllm_stream_output(llm, prompt, sampling_params=sampling_params):
        if isinstance(result,str):
            yield result
        else:
            print()
            print("--------------------------")
            input_tokens = len(result.prompt_token_ids)
            output_tokens = len(result.outputs[0].token_ids)
            total_time = result.metrics.finished_time-result.metrics.arrival_time
            time_to_first_token = result.metrics.first_token_time-result.metrics.arrival_time
            tokens_per_sec = (output_tokens-1)/(total_time-time_to_first_token)
            print(f"{output_tokens} tokens generated in {total_time:.2f} sec | prompt: {input_tokens:.2f} tokens | time to first token: {time_to_first_token:.2f} sec | throughput: {tokens_per_sec:.2f} tokens/sec")


demo = gr.ChatInterface(process_message, title=model_name, additional_inputs=[
        gr.Textbox(system_prompt, label="System Prompt")
    ], additional_inputs_accordion="Configuration").queue()
demo.launch(inline=False, share=False, server_name="0.0.0.0", server_port=gradio_server_port, root_path=gradio_root_path)

Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.





--------------------------
69 tokens generated in 1.58 sec | prompt: 49.00 tokens | time to first token: 0.34 sec | throughput: 54.76 tokens/sec

--------------------------
121 tokens generated in 2.47 sec | prompt: 129.00 tokens | time to first token: 0.31 sec | throughput: 55.61 tokens/sec

--------------------------
195 tokens generated in 3.74 sec | prompt: 267.00 tokens | time to first token: 0.23 sec | throughput: 55.21 tokens/sec

--------------------------
195 tokens generated in 3.95 sec | prompt: 277.00 tokens | time to first token: 0.33 sec | throughput: 53.73 tokens/sec


In [34]:
demo.close()

Closing server running on port: 7860


## 2. Huggingface inference

### 2.1 Mistral 7B instruct v0.3

In [6]:
import time
from threading import Thread
from transformers import TextIteratorStreamer

def hf_display_output(model, tokenizer, prompt, max_new_tokens=1024):
    
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        tokenizer(prompt, return_tensors="pt").to('cuda'),
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.90,
        top_k=50,
        temperature= 0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()
    
    start_time = time.time()
    tokens_count = 0
    outputs = []
    for text in streamer:
        tokens_count += 1
        outputs.append(text)
        if tokens_count%50 == 0:
            current_time = time.time()
            tokens_per_sec = tokens_count/(current_time - start_time)
            print(f" --> {tokens_per_sec:.2f} tokens/sec")
        print(text, end="", flush=True)

    return ''.join(outputs)

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = models["mistral-instruct"]

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR, use_safetensors=True, device_map=0, torch_dtype="auto", attn_implementation="flash_attention_2")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
hf_display_output(model, tokenizer, '<s>[INST]▁quelles▁sont▁les▁principales▁ouvertures▁des▁échecs▁?[/INST]')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Les principales ouvertures d'échecs, classées par ordre de popularité croissante et en commençant par la plus connue, sont :

1. Ouverture du roi  --> 22.77 tokens/sec
(1. e4)
  - Variantes :
    - Ouverture française (1.e4 e6 2.d4 d5)
      - Défense sicilienne (1.e4  --> 24.46 tokens/sec
c5)
        - Variantes telles que la défense Najdorf, la variante Scheveningen, la variante Taimanov, etc.
    - Ouverture italienne (1.e4 e5)
 --> 25.07 tokens/sec
      - Gambit de Dame (1.e4 e5 2.Fc4 Fc5)
        - Gambit de Lopez (3.d4 exd4  --> 24.75 tokens/sec
4.Cf3)
    - Ouverture anglaise (1.e4 c4)
      - Défense scandinave (1.e4 e5 2.Fc4 Ec5)
        - Variantes  --> 24.52 tokens/sec
comme le gambit Evans (3.c3) ou le gambit Albin (3.d4)

2. Ouverture du Damier (1.d4)
  - Variantes :
     --> 24.41 tokens/sec
- Défense indienne (1.d4 Cf6 2.c4 e6)
      - Partie espagnole (1.d4 Cf6 2.c4 e6  --> 24.38 tokens/sec
3.Cc3 d5)
    - Ouverture du Damier flamande (1.d4 d5)
      - Défense Scotch (

"Les principales ouvertures d'échecs, classées par ordre de popularité croissante et en commençant par la plus connue, sont :\n\n1. Ouverture du roi (1. e4)\n  - Variantes :\n    - Ouverture française (1.e4 e6 2.d4 d5)\n      - Défense sicilienne (1.e4 c5)\n        - Variantes telles que la défense Najdorf, la variante Scheveningen, la variante Taimanov, etc.\n    - Ouverture italienne (1.e4 e5)\n      - Gambit de Dame (1.e4 e5 2.Fc4 Fc5)\n        - Gambit de Lopez (3.d4 exd4 4.Cf3)\n    - Ouverture anglaise (1.e4 c4)\n      - Défense scandinave (1.e4 e5 2.Fc4 Ec5)\n        - Variantes comme le gambit Evans (3.c3) ou le gambit Albin (3.d4)\n\n2. Ouverture du Damier (1.d4)\n  - Variantes :\n    - Défense indienne (1.d4 Cf6 2.c4 e6)\n      - Partie espagnole (1.d4 Cf6 2.c4 e6 3.Cc3 d5)\n    - Ouverture du Damier flamande (1.d4 d5)\n      - Défense Scotch (1.d4 d5 2.c4 c6)\n        - Variante Winawer de la partie Espagnole (1.d4 d5 2.c4 c6 3.Cf3 dxc4 4.a4)\n    - Défense tarrasque (1.d4 d

### 2.2 Mixtral 8x7B Instruct v0.1 HQQ

**NOTE: HQQ quantization has the best precision for Mixtral on 24 GB, but is not yet supported in VLLM.**

https://github.com/vllm-project/vllm/issues/2871

https://github.com/mobiusml/hqq

*Installing HQQ ATEN backend*

```bash
cd /workspace/instructlab-local/
source .venv/bin/activate

git clone https://github.com/mobiusml/hqq/

cd hqq/hqq/kernels
python setup_cuda.py install
```

In [3]:
import os
os.environ["LD_LIBRARY_PATH"] = os.getenv("LD_LIBRARY_PATH") + ":/workspace/instructlab-local/.venv/lib/python3.10/site-packages/torch/lib"

from importlib.metadata import version
print(version('hqq-aten'))

import hqq_aten

0.0.0


ImportError: libc10.so: cannot open shared object file: No such file or directory

In [None]:
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer as HQQAutoTokenizer
from hqq.core.quantize import HQQLinear, HQQBackend
from hqq.utils.patching import prepare_for_inference

from transformers import TextIteratorStreamer
from threading import Thread

model_name = models["mixtral-q2"]

HQQLinear.set_backend(HQQBackend.ATEN)

tokenizer = HQQAutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = HQQModelForCausalLM.from_quantized(model_name, cache_dir=DOWNLOAD_CACHE_DIR)

# prepare_for_inference(model, backend="marlin")

In [4]:
model.model.layers[0].self_attn.q_proj.backend

<HQQBackend.PYTORCH_COMPILE: 'forward_pytorch_backprop_compile'>

In [5]:
{"nbits":model.model.layers[0].self_attn.q_proj.meta['nbits'], "group_size":model.model.layers[0].self_attn.q_proj.meta['group_size'], "axis":model.model.layers[0].self_attn.q_proj.meta['axis']}

{'nbits': 4, 'group_size': 64, 'axis': 0}

In [6]:
# Warmup model: !!! IMPORTANT when using Pytorch compile !!!
import torch

model.eval()
dummy_input = tokenizer.encode("This is a dummy input", return_tensors='pt').to(model.device)
with torch.no_grad(): 
    outputs = model(dummy_input, use_cache=False, output_attentions=False, output_hidden_states=False)

**Performance tests**

mixtral-q2
- PYTORCH backend : 1.18 tokens/sec
- PYTORCH_COMPILE backend : 1.41 tokens/sec
- "marlin" backend : 1.30 tokens/sec
- ATEN backend : 3 tokens /sec (from previous test)

prepare_for_inference(model, backend="torchao_int4") => does nothing, all layers skipped (because axis=0)

=> TOO SLOW for local usage, we will test GPTQ versions of smaller models instead

In [8]:
import time

def chat_processor(message, max_new_tokens=1024, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        tokenizer("<s> [INST] " + message + " [/INST] ", return_tensors="pt").to('cuda'),
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature= 0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()
    start_time = time.time()
    tokens_count = 0
    outputs = []
    for text in streamer:
        tokens_count += 1
        outputs.append(text)
        if tokens_count%10 == 0:
            current_time = time.time()
            tokens_per_sec = tokens_count/(current_time - start_time)
            print(f" --> {tokens_per_sec:.2f} tokens/sec")
        print(text, end="", flush=True)

    return outputs

# Generation
outputs = chat_processor("Quelles sont les principales ouvertures aux échecs ?", max_new_tokens=100, do_sample=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


1. Ouverture du  --> 1.27 tokens/sec
pion de roi (e4) :
-  --> 1.31 tokens/sec
La partie espagnole (Cf3,  --> 1.32 tokens/sec
d4) ;
- La partie italienne  --> 1.32 tokens/sec
(Cf3, e4) ;
 --> 1.33 tokens/sec
- Le gambit des deux épées  --> 1.33 tokens/sec
(Cf3, Cf3,  --> 1.33 tokens/sec
d4) ;
- L'attaque  --> 1.33 tokens/sec
viennoise (Cf3, d3).
 --> 1.33 tokens/sec
2. Ouverture du pion  --> 1.33 tokens/sec
de d