# Venice some4dem workshop

## vLLM inference

### (1) Promts preparation

In [1]:
import os
import time
import json
from tqdm import tqdm
from string import Template

from essential_generators import DocumentGenerator

In [2]:
system_content = "You are an expert in politics."

instructions = """Please classify the following social media profile according
to whether it expresses support or positive attitudes towards  French righ wing parties.
Here is the message: '${text}'"""

def make_promts(text):
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": Template(instructions).substitute(text=text)},
    ]

In [3]:
text1 = """Gaulliste/souveraineté nationale et populaire.
Cofondateur du Mouvement Politique Citoyen.
Rejoignez-nous, Adhérer ⤵️, Aidez-nous à sauver la France !"""

text2 = """Le coeur qui bat à gauche • Avec @egregoire
 pour un @paris_en_grand
 • Anime @coeurgauche75
 •🗣️🇪🇺 au @partisocialiste
 et 🇫🇷au @pes_pse
 • Militant @psparis10"""

In [4]:
make_promts(text1)

[{'role': 'system', 'content': 'You are an expert in politics.'},
 {'role': 'user',
  'content': "Please classify the following social media profile according\nto whether it expresses support or positive attitudes towards  French righ wing parties.\nHere is the message: 'Gaulliste/souveraineté nationale et populaire.\nCofondateur du Mouvement Politique Citoyen.\nRejoignez-nous, Adhérer ⤵️, Aidez-nous à sauver la France !'"}]

### (2) vllM model

In [5]:
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
!nvidia-smi

Mon Jul  7 00:04:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A4500               Off |   00000000:0A:00.0 Off |                  Off |
| 35%   63C    P0             83W /  200W |       1MiB /  20470MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A4500               Off |   00

### (2.1) Load the model

#### First attempt fails because of memory problems

#### Reduce model context length to fit it in memory  

In [6]:
from vllm import LLM

model_id = "meta-llama/Llama-3.1-8B-Instruct"
llm = LLM(
    model=model_id,
    dtype="bfloat16",
    max_model_len=10000
)


INFO 07-07 00:04:51 [__init__.py:244] Automatically detected platform cuda.
INFO 07-07 00:05:01 [config.py:823] This model supports multiple tasks: {'score', 'classify', 'embed', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 07-07 00:05:01 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 07-07 00:05:04 [core.py:455] Waiting for init message from front-end.
INFO 07-07 00:05:04 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-07 00:05:10 [default_loader.py:272] Loading weights took 2.61 seconds
INFO 07-07 00:05:11 [gpu_model_runner.py:1624] Model loading took 14.9889 GiB and 4.070573 seconds
INFO 07-07 00:05:17 [backends.py:462] Using cache directory: /home/jimena.royoletelier/storage/torch_compile_cache/1fd98ba2be/rank_0_0 for vLLM's torch.compile
INFO 07-07 00:05:17 [backends.py:472] Dynamo bytecode transform time: 5.82 s
INFO 07-07 00:05:23 [backends.py:135] Directly load the compiled graph(s) for shape None from the cache, took 5.599 s
INFO 07-07 00:05:24 [monitor.py:34] torch.compile takes 5.82 s in total
INFO 07-07 00:05:27 [gpu_worker.py:227] Available KV cache memory: 1.44 GiB
INFO 07-07 00:05:27 [kv_cache_utils.py:715] GPU KV cache size: 11,744 tokens
INFO 07-07 00:05:27 [kv_cache_utils.py:719] Maximum concurrency for 10,000 tokens per request: 1.17x
INFO 07-07 00:05:54 [gpu_model_runner.py:2048] Graph capturing finished in 26 secs, took 0.51 GiB
INFO 07-07 00:05:54 [core.py:171] init engin

### (2.2) Monitor the gpu card memory usage 

In [7]:
!nvidia-smi

Mon Jul  7 00:06:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A4500               Off |   00000000:0A:00.0 Off |                  Off |
| 37%   67C    P2            105W /  200W |   18859MiB /  20470MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A4500               Off |   00

### (3) Request the model answer

#### (3.1) Creates a sampling parameters object

In [8]:
from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.9, # The model will only consider the results of the tokens with top_p 90% probability mass.
    max_tokens=32 # ~ 0.75 * 32 = 24 words
)

#### (3.1) A single answer

In [9]:
start = time.time()
outputs = llm.chat(messages=make_promts(text1), sampling_params=sampling_params)
end = time.time()

INFO 07-07 00:06:29 [chat_utils.py:420] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|                                                                           | 0/1 [00:0…

In [16]:
answer = outputs[0].outputs[0].text
print(f"Took {end - start} seconds.\n")
print(f"OUTPUT:\n\t{outputs}\n")
print(f"ANSWER:\n\t{answer}")

Took 2.4875199794769287 seconds.

OUTPUT:
	[RequestOutput(request_id=0, prompt=None, prompt_token_ids=[128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1627, 10263, 220, 2366, 19, 271, 2675, 527, 459, 6335, 304, 11759, 13, 128009, 128006, 882, 128007, 271, 5618, 49229, 279, 2768, 3674, 3772, 5643, 4184, 198, 998, 3508, 433, 61120, 1862, 477, 6928, 33726, 7119, 220, 8753, 436, 1108, 20611, 9875, 627, 8586, 374, 279, 1984, 25, 364, 81888, 620, 17194, 2754, 15170, 467, 295, 978, 7140, 1604, 1880, 2477, 74775, 627, 34, 1073, 263, 1045, 324, 3930, 73934, 7986, 16307, 2428, 18002, 2303, 268, 627, 697, 7453, 625, 10333, 5392, 788, 11, 2467, 71, 52424, 2928, 97, 113, 31643, 11, 362, 579, 89, 5392, 788, 3869, 33254, 424, 1208, 9822, 758, 6, 128009, 128006, 78191, 128007, 271], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='Based on the content of the message, I wou

In [17]:
print(f"{end - start:.2f} seconds for one request means {1 * 5.5e6 / (24 * 60 * 60):.1f} days per 5.5 millons of requests.")

2.49 seconds for one request means 63.7 days per 5.5 millons of requests.


#### (3.2) Many answers

In [18]:
from string import Template

nb_texts = 1000
gen = DocumentGenerator()
list_of_texts = [gen.sentence() for _ in range(nb_texts)]
list_of_messages = [make_promts(text) for text in list_of_texts]

In [19]:
print(f"Mean text lenght is {sum([len(t) for t in list_of_texts]) / nb_texts} words.")

Mean text lenght is 59.538 words.


In [20]:
start = time.time()
outputs = llm.chat(messages=list_of_messages, sampling_params=sampling_params)
took  = time.time() - start
per_request_time = took / nb_texts

Adding requests:   0%|          | 0/1000 [00:00<?, ?it/s]

Processed prompts:   0%|                                                                        | 0/1000 [00:0…

In [21]:
print(f"Took {per_request_time:.2f} seconds per request.")

Took 0.02 seconds per request.


In [22]:
print(f"{per_request_time:.2f} seconds per request means {per_request_time * 5.5e6 / (24 * 60 * 60):.1f} days per 5.5 millons of requests.")

0.02 seconds per request means 1.2 days per 5.5 millons of requests.
