# Venice some4dem workshop

## Transformer inference

### (1) Promts preparation

In [1]:
import os
import time
import json
from tqdm import tqdm
from string import Template

from essential_generators import DocumentGenerator

In [2]:
system_content = "You are an expert in politics."

instructions = """Please classify the following social media profile according
to whether it expresses support or positive attitudes towards  French righ wing parties.
Here is the message: '${text}'"""

def make_promts(text):
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": Template(instructions).substitute(text=text)},
    ]

In [3]:
text1 = """Gaulliste/souveraineté nationale et populaire.
Cofondateur du Mouvement Politique Citoyen.
Rejoignez-nous, Adhérer ⤵️, Aidez-nous à sauver la France !"""

text2 = """Le coeur qui bat à gauche • Avec @egregoire
 pour un @paris_en_grand
 • Anime @coeurgauche75
 •🗣️🇪🇺 au @partisocialiste
 et 🇫🇷au @pes_pse
 • Militant @psparis10"""

In [4]:
make_promts(text1)

[{'role': 'system', 'content': 'You are an expert in politics.'},
 {'role': 'user',
  'content': "Please classify the following social media profile according\nto whether it expresses support or positive attitudes towards  French righ wing parties.\nHere is the message: 'Gaulliste/souveraineté nationale et populaire.\nCofondateur du Mouvement Politique Citoyen.\nRejoignez-nous, Adhérer ⤵️, Aidez-nous à sauver la France !'"}]

### (2) Transformer model

### (2.0) Monitor the gpu card memory usage 

In [5]:
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
!nvidia-smi

Mon Jul  7 00:00:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A4500               Off |   00000000:0A:00.0 Off |                  Off |
| 37%   61C    P0             87W /  200W |       1MiB /  20470MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A4500               Off |   00

### (2.1) Load the model

In [6]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_path = "/store/huggingface_cache/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_path,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [7]:
!nvidia-smi

Mon Jul  7 00:01:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A4500               Off |   00000000:0A:00.0 Off |                  Off |
| 36%   63C    P0            101W /  200W |       1MiB /  20470MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A4500               Off |   00

### (3) Request the model answer

#### (3.1) A single answer

In [8]:
start = time.time()
outputs = pipeline(
    make_promts(text1),
    temperature=0.6,
    top_p=0.9, # The model will only consider the results of the tokens with top_p 90% probability mass.
    max_new_tokens=32 # ~ 0.75 * 32 = 24 words
)
end = time.time()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [9]:
print(f"Took {end - start} seconds.")
print(outputs)
answer = outputs[0]["generated_text"][-1]["content"]
print(f"ANSWER:\n\t{answer}")

Took 2.4522247314453125 seconds.
[{'generated_text': [{'role': 'system', 'content': 'You are an expert in politics.'}, {'role': 'user', 'content': "Please classify the following social media profile according\nto whether it expresses support or positive attitudes towards  French righ wing parties.\nHere is the message: 'Gaulliste/souveraineté nationale et populaire.\nCofondateur du Mouvement Politique Citoyen.\nRejoignez-nous, Adhérer ⤵️, Aidez-nous à sauver la France !'"}, {'role': 'assistant', 'content': 'Based on the information provided, I would classify this social media profile as expressing support for the French right-wing parties, specifically those with Gaullist and nationalist ideologies'}]}]
ANSWER:
	Based on the information provided, I would classify this social media profile as expressing support for the French right-wing parties, specifically those with Gaullist and nationalist ideologies


In [10]:
print(f"{end - start:.2f} seconds for one request means {1 * 5.5e6 / (24 * 60 * 60):.1f} days per 5.5 millons of requests.")

2.45 seconds for one request means 63.7 days per 5.5 millons of requests.


#### (3.2) Many answers

In [12]:
from string import Template

nb_texts = 10
gen = DocumentGenerator()
list_of_texts = [gen.sentence() for _ in range(nb_texts)]
list_of_messages = [make_promts(text) for text in list_of_texts]

In [13]:
print(f"Mean text lenght is {sum([len(t) for t in list_of_texts]) / nb_texts} words.")

Mean text lenght is 57.6 words.


In [14]:
start = time.time()
outputs = pipeline(
        list_of_messages,
        temperature=0.6,
        top_p=0.9,
        max_new_tokens=32
    )
took  = time.time() - start
per_request_time = took / nb_texts

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [16]:
print(f"Took {per_request_time:.2f} seconds per request.")

Took 0.97 seconds per request.


In [19]:
print(f"{per_request_time:.2f} seconds per reques means {per_request_time * 5.5e6 / (24 * 60 * 60):.1f} days per 5.5 millons of requests.")

0.97 seconds per reques means 61.9 days per 5.5 millons of requests.
