In [None]:
!pip install vllm==0.5.5 --default-timeout=100

In [5]:
!pip install git+https://github.com/ozeliger/pyairports.git

Collecting git+https://github.com/ozeliger/pyairports.git
  Cloning https://github.com/ozeliger/pyairports.git to /tmp/pip-req-build-apfi2w69
  Running command git clone --filter=blob:none --quiet https://github.com/ozeliger/pyairports.git /tmp/pip-req-build-apfi2w69
  Resolved https://github.com/ozeliger/pyairports.git to commit f611ee5a5a82b4e98b22641bb99693d862c802e4
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyairports
  Building wheel for pyairports (setup.py) ... [?25l[?25hdone
  Created wheel for pyairports: filename=pyairports-2.1.1-py3-none-any.whl size=371696 sha256=672331867742d36f693f938d5537b75e10bbf3252dfb9531c9db03a32dfdda09
  Stored in directory: /tmp/pip-ephem-wheel-cache-grjkdhs4/wheels/62/2b/97/a9e6762aaa320863b0c2cd3b9e1a65c842df244a714a8b6342
Successfully built pyairports
Installing collected packages: pyairports
  Attempting uninstall: pyairports
    Found existing installation: pyairports 0.0.1
    Uninstalling 

In [1]:
from vllm import LLM, SamplingParams
import time
import torch
import statistics

model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"

llm = LLM(
    model=model_id,
    quantization="awq",
    dtype="auto",
    max_model_len=53200
)
params = SamplingParams(max_tokens=1, temperature=0)
prompt = "What is LLM?"

# --- warmup ---
_ = llm.generate([prompt], params)
torch.cuda.synchronize()

# --- repeated TTFT measurement ---
iters = 20
latencies = []

for _ in range(iters):
    torch.cuda.synchronize()
    start = time.time()

    _ = llm.generate([prompt], params)

    torch.cuda.synchronize()
    end = time.time()

    latencies.append((end - start) * 1000)  # ms

# --- statistics ---
mean = statistics.mean(latencies)
median = statistics.median(latencies)

print(f"TTFT mean   : {mean:.2f} ms")
print(f"TTFT median : {median:.2f} ms")
print(f"Raw first 5 latencies: {[round(x,2) for x in latencies[:5]]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


INFO 12-10 13:03:45 config.py:911] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 12-10 13:03:45 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4', speculative_config=None, tokenizer='hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=53200, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=hugging

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 12-10 13:03:48 model_runner.py:879] Starting to load model hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4...
INFO 12-10 13:03:48 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 12-10 13:03:48 selector.py:116] Using XFormers backend.
INFO 12-10 13:03:49 weight_utils.py:236] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 12-10 13:04:03 model_runner.py:890] Loading model weights took 5.3735 GB
INFO 12-10 13:04:04 gpu_executor.py:121] # GPU blocks: 3325, # CPU blocks: 2048
INFO 12-10 13:04:08 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-10 13:04:08 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 12-10 13:05:04 model_runner.py:1300] Graph capturing finished in 56 secs.


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.08s/it, est. speed input: 0.99 toks/s, output: 0.16 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 16.84it/s, est. speed input: 101.34 toks/s, output: 16.88 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 17.52it/s, est. speed input: 105.67 toks/s, output: 17.60 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 23.27it/s, est. speed input: 140.37 toks/s, output: 23.39 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 24.37it/s, est. speed input: 147.28 toks/s, output: 24.53 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 23.56it/s, est. speed input: 142.35 toks/s, output: 23.71 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 24.79it/s, est. speed input: 149.50 toks/s, output: 24.91 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 21.87it/s, est. speed input: 131.94 toks/s, output: 21.98 toks/s]
Processed prompts: 100%|██████████| 1/1 [00

TTFT mean   : 46.99 ms
TTFT median : 44.54 ms
Raw first 5 latencies: [65.84, 61.93, 46.66, 44.45, 46.64]



