In [None]:
!pip install -U vllm lmcache transformers accelerate pandas


Collecting transformers
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)


In [None]:
import os
import time
import gc
import subprocess
import torch
import numpy as np
import pandas as pd
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

In [None]:
# Hugging Face model identifier; passed as `model=` to each LLM(...) engine in this notebook.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"


def get_gpu_memory(gpu_index=0):
    """Return the memory currently in use on one GPU, as reported by nvidia-smi.

    nvidia-smi prints one line per GPU for `--query-gpu`, so the output is
    parsed line by line; converting the whole output with a single `int()`
    fails on multi-GPU machines.

    Args:
        gpu_index: Position of the GPU in nvidia-smi's listing (default 0,
            which matches the single-GPU behaviour of this notebook).

    Returns:
        int: Used memory in the unit nvidia-smi reports for `memory.used`
        with `nounits` (MiB).

    Raises:
        ValueError: If nvidia-smi returned no readings.
        IndexError: If `gpu_index` is outside the range of listed GPUs.
        subprocess.CalledProcessError / FileNotFoundError: If nvidia-smi
            fails or is not installed.
    """
    raw_output = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"]
    )
    readings = [
        int(line) for line in raw_output.decode("utf-8").splitlines() if line.strip()
    ]
    if not readings:
        raise ValueError("nvidia-smi returned no GPU memory readings")
    return readings[gpu_index]


In [None]:
# One instruction header followed by the same six-sentence "document" repeated
# 400 times: a deliberately long shared context so that cache reuse across
# requests has a visible effect.
shared_prefix = (
    "\nYou are an AI assistant that answers questions based on the following document.\n"
    + 400 * (
        "\n"
        "Machine learning is a field of artificial intelligence that focuses on building systems that learn from data.\n"
        "Supervised learning uses labeled datasets to train models.\n"
        "Unsupervised learning finds patterns without labels.\n"
        "Reinforcement learning is based on reward-driven agents interacting with environments.\n"
        "Neural networks are inspired by biological neurons and consist of layers of interconnected nodes.\n"
        "Deep learning uses many layers to model complex patterns.\n"
    )
)

# Questions appended after the shared context; cycled to fill NUM_REQUESTS.
questions = [
    "What is supervised learning?",
    "What is unsupervised learning?",
    "What is reinforcement learning?",
    "What are neural networks?",
    "What is deep learning?",
]

NUM_REQUESTS = 20

# Every prompt starts with the identical prefix and differs only in its final question.
prompts = [
    f"{shared_prefix}\nQuestion: {questions[request_idx % len(questions)]}"
    for request_idx in range(NUM_REQUESTS)
]


# Sampling settings shared by every benchmark run: temperature 0.0 and at most
# 50 generated tokens per request.
sampling_params = SamplingParams(max_tokens=50, temperature=0.0)


In [None]:
def run_benchmark(llm, prompts, label, params=None):
    """Time one batched `llm.generate` call and report per-request figures.

    Args:
        llm: Engine object exposing `generate(prompts, sampling_params)`.
        prompts: Non-empty sequence of prompts submitted as a single batch.
        label: Name printed in the report header.
        params: Sampling parameters for the call. Defaults to the
            notebook-level `sampling_params`, which is what the function
            used implicitly before this argument existed.

    Returns:
        dict with
            "latency": wall time divided by the number of prompts, rounded to
                2 decimals (a batch average, not a per-request latency),
            "throughput": prompts per second, rounded to 2 decimals,
            "total_time": unrounded wall time of the batch in seconds.

    Raises:
        ValueError: If `prompts` is empty (the averages would be undefined).
    """
    if params is None:
        params = sampling_params

    num_prompts = len(prompts)
    if num_prompts == 0:
        raise ValueError("run_benchmark needs at least one prompt")

    # Only synchronize when CUDA is usable; the call raises on CPU-only hosts.
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.synchronize()

    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the system clock is adjusted while the benchmark is running.
    start_time = time.perf_counter()
    llm.generate(prompts, params)
    if cuda_ready:
        torch.cuda.synchronize()
    total_time = time.perf_counter() - start_time

    latency = total_time / num_prompts
    throughput = num_prompts / total_time if total_time > 0 else float("inf")

    print(f"\n--- {label} ---")
    print(f"Total time: {total_time:.3f} s")
    print(f"Latency: {latency:.2f} s")
    print(f"Throughput: {throughput:.2f} req/s")

    return {
        "latency": round(latency, 2),
        "throughput": round(throughput, 2),
        # Unrounded, so downstream ratios do not inherit 2-decimal rounding error.
        "total_time": total_time,
    }


In [None]:
print("\n===== BASELINE (vLLM only) =====")

# Read GPU usage on both sides of engine construction; the difference is
# reported as what the engine reserved.
mem_before_vllm = get_gpu_memory()
llm_baseline = LLM(model=MODEL_NAME, enable_prefix_caching=False, gpu_memory_utilization=0.7)
mem_after_vllm = get_gpu_memory()
vllm_mem_used = mem_after_vllm - mem_before_vllm
print(f"vLLM reserved: {vllm_mem_used} MB")

# Single timed pass over all prompts with prefix caching disabled.
baseline_results = run_benchmark(llm_baseline, prompts, label="vLLM Cold Run")

# Drop the engine reference and ask Python and PyTorch to release what they
# hold before the next engine is created.
del llm_baseline
gc.collect()
torch.cuda.empty_cache()



===== BASELINE (vLLM only) =====
INFO 02-05 14:16:53 [utils.py:261] non-default args: {'enable_prefix_caching': False, 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'model': 'Qwen/Qwen2.5-1.5B-Instruct'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

INFO 02-05 14:17:16 [model.py:541] Resolved architecture: Qwen2ForCausalLM
INFO 02-05 14:17:16 [model.py:1561] Using max model len 32768
INFO 02-05 14:17:16 [scheduler.py:226] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 02-05 14:17:16 [vllm.py:624] Asynchronous scheduling is enabled.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

INFO 02-05 14:22:05 [llm.py:343] Supported tasks: ['generate']
vLLM reserved: 12100 MB


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


--- vLLM Cold Run ---
Total time: 181.152 s
Latency: 9.06 s
Throughput: 0.11 req/s


In [None]:
# ==========================================
# Run vLLM + LMCache
# ==========================================
print("\n===== vLLM + LMCache =====")
mem_before_lmcache = get_gpu_memory()
kv_config = KVTransferConfig(
    kv_connector="LMCacheConnectorV1",
    kv_role="kv_both",
)

llm_lmcache = LLM(
    model=MODEL_NAME,
    # Same GPU memory budget as the baseline engine (0.7). With a larger
    # budget here, the "GPU Memory Used" comparison mostly measures the
    # configured reservation instead of any overhead added by LMCache.
    gpu_memory_utilization=0.7,
    # NOTE(review): the baseline engine is built with
    # enable_prefix_caching=False, so timings from this engine combine
    # vLLM's own prefix cache with LMCache. Confirm this is the intended
    # comparison before attributing the whole gain to LMCache.
    enable_prefix_caching=True,
    kv_transfer_config=kv_config,
)
mem_after_lmcache = get_gpu_memory()
lmcache_mem_used = mem_after_lmcache - mem_before_lmcache
print(f"vLLM + LMCache reserved: {lmcache_mem_used} MB")

# Cold run to fill cache. The result is kept under its own name instead of
# `_`, which IPython uses for the last cell output.
print("\n>> LMCache Cold Run (Warmup)")
lmcache_cold_results = run_benchmark(llm_lmcache, prompts, "LMCache Cold")

# Warm run (cached)
print("\n>> LMCache Warm Run (Cached)")
lmcache_results = run_benchmark(llm_lmcache, prompts, "LMCache Warm")

# Cleanup
del llm_lmcache
gc.collect()
torch.cuda.empty_cache()




===== vLLM + LMCache =====
INFO 02-05 14:25:32 [utils.py:261] non-default args: {'enable_prefix_caching': True, 'gpu_memory_utilization': 0.8, 'disable_log_stats': True, 'kv_transfer_config': KVTransferConfig(kv_connector='LMCacheConnectorV1', engine_id='eee3ceae-bcfa-4122-a59f-a81970ff6424', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='recompute'), 'model': 'Qwen/Qwen2.5-1.5B-Instruct'}
INFO 02-05 14:25:33 [model.py:541] Resolved architecture: Qwen2ForCausalLM
INFO 02-05 14:25:33 [model.py:1561] Using max model len 32768
INFO 02-05 14:25:33 [scheduler.py:226] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 02-05 14:26:44 [llm.py:343] Supported tasks: ['generate']
vLLM + LMCache reserved: 13622 MB

>> LMCache Cold Run (Warmup)


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


--- LMCache Cold ---
Total time: 13.187 s
Latency: 0.66 s
Throughput: 1.52 req/s

>> LMCache Warm Run (Cached)


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


--- LMCache Warm ---
Total time: 6.164 s
Latency: 0.31 s
Throughput: 3.24 req/s


In [None]:

# ==========================================
# Final comparison with dataframe
# ==========================================
data = {
    "Method": ["vLLM Cold", "LMCache Warm"],
    "Latency (s)": [baseline_results["latency"], lmcache_results["latency"]],
    "Throughput (req/s)": [baseline_results["throughput"], lmcache_results["throughput"]],
    "GPU Memory Used (MB)": [vllm_mem_used, lmcache_mem_used]
}

df_results = pd.DataFrame(data)
print("\n===== FINAL RESULTS =====")
print(df_results)

# Both runs process the same `prompts`, so the speedup is the ratio of their
# wall times. Prefer an unrounded "total_time" entry when the result dicts
# carry one; the 2-decimal throughput values lose precision (0.11 req/s has
# only two significant digits) and can round to 0.00 for a very slow run.
if "total_time" in baseline_results and "total_time" in lmcache_results:
    numerator = baseline_results["total_time"]
    denominator = lmcache_results["total_time"]
else:
    numerator = lmcache_results["throughput"]
    denominator = baseline_results["throughput"]

# A zero denominator means the ratio is unbounded at the available precision.
speedup = numerator / denominator if denominator else float("inf")

# State exactly what is compared: the rows above are a cold baseline run and
# a warm LMCache run, so the gain is not attributable to LMCache alone.
print(f"\nSPEEDUP: {speedup:.2f}x (LMCache warm run vs. vLLM cold run)")


===== FINAL RESULTS =====
         Method  Latency (s)  Throughput (req/s)  GPU Memory Used (MB)
0     vLLM Cold         9.06                0.11                 12100
1  LMCache Warm         0.31                3.24                 13622

SPEEDUP: 29.45x faster with LMCache
