**Environment Setup and Library Imports**




In [None]:
!pip install -U -q vllm lmcache transformers accelerate pandas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.9/87.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.2/509.2 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.6/192.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import time
import gc
import subprocess
import torch
import numpy as np
import pandas as pd
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

This block specifies the Hugging Face model repository used for inference and defines the *get_gpu_memory* function, which queries `nvidia-smi` for the GPU memory currently in use (in MB).


In [None]:
# Hugging Face repository ID of the model; passed as `model=` to each vLLM `LLM(...)` built in this notebook.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"


def get_gpu_memory():
    """Return the GPU memory currently in use, as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one value per visible GPU, one per line. The values
    are parsed individually and summed, so a single-GPU machine yields that
    GPU's usage and a multi-GPU machine yields the total across devices
    instead of failing on the embedded newlines.

    Returns:
        int: Used GPU memory summed over all listed GPUs, in the unit
        ``nvidia-smi`` reports for ``memory.used`` (MiB).

    Raises:
        FileNotFoundError: ``nvidia-smi`` is not on PATH.
        subprocess.CalledProcessError: ``nvidia-smi`` exited with a non-zero status.
        ValueError: the output is empty or a value is not an integer.
    """
    output = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"]
    )
    # Blank lines are skipped; int() tolerates surrounding whitespace.
    per_gpu = [int(line) for line in output.decode("utf-8").splitlines() if line.strip()]
    if not per_gpu:
        raise ValueError("nvidia-smi returned no memory.used values")
    return sum(per_gpu)


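Define a document about machine learning fundamentals and repeat it 400 times to build a very long shared prompt prefix. Each of the 20 prompts then appends one of five questions to that prefix, so every request shares the same long context.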

In [None]:
# Tunable sizes: how many copies of the document form the shared prefix, and
# how many prompts make up one benchmark batch.
DOCUMENT_REPEATS = 400   # long context to expose lmcache benefit
NUM_REQUESTS = 20

# Instruction that opens every prompt.
system_instruction = """
You are an AI assistant that answers questions based on the following document.
"""

# One copy of the reference document; repeated DOCUMENT_REPEATS times below.
document = """
Machine learning is a field of artificial intelligence that focuses on building systems that learn from data.
Supervised learning uses labeled datasets to train models.
Unsupervised learning finds patterns without labels.
Reinforcement learning is based on reward-driven agents interacting with environments.
Neural networks are inspired by biological neurons and consist of layers of interconnected nodes.
Deep learning uses many layers to model complex patterns.
"""

# Identical leading text for every prompt: instruction + repeated document.
shared_prefix = system_instruction + document * DOCUMENT_REPEATS

questions = [
    "What is supervised learning?",
    "What is unsupervised learning?",
    "What is reinforcement learning?",
    "What are neural networks?",
    "What is deep learning?"
]

# Cycle through the questions so each prompt is the shared prefix plus one question.
prompts = [
    shared_prefix + "\nQuestion: " + questions[i % len(questions)]
    for i in range(NUM_REQUESTS)
]


# Temperature 0 with answers capped at 50 generated tokens; shared by every benchmark run.
sampling_params = SamplingParams(temperature=0.0, max_tokens=50)


Define a benchmarking function that measures how quickly the model processes the batch of requests.

In [None]:
def run_benchmark(llm, prompts, label, params=None):
    """Time one batched ``llm.generate`` call and report per-batch metrics.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params)``; in this
            notebook a ``vllm.LLM`` instance.
        prompts: Non-empty sequence of prompts submitted as a single batch.
        label: Heading printed above the measurements.
        params: Sampling parameters forwarded to ``llm.generate``. ``None``
            (the default) uses the notebook-level ``sampling_params``.

    Returns:
        dict: ``{"latency": float, "throughput": float}``, unrounded so that
        ratios computed from them stay accurate. ``latency`` is total wall
        time divided by the number of prompts (seconds per request, averaged
        over the batch); ``throughput`` is requests per second.

    Raises:
        ValueError: ``prompts`` is empty.
    """
    if len(prompts) == 0:
        raise ValueError("prompts must contain at least one request")
    if params is None:
        params = sampling_params  # notebook-level default

    # Only synchronize when a CUDA device exists, so the helper also runs with a stub on CPU.
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.synchronize()

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    llm.generate(prompts, params)
    if cuda_ready:
        torch.cuda.synchronize()
    total_time = time.perf_counter() - start

    latency = total_time / len(prompts)
    throughput = len(prompts) / total_time if total_time > 0 else float("inf")

    print(f"\n--- {label} ---")
    print(f"Total time: {total_time:.3f} s")
    print(f"Latency: {latency:.2f} s")
    print(f"Throughput: {throughput:.2f} req/s")

    # Rounding is a display concern; return full precision.
    return {
        "latency": latency,
        "throughput": throughput,
    }


This block runs the model using standard vLLM settings without any external caching mechanisms. This represents a "Cold Run" where the system must re-process the long context for every request in the batch.

In [None]:
print("\n===== BASELINE (vLLM only) =====")
mem_before_vllm = get_gpu_memory()
# Baseline engine: prefix caching off and no KV connector.
# NOTE(review): gpu_memory_utilization is 0.7 here but 0.8 in the LMCache run,
# so the two "reserved" memory figures are not directly comparable — confirm intent.
llm_baseline = LLM(
    model=MODEL_NAME,
    gpu_memory_utilization=0.7,
    enable_prefix_caching=False
)
try:
    # Difference in nvidia-smi usage before vs. after constructing the engine.
    mem_after_vllm = get_gpu_memory()
    vllm_mem_used = mem_after_vllm - mem_before_vllm
    print(f"vLLM reserved: {vllm_mem_used} MB")

    baseline_results = run_benchmark(llm_baseline, prompts, "vLLM Cold Run")
finally:
    # Cleanup runs even if the benchmark raises, so the next engine is not
    # started while this one still holds GPU memory.
    del llm_baseline
    gc.collect()
    torch.cuda.empty_cache()



===== BASELINE (vLLM only) =====
INFO 02-20 16:56:47 [utils.py:261] non-default args: {'enable_prefix_caching': False, 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'model': 'Qwen/Qwen2.5-1.5B-Instruct'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

INFO 02-20 16:57:26 [model.py:541] Resolved architecture: Qwen2ForCausalLM
INFO 02-20 16:57:26 [model.py:1561] Using max model len 32768
INFO 02-20 16:57:27 [scheduler.py:226] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 02-20 16:57:27 [vllm.py:624] Asynchronous scheduling is enabled.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

INFO 02-20 17:03:04 [llm.py:343] Supported tasks: ['generate']
vLLM reserved: 11950 MB


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


--- vLLM Cold Run ---
Total time: 163.025 s
Latency: 8.15 s
Throughput: 0.12 req/s


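This block enables the LMCache connector and vLLM's internal prefix caching. This allows the system to store the computed KV (Key-Value) states of our massive document in memory rather than re-computing them for every request. The batch is run twice: a first pass that fills the cache, and a second pass whose metrics are recorded. Note that this configuration differs from the baseline in two ways at once (prefix caching is turned on and the LMCache connector is attached), so the measured speedup reflects both together.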

In [None]:
# ==========================================
# Run vLLM + LMCache
# ==========================================
print("\n===== vLLM + LMCache =====")
mem_before_lmcache = get_gpu_memory()
# Attach the LMCache connector in the "kv_both" role.
kv_config = KVTransferConfig(
    kv_connector="LMCacheConnectorV1",
    kv_role="kv_both"
)

llm_lmcache = LLM(
    model=MODEL_NAME,
    gpu_memory_utilization=0.8,
    enable_prefix_caching=True,
    kv_transfer_config=kv_config
)
try:
    # Difference in nvidia-smi usage before vs. after constructing the engine.
    mem_after_lmcache = get_gpu_memory()
    lmcache_mem_used = mem_after_lmcache - mem_before_lmcache
    print(f"vLLM + LMCache reserved: {lmcache_mem_used} MB")

    # Cold run to fill cache; its metrics are discarded.
    print("\n>> LMCache Cold Run (Warmup)")
    _ = run_benchmark(llm_lmcache, prompts, "LMCache Cold")

    # Warm run (cached): the same prompts again; these metrics are reported.
    print("\n>> LMCache Warm Run (Cached)")
    lmcache_results = run_benchmark(llm_lmcache, prompts, "LMCache Warm")
finally:
    # Cleanup runs even if a benchmark pass raises, so GPU memory is released.
    del llm_lmcache
    gc.collect()
    torch.cuda.empty_cache()




===== vLLM + LMCache =====
INFO 02-20 17:06:07 [utils.py:261] non-default args: {'enable_prefix_caching': True, 'gpu_memory_utilization': 0.8, 'disable_log_stats': True, 'kv_transfer_config': KVTransferConfig(kv_connector='LMCacheConnectorV1', engine_id='4007737e-49fb-4fc4-8331-8d5a6f742593', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='recompute'), 'model': 'Qwen/Qwen2.5-1.5B-Instruct'}
INFO 02-20 17:06:08 [model.py:541] Resolved architecture: Qwen2ForCausalLM
INFO 02-20 17:06:08 [model.py:1561] Using max model len 32768
INFO 02-20 17:06:08 [scheduler.py:226] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 02-20 17:07:23 [llm.py:343] Supported tasks: ['generate']
vLLM + LMCache reserved: 13416 MB

>> LMCache Cold Run (Warmup)


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


--- LMCache Cold ---
Total time: 14.672 s
Latency: 0.73 s
Throughput: 1.36 req/s

>> LMCache Warm Run (Cached)


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


--- LMCache Warm ---
Total time: 6.160 s
Latency: 0.31 s
Throughput: 3.25 req/s


This block aggregates the performance metrics from both the Baseline (vLLM) and the Optimized (LMCache) runs into a single DataFrame for side-by-side comparison.

In [None]:

# ==========================================
# Final comparison with dataframe
# ==========================================
# One row per configuration: baseline run vs. the warm (second) LMCache run.
data = {
    "Method": ["vLLM Cold", "LMCache Warm"],
    "Latency (s)": [baseline_results["latency"], lmcache_results["latency"]],
    "Throughput (req/s)": [baseline_results["throughput"], lmcache_results["throughput"]],
    "GPU Memory Used (MB)": [vllm_mem_used, lmcache_mem_used]
}

df_results = pd.DataFrame(data)
print("\n===== FINAL RESULTS =====")
# Round at print time only, so the stored frame keeps whatever precision the metrics carry.
print(df_results.round(2))

baseline_throughput = baseline_results["throughput"]
if baseline_throughput > 0:
    speedup = lmcache_results["throughput"] / baseline_throughput
    print(f"\nSPEEDUP: {speedup:.2f}x faster with LMCache")
else:
    # A baseline reporting 0 req/s would otherwise raise ZeroDivisionError.
    speedup = float("inf")
    print("\nSPEEDUP: undefined (baseline throughput is 0 req/s)")


===== FINAL RESULTS =====
         Method  Latency (s)  Throughput (req/s)  GPU Memory Used (MB)
0     vLLM Cold         8.15                0.12                 11950
1  LMCache Warm         0.31                3.25                 13416

SPEEDUP: 27.08x faster with LMCache
