# Inference Optimization: Quantization & Efficient Generation

In [None]:
!pip -q install -U transformers accelerate bitsandbytes


In [None]:
import torch, time
from transformers import AutoTokenizer, AutoModelForCausalLM

# Quantization settings are passed through BitsAndBytesConfig; the bare
# load_in_8bit / load_in_4bit keyword arguments of from_pretrained are deprecated.
from transformers import BitsAndBytesConfig

model_name = "gpt2"
tok = AutoTokenizer.from_pretrained(model_name)
tok.pad_token = tok.eos_token  # reuse EOS as the padding token

prompt = "Explain parameter-efficient fine-tuning in one sentence:"
MAX_NEW_TOKENS = 64


def _sync_device():
    """Wait for queued CUDA work to finish so wall-clock timings are meaningful."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()


def _timed_generate(model, inputs, max_new_tokens=MAX_NEW_TOKENS):
    """Return the latency, in seconds, of one ``generate`` call.

    An identical warm-up call runs first so that one-off costs (CUDA
    initialisation, and graph compilation when ``forward`` is compiled)
    are not counted in the reported number.

    Args:
        model: Causal LM exposing ``generate``.
        inputs: Tokenized prompt already moved to the model's device.
        max_new_tokens: Number of tokens to generate in each pass.

    Returns:
        Elapsed seconds for the timed (second) pass.
    """
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        # Passing the pad id explicitly avoids a warning on every call.
        "pad_token_id": tok.pad_token_id,
    }
    model.generate(**inputs, **gen_kwargs)  # warm-up pass, not timed
    _sync_device()
    start = time.perf_counter()  # monotonic, high-resolution clock
    model.generate(**inputs, **gen_kwargs)
    _sync_device()
    return time.perf_counter() - start


print("Loading 8-bit model…")
model_8 = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
inputs = tok(prompt, return_tensors="pt").to(model_8.device)
t8 = _timed_generate(model_8, inputs)
print("8-bit latency (one pass):", round(t8,3),"s")

print("Loading 4-bit model…")
model_4 = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
inputs = tok(prompt, return_tensors="pt").to(model_4.device)
t4 = _timed_generate(model_4, inputs)
print("4-bit latency (one pass):", round(t4,3),"s")

# Optional: torch.compile for supported backends (PyTorch 2.x)
eager_forward = model_4.forward  # kept so a failed compile can be undone
try:
    # generate() calls the forward of the module it is bound to, so calling
    # .generate on a torch.compile(model_4) wrapper would still execute the
    # eager forward. Compiling forward in place makes generate() use it.
    model_4.forward = torch.compile(eager_forward)
    model_c = model_4  # same object as model_4, now with a compiled forward
    tc = _timed_generate(model_c, inputs)
    print("Compiled latency (one pass):", round(tc,3),"s")
except Exception as e:
    # Best-effort step: put the eager forward back so model_4 stays usable.
    model_4.forward = eager_forward
    print("torch.compile not available or failed:", e)


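> Tips: Keep the KV cache enabled (`use_cache=True`, normally the default in `generate`) so that earlier tokens are not recomputed at every decoding step, and use larger batch sizes and continuous batching to raise throughput. For serving, consider vLLM or TensorRT-LLM.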