In [2]:
# !pip install transformers torch accelerate bitsandbytes

In [3]:
# !pip install transformers --upgrade

Import libraries

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

Load GPT-2 Model

In [29]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [30]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [31]:
# !pip install torchinfo

In [32]:
from torchinfo import summary
summary(model, input_size=(1, 10), dtypes=[torch.long])

Layer (type:depth-idx)                             Output Shape              Param #
GPT2LMHeadModel                                    [1, 12, 10, 64]           --
├─GPT2Model: 1-1                                   [1, 12, 10, 64]           --
│    └─Embedding: 2-1                              [1, 10, 768]              38,597,376
│    └─Embedding: 2-2                              [1, 10, 768]              786,432
│    └─Dropout: 2-3                                [1, 10, 768]              --
│    └─ModuleList: 2-4                             --                        --
│    │    └─GPT2Block: 3-1                         [1, 10, 768]              7,087,872
│    │    └─GPT2Block: 3-2                         [1, 10, 768]              7,087,872
│    │    └─GPT2Block: 3-3                         [1, 10, 768]              7,087,872
│    │    └─GPT2Block: 3-4                         [1, 10, 768]              7,087,872
│    │    └─GPT2Block: 3-5                         [1, 10, 768]           

Check model size

In [33]:
param_count = sum(p.numel() for p in model.parameters())
print(f"Parameters: {param_count:,}")
print(f"Memory usage: ~{param_count * 4 / 1e9:.2f} GB (FP32)")


# Define bit-widths for different precisions
# precisions = {
#     "FP32": 32,
#     "FP16": 16,
#     "INT8": 8,
#     "INT4": 4,
# }

# Print memory usage for each precision
# print("Approximate Memory Usage:")
# for precision, bits in precisions.items():
#     bytes_per_param = bits / 8
#     memory_gb = param_count * bytes_per_param / 1e9
#     print(f"├── {precision}: ~{memory_gb:.2f} GB")

Parameters: 124,439,808
Memory usage: ~0.50 GB (FP32)


Inference timing with the trained GPT-2 model

In [9]:
import time

`model:` preloaded HuggingFace causal language model (e.g., GPT-2)
`tokenizer:` convert text into model inputs
`prompt:` text input to feed the model
`num_runs:` times to measure the inference (default: 10 times for averaging)

In [10]:
def measure_inference_time(model, tokenizer, prompt, num_runs=10):
    model.eval()
    times = [] #init to store individual inference time

    inputs = tokenizer(prompt, return_tensors="pt")

    # Warmup inference without recording time to help eliminating the one-time memory allocation overhead. torch.no_grad for memory efficient inference, max token len set to 50
    with torch.no_grad():
        _ = model.generate(**inputs, max_length=50)

    # record the inference times
    for _ in range(num_runs):
        start = time.time()
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=50)
        end = time.time()
        times.append(end - start)

    return sum(times) / len(times)

avg_time = measure_inference_time(model, tokenizer, "Hello, I am Jahid Hasan, AI researcher and entreprenuer.")
print(f"Average inference time: {avg_time:.3f} seconds")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Average inference time: 2.050 seconds


Compare the precision formats, experiment with quantization

In [20]:
from transformers import BitsAndBytesConfig
import torch

In [21]:
def load_model_with_precision(model_name, precision="fp32"):
    if precision == "fp16":
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
    elif precision == "int8":
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto"
        )
    elif precision == "int4":
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto"
        )
    else:  # fp32
        model = AutoModelForCausalLM.from_pretrained(model_name)

    return model

In [22]:
precisions = ["fp32", "fp16", "int8", "int4"]
results = {}

In [23]:
for precision in precisions:
    print(f"\nTesting {precision.upper()}...")
    try:
        model = load_model_with_precision("microsoft/DialoGPT-small", precision)

        # Measure memory and speed
        memory_mb = torch.cuda.memory_allocated() / 1024**2 if torch.cuda.is_available() else 0
        inference_time = measure_inference_time(model, tokenizer, "Hello")

        results[precision] = {
            'memory_mb': memory_mb,
            'inference_time': inference_time
        }

        del model
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    except Exception as e:
        print(f"Error with {precision}: {str(e)}")


Testing FP32...


config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Testing FP16...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Testing INT8...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Testing INT4...
Error with int4: quant_type must be nf4 on CPU, got fp4


In [34]:
import pandas as pd
df = pd.DataFrame(results).T
print("\nQuantization Comparison:")
print(df)


Quantization Comparison:
      memory_mb  inference_time
fp32        0.0        0.219912
fp16        0.0        1.879117
int8        0.0        8.840728


Post-Training Quantization (PTQ)

In [36]:
# Using GPTQ for better 4-bit quantization
# !pip install auto-gptq

In [37]:
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)


Static Batch inference

In [38]:
def static_batch_inference(model, tokenizer, prompts, batch_size=4):
    """Process prompts in fixed-size batches"""
    results = []

    # Pad tokenizer for batching
    tokenizer.pad_token = tokenizer.eos_token

    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]

        # Tokenize batch
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode results
        batch_results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        results.extend(batch_results)

    return results

In [39]:
prompts = [
    "The future of AI is",
    "Machine learning helps us",
    "Deep learning models can",
    "Neural networks are",
    "Artificial intelligence will"
]

batch_results = static_batch_inference(model, tokenizer, prompts, batch_size=2)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Dynamic batch inference

In [40]:
import asyncio
import random
from queue import Queue
from threading import Thread
import time

In [41]:
class DynamicBatcher:
    def __init__(self, model, tokenizer, max_batch_size=4, max_wait_time=0.1):
        self.model = model
        self.tokenizer = tokenizer
        self.max_batch_size = max_batch_size
        self.max_wait_time = max_wait_time
        self.request_queue = Queue()
        self.running = True

        # Start background processing
        self.processor_thread = Thread(target=self._process_batches)
        self.processor_thread.start()
    def _process_batches(self):
        while self.running:
            batch = []
            start_time = time.time()

            # Collect requests for batch
            while (len(batch) < self.max_batch_size and
                   time.time() - start_time < self.max_wait_time):

                if not self.request_queue.empty():
                    request = self.request_queue.get()
                    batch.append(request)
                else:
                    time.sleep(0.001)  # Small sleep to prevent busy waiting

            if batch:
                self._process_batch(batch)

    def _process_batch(self, batch):
        prompts = [req['prompt'] for req in batch]

        # Tokenize
        inputs = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True
        )

        # Generate
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=20)

        processing_time = time.time() - start_time

        # Return results
        results = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for i, req in enumerate(batch):
            req['result'] = results[i]
            req['processing_time'] = processing_time
            req['batch_size'] = len(batch)

    def inference(self, prompt):
        request = {'prompt': prompt, 'result': None, 'processing_time': None}
        self.request_queue.put(request)

        # Wait for result
        while request['result'] is None:
            time.sleep(0.001)

        return request

    def stop(self):
        self.running = False
        self.processor_thread.join()

In [42]:
# Test dynamic batching
batcher = DynamicBatcher(model, tokenizer)

In [43]:
# Simulate concurrent requests
results = []
for i in range(10):
    result = batcher.inference(f"Request {i}: The answer is")
    results.append(result)
    time.sleep(random.uniform(0.01, 0.05))  # Random arrival times

batcher.stop()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [45]:
import numpy as np

In [46]:
batch_sizes = [r['batch_size'] for r in results]
print(f"Average batch size: {sum(batch_sizes) / len(batch_sizes):.2f}")
print(f"Batch size distribution: {dict(zip(*np.unique(batch_sizes, return_counts=True)))}")

Average batch size: 1.00
Batch size distribution: {np.int64(1): np.int64(10)}


Key-Value Cache Management for continuous batching and quantization for advance optimization

In [47]:
class KVCacheManager:
    """Manages KV cache for efficient generation"""

    def __init__(self):
        self.cache = {}

    def get_cache_size(self, past_key_values):
        """Calculate memory usage of KV cache"""
        if past_key_values is None:
            return 0

        total_size = 0
        for layer_cache in past_key_values:
            for tensor in layer_cache:
                total_size += tensor.numel() * tensor.element_size()

        return total_size / (1024**2)  # MB

    def efficient_generate(self, model, tokenizer, prompt, max_new_tokens=50):
        """Generate with KV cache tracking"""
        inputs = tokenizer(prompt, return_tensors="pt")

        generated_tokens = []
        past_key_values = None
        current_input = inputs['input_ids']

        for step in range(max_new_tokens):
            # Forward pass
            outputs = model(
                input_ids=current_input,
                past_key_values=past_key_values,
                use_cache=True
            )

            # Get next token
            next_token_logits = outputs.logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

            generated_tokens.append(next_token.item())

            # Update for next iteration
            past_key_values = outputs.past_key_values
            current_input = next_token

            # Track cache size
            cache_size = self.get_cache_size(past_key_values)

            if step % 10 == 0:
                print(f"Step {step}: Cache size = {cache_size:.2f} MB")

        # Decode result
        full_sequence = torch.cat([inputs['input_ids'], torch.tensor([generated_tokens])], dim=1)
        return tokenizer.decode(full_sequence[0], skip_special_tokens=True)

Efficiency test KV

In [48]:
cache_manager = KVCacheManager()
result = cache_manager.efficient_generate(model, tokenizer, "The benefits of caching are")
print(result)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. You should pass an instance of `Cache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.


Step 0: Cache size = 0.35 MB
Step 10: Cache size = 1.05 MB
Step 20: Cache size = 1.76 MB
Step 30: Cache size = 2.46 MB
Step 40: Cache size = 3.16 MB
The benefits of caching are that you can easily see the data in the cache and you can easily see the changes in the cache.

The caching is done by using a cache object that is a collection of objects that are stored in the cache. The objects are stored in


FastAPI Experiment

In [50]:
# !pip install fastapi uvicorn nest-asyncio pyngrok transformers accelerate torch

In [93]:
!ngrok config add-authtoken API_KEY_HERE

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [94]:
from pyngrok import ngrok

# Set up FastAPI server
public_url = ngrok.connect(8000)
print(f"Public URL: {public_url}")

Public URL: NgrokTunnel: "https://e211-34-86-98-232.ngrok-free.app" -> "http://localhost:8000"


In [95]:
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import nest_asyncio
import uvicorn
import time

# Patch asyncio event loop for Colab
nest_asyncio.apply()

# Inference input model
class InferenceRequest(BaseModel):
    prompt: str
    max_tokens: int = 50
    temperature: float = 0.7
    top_p: float = 0.9

# Inference output model
class InferenceResponse(BaseModel):
    generated_text: str
    tokens_generated: int
    inference_time: float

# Initialize FastAPI
app = FastAPI()

# Load model and tokenizer (once)
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.eval()

@app.get("/")
def read_root():
    return {"message": "Inference server is running!"}

@app.post("/generate", response_model=InferenceResponse)
def generate(request: InferenceRequest):
    try:
        start = time.time()
        inputs = tokenizer(request.prompt, return_tensors="pt")
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                top_p=request.top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        end = time.time()
        return InferenceResponse(
            generated_text=generated_text,
            tokens_generated=len(output[0]) - len(inputs["input_ids"][0]),
            inference_time=end - start
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


In [96]:
from typing import List

class BatchInferenceRequest(BaseModel):
    prompts: List[str]
    max_tokens: int = 50
    temperature: float = 0.7

@app.post("/batch_generate")
def batch_generate(request: BatchInferenceRequest):
    try:
        start_time = time.time()

        inputs = tokenizer(
            request.prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_tokens,
                temperature=request.temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        results = []
        for i, output in enumerate(outputs):
            text = tokenizer.decode(output, skip_special_tokens=True)
            results.append({
                "prompt_index": i,
                "generated_text": text
            })

        inference_time = time.time() - start_time

        return {
            "results": results,
            "batch_size": len(request.prompts),
            "total_inference_time": inference_time,
            "average_time_per_prompt": inference_time / len(request.prompts)
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


In [97]:
uvicorn.run(app, host="0.0.0.0", port=8000)

INFO:     Started server process [1330]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     34.69.214.222:0 - "POST /generate HTTP/1.1" 200 OK


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


INFO:     34.69.214.222:0 - "POST /batch_generate HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1330]
