In [None]:
import os
from dotenv import load_dotenv

# Populate the process environment via python-dotenv, then look up the
# Hugging Face access token; the lookup yields None when the variable is unset.
load_dotenv()
hf_token = os.environ.get('HUGGINGFACE_TOKEN')

# Authenticate this session against the Hugging Face Hub.
# NOTE(review): the import is deliberately left below load_dotenv();
# huggingface_hub may read environment variables at import time, so confirm
# before hoisting it to the top of the cell.
from huggingface_hub import login
login(token=hf_token)

In [1]:
import time
from dataclasses import dataclass
from typing import Optional

import numpy as np
import psutil
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint identifier handed to `from_pretrained` for both model and tokenizer.
model_name = "meta-llama/Llama-3.2-1B"
# NOTE(review): not referenced by any cell visible in this notebook —
# presumably intended to toggle CPU decoding; confirm or remove.
decode_on_cpu = True
# Upper bound on decode-loop iterations; each iteration appends one token.
max_new_tokens = 50
# Prompt that is tokenized and fed to the prefill forward pass.
input_text = "This is a test prompt. The model should continue this text with a meaningful completion."




#################################################################
# Helper Classes and Methods
#################################################################

In [2]:
@dataclass
class PhaseMetrics:
    """Timing record for one named inference phase (e.g. 'prefill', 'decode').

    Attributes:
        phase: Name of the phase this record belongs to.
        start_time: `time.perf_counter()` reading taken when the phase began.
        end_time: `time.perf_counter()` reading taken when the phase ended, or
            None while the phase is still open.
        tokens_processed: Number of tokens attributed to the phase; stays 0
            until the phase is closed.
    """
    phase: str
    start_time: float
    # A freshly started phase has no end time yet, hence Optional with a default.
    end_time: Optional[float] = None
    tokens_processed: int = 0

@dataclass
class TokenMetrics:
    """One per-token sample: which token, how long it took, and process memory."""
    # Index of the token as supplied by the caller of `sample_token`.
    token_index: int
    # Latency value supplied by the caller (the notebook passes a
    # `time.perf_counter()` difference, i.e. seconds).
    latency: float
    # Resident set size of the current process in MiB (rss / 1024**2).
    cpu_memory_mb: float

class EnhancedMetricsTracker:
    """Collects phase-level timings and per-token latency/memory samples.

    Attributes:
        phases: Maps a phase name to its `PhaseMetrics` record.
        token_metrics: `TokenMetrics` samples in the order they were recorded.
        start_time: `time.perf_counter()` reading taken at construction.
    """

    def __init__(self):
        self.phases = {}
        self.token_metrics = []
        self.start_time = time.perf_counter()
        # One handle for the current process, reused by every sample so the
        # decode loop does not build a new psutil.Process() per token.
        self._process = psutil.Process()

    def start_phase(self, phase: str):
        """Open (or restart) the phase called `phase`, stamping its start time."""
        self.phases[phase] = PhaseMetrics(phase, time.perf_counter(), None, 0)

    def end_phase(self, phase: str, tokens_processed: int):
        """Close `phase`, recording its end time and token count.

        Raises:
            KeyError: if `start_phase` was never called for `phase`.
        """
        record = self.phases[phase]
        record.end_time = time.perf_counter()
        record.tokens_processed = tokens_processed

    def sample_token(self, token_index: int, latency: float):
        """Append one per-token sample with the process's current RSS in MiB."""
        rss_mb = self._process.memory_info().rss / (1024**2)
        self.token_metrics.append(TokenMetrics(
            token_index=token_index,
            latency=latency,
            cpu_memory_mb=rss_mb
        ))

    def get_summary(self):
        """Return a dict with one entry per phase plus a 'token_stats' entry.

        Each phase entry holds 'duration_sec', 'tokens_processed' and
        'tokens_per_sec'. A phase that was never closed reports a duration
        and throughput of zero. 'token_stats' is an empty dict when no token
        samples were recorded.
        """
        summary = {}
        for phase, pm in self.phases.items():
            # Compare against None explicitly: an end time of exactly 0.0 is a
            # valid reading and must not be mistaken for "phase still open".
            if pm.end_time is not None:
                duration = pm.end_time - pm.start_time
            else:
                duration = 0.0
            summary[phase] = {
                'duration_sec': duration,
                'tokens_processed': pm.tokens_processed,
                'tokens_per_sec': pm.tokens_processed / duration if duration > 0 else 0.0
            }

        # Overall token stats
        if self.token_metrics:
            latencies = [t.latency for t in self.token_metrics]
            mem_usages = [t.cpu_memory_mb for t in self.token_metrics]
            summary['token_stats'] = {
                # Plain floats keep the printed summary free of NumPy scalar reprs.
                'mean_latency_sec': float(np.mean(latencies)),
                'p90_latency_sec': float(np.percentile(latencies, 90)),
                'peak_cpu_memory_mb': max(mem_usages),
                'final_cpu_memory_mb': mem_usages[-1]
            }
        else:
            summary['token_stats'] = {}

        return summary

#################################################################
# Model Setup
#################################################################

In [4]:
# Load model and tokenizer on CPU directly.
# `device_map='cpu'` keeps the model and its weights in CPU memory.
# The weights are requested as float32 up front: casting to float16 first and
# upcasting afterwards would add a second conversion pass and could clip
# values that fall outside the float16 range.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map='cpu',
    low_cpu_mem_usage=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#################################################################
# Inference on CPU
#################################################################


In [5]:
# Tokenize on CPU
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

metrics_tracker = EnhancedMetricsTracker()

# Prefill phase: a single forward pass over the whole prompt. With
# `use_cache=True` the returned outputs carry the key/value cache that the
# decode loop below reuses.
metrics_tracker.start_phase('prefill')

# Raw timestamps are kept alongside the tracker's own phase timing.
prefill_start = time.perf_counter()
with torch.inference_mode():
    outputs = model(**inputs, use_cache=True)
prefill_end = time.perf_counter()

metrics_tracker.end_phase('prefill', tokens_processed=inputs["input_ids"].shape[1])

# Decode phase: greedy decoding, one token per step (batch size 1 is assumed
# by the (1, 1) mask extension and the `.item()` call below).
metrics_tracker.start_phase('decode')

input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
# Explicit counter so the token count is defined even when max_new_tokens is 0.
tokens_generated = 0

with torch.inference_mode():
    for i in range(max_new_tokens):
        token_start = time.perf_counter()
        # Greedy pick from the logits of the last position of the previous pass.
        next_token_logits = outputs.logits[:, -1, :]
        next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        # The full sequence is tracked for final decoding; the attention mask
        # must span cached positions plus the new token.
        input_ids = torch.cat([input_ids, next_token], dim=-1)
        attention_mask = torch.cat(
            [attention_mask, torch.ones((1, 1), dtype=attention_mask.dtype)],
            dim=-1,
        )

        # Feed only the new token together with the cached keys/values, so each
        # step costs one position instead of re-encoding the whole sequence.
        outputs = model(
            input_ids=next_token,
            attention_mask=attention_mask,
            past_key_values=outputs.past_key_values,
            use_cache=True,
        )

        token_end = time.perf_counter()
        token_latency = token_end - token_start
        metrics_tracker.sample_token(token_index=i, latency=token_latency)
        tokens_generated += 1

        # The EOS check runs after the forward pass so that every recorded
        # latency covers one complete decode step.
        if next_token.item() == tokenizer.eos_token_id:
            break

metrics_tracker.end_phase('decode', tokens_processed=tokens_generated)

generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

#################################################################
# Results
#################################################################

In [6]:

# Build the aggregate metrics first, then report the text followed by the numbers.
summary = metrics_tracker.get_summary()

print("Generated text:", generated_text)
print("Metrics Summary:", summary)

Generated text: This is a test prompt. The model should continue this text with a meaningful completion.
Metrics Summary: {'prefill': {'duration_sec': 0.6922823674976826, 'tokens_processed': 18, 'tokens_per_sec': 26.00095112210157}, 'decode': {'duration_sec': 0.6258094608783722, 'tokens_processed': 1, 'tokens_per_sec': 1.5979304604893994}, 'token_stats': {'mean_latency_sec': 0.6247733123600483, 'p90_latency_sec': 0.6247733123600483, 'peak_cpu_memory_mb': 5515.12890625, 'final_cpu_memory_mb': 5515.12890625}}
