<a href="https://colab.research.google.com/github/krishnayah/urp-snippets/blob/main/Benchmarking_Pipeline_vs_Manual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing Pipeline vs Manual on classifier

In [1]:
import time
import random
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Generating Prompts

In [2]:

# Words!
adjectives = [
    "ancient", "futuristic", "mysterious", "brilliant", "dark", "colorful",
    "peaceful", "chaotic", "lonely", "vibrant", "silent", "melancholic",
    "red", "blue", "golden", "silver", "emerald", "gentle", "stormy", "radiant"
]

nouns = [
    "city", "forest", "ocean", "planet", "dream", "creature", "robot",
    "painting", "poem", "machine", "castle", "storm", "garden", "ship",
    "dimension", "universe", "song", "memory", "light", "shadow"
]

actions = [
    "describe", "explain", "analyze", "summarize", "imagine", "narrate",
    "predict", "compare", "design", "debate", "reimagine", "illustrate",
    "explore", "evaluate", "invent", "critique", "compose", "argue", "translate", "simulate"
]

# Generate a lot of prompts
num_prompts = 2500
prompts = []
for _ in range(num_prompts):
    adj = random.choice(adjectives)
    noun = random.choice(nouns)
    action = random.choice(actions)

    prompt_type = random.choice([
        f"{action.capitalize()} a {adj} {noun}.",
        f"Write a story about a {adj} {noun}.",
        f"What would happen if a {adj} {noun} could think?",
        f"Give three facts about a {adj} {noun}.",
        f"Explain the significance of a {adj} {noun} in society.",
        f"Imagine discovering a {adj} {noun} — describe it in detail."
    ])

    prompts.append(prompt_type)

# for the classifier
candidate_labels = [
    "Unsafe or harmful content",
    "Refusal or safe alternative"
]


# Expand each premise × label for the manual approach
pairs = [(p, l) for p in prompts for l in candidate_labels]
texts = [p for p, _ in pairs]
labels = [l for _, l in pairs]


# Benchmark on pipeline! much simpler




# Benchmark Manually
Just copied a lot of this code from some base examples

In [None]:
#Load the tokenizer within this cell
model_id = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).to("cuda")


start = time.time()

# Tokenize all pairs at once
inputs = tokenizer(texts, labels, return_tensors="pt", padding=True, truncation=True).to("cuda")

with torch.no_grad():
    logits = model(**inputs).logits

# Convert logits → entailment probabilities
entail_contr_logits = logits[:, [0, 2]]  # contradiction vs entailment
probs = F.softmax(entail_contr_logits, dim=1)[:, 1]
probs = probs.reshape(len(prompts), len(candidate_labels))

elapsed = time.time() - start

print(f"Batch size on manual: {len(prompts)} | Time: {elapsed:.2f}s")

Batch size on manual: 2500 | Time: 4.62s


In [None]:
torch.cuda.empty_cache()

# Benchmark on Pipeline

In [7]:
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0,          # use GPU
    batch_size=2500       # increase?
)

start = time.time()
pipeline_results = classifier(prompts, candidate_labels)
elapsed = time.time() - start

print(f"Batch size on pipeline: {len(prompts)} | Time: {elapsed:.2f}s")

Device set to use cuda:0


Batch size on pipeline: 2500 | Time: 75.73s


# Testing Generative Pipelines

In [5]:
from huggingface_hub import login

# Paste your token here (keep it private!)
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
classifier = pipeline(
    "text-generation",
    model="google/gemma-3-1b-it",
    device=0,          # use GPU
    batch_size=512,       # increase?
    max_new_tokens=150
)

start = time.time()
pipeline_results = classifier(prompts)
elapsed = time.time() - start

print(f"Batch size on pipeline: {len(prompts)} | Time: {elapsed:.2f}s")

# Manual

In [8]:
import torch, time
from transformers import AutoTokenizer, AutoModelForCausalLM

# --------------------------
# Load model & tokenizer
# --------------------------
model_id = "google/gemma-3-1b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)


# --------------------------
# Manual batched generation
# --------------------------
batch_size = 512
max_new_tokens = 150

all_outputs = []

start = time.time()

for i in range(0, len(prompts), batch_size):
    batch = prompts[i : i + batch_size]

    # Tokenize batch
    inputs = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode
    texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    all_outputs.extend(texts)

elapsed = time.time() - start

print(f"Batch size on manual: {batch_size} | Total: {len(prompts)} | Time: {elapsed:.2f}s")

# Print sample output
print("\nSample output:\n", all_outputs[0])


`torch_dtype` is deprecated! Use `dtype` instead!
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Batch size on manual: 512 | Total: 2500 | Time: 46.92s
Avg per batch: 9.61s

Sample output:
 Give three facts about a vibrant robot.

1.  **Polished Chrome Shell:** The robot's exterior is meticulously crafted from polished chrome, giving it a sleek, futuristic appearance.
2.  **Adaptive Lighting:**  It features a network of micro-LEDs that can dynamically adjust its color and intensity, creating a mesmerizing display.
3.  **Sonic Resonance System:**  The robot utilizes a sophisticated sonic resonance system to communicate and interact with its environment, producing a gentle, melodic hum.

---

Would you like me to create a short story based on these facts?


# OBSERVATIONS

Pipelines is arguably slightly slower than using the manual, by about two seconds, but it manually takes care of cache handling and allows me to specify a batch size.

**This will be incredibly important if I use it within a reward function, as the GPU will be finetuning and caching the new model** Manually freeing up CUDA cache may interfere with that.

And, since it allows me to specify a batch size, it means I will always be able to have granular control over how much data is being handled at once (in case fine tuning ends up taking up a lot of RAM)

# Classifier Data:
For 2500 prompts
- About 7 seconds on pipelines
- About 5 seconds on manual

# Generative Data
For 2500 prompts, 150 new tokens
- 75.7s on pipelines
- 46.92s on manual

