<a href="https://colab.research.google.com/github/krishnayah/urp-snippets/blob/main/BATCH_PROCESS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face Hub checkpoint used for the batched-generation experiment.
model_id = "HuggingFaceTB/SmolLM-1.7B"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights in bfloat16 and leave device placement to the library
# (device_map="auto").
_load_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_id, **_load_kwargs)



In [None]:
import random

# Word pools that the prompt templates draw from.
adjectives = [
    "ancient", "futuristic", "mysterious", "brilliant", "dark", "colorful",
    "peaceful", "chaotic", "lonely", "vibrant", "silent", "melancholic",
    "red", "blue", "golden", "silver", "emerald", "gentle", "stormy", "radiant"
]

nouns = [
    "city", "forest", "ocean", "planet", "dream", "creature", "robot",
    "painting", "poem", "machine", "castle", "storm", "garden", "ship",
    "dimension", "universe", "song", "memory", "light", "shadow"
]

actions = [
    "describe", "explain", "analyze", "summarize", "imagine", "narrate",
    "predict", "compare", "design", "debate", "reimagine", "illustrate",
    "explore", "evaluate", "invent", "critique", "compose", "argue", "translate", "simulate"
]

# Number of synthetic prompts generated for the batch.
NUM_PROMPTS = 500


def indefinite_article(word):
    """Return "an" if *word* starts with a vowel letter, otherwise "a".

    A first-letter check is sufficient for the adjective pool above (it fixes
    "a ancient" / "a emerald"); it is not a general English rule.
    """
    return "an" if word[:1].lower() in ("a", "e", "i", "o", "u") else "a"


def make_prompt():
    """Build one random prompt from the word pools.

    Draws an adjective, a noun and an action, then picks one of six sentence
    templates at random. Only the first template uses the action.
    """
    adj = random.choice(adjectives)
    noun = random.choice(nouns)
    action = random.choice(actions)

    # Shared noun phrase with the grammatically correct article.
    subject = f"{indefinite_article(adj)} {adj} {noun}"

    return random.choice([
        f"{action.capitalize()} {subject}.",
        f"Write a story about {subject}.",
        f"What would happen if {subject} could think?",
        f"Give three facts about {subject}.",
        f"Explain the significance of {subject} in society.",
        f"Imagine discovering {subject} — describe it in detail.",
    ])


prompts = [make_prompt() for _ in range(NUM_PROMPTS)]



# Decoder-only models continue from the last position of every row, so a
# padded batch has to be left-padded; otherwise the shorter prompts would be
# continued from pad tokens.
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

# Move the batch to the device the model was actually placed on rather than
# hard-coding "cuda" (the model was loaded with device_map="auto").
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

# CUDA work is queued asynchronously, so synchronize on both sides of the
# timed region to measure the generation itself.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.perf_counter()
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=tokenizer.pad_token_id,
    )
if torch.cuda.is_available():
    torch.cuda.synchronize()
elapsed = time.perf_counter() - start

print(f"Batch size: {len(prompts)} | Time: {elapsed:.2f}s")


# Info
From testing, I believe batch inference works. It runs *incredibly* fast on a large set of prompts, compared with the 3-4s per inference I was originally getting.

In [None]:
import gc

# Drop the references first: collecting garbage or emptying the CUDA cache
# cannot release memory that is still reachable through these names.
# pop() instead of `del` so the cell can be re-run, or run after a failed
# generation, without raising NameError.
for _name in ("model", "inputs", "outputs"):
    globals().pop(_name, None)

# Collect the now-unreachable objects, then hand PyTorch's cached GPU blocks
# back to the driver.
gc.collect()
torch.cuda.empty_cache()