In [None]:
%pip install -q torch
%pip install -q transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
# Choose the accelerator up front: CUDA when torch reports one, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and model weights are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B")

# Move the model to the selected device; as the cell's last expression this
# also echoes the model repr in the cell output.
model.to(device)

In [None]:
import time
import threading
from queue import Queue

def generate_with_timing(model, tokenizer, use_cache, prompt, device, results_queue):
    """Generate a continuation of ``prompt`` and push timing metrics to a queue.

    ``model.generate`` runs in a worker thread while this function drains a
    ``TextIteratorStreamer``; the measured wall-clock interval spans from just
    before the worker starts until it has been joined.

    Args:
        model: Object exposing ``generate(input_ids=..., max_new_tokens=...,
            use_cache=..., streamer=...)``.
        tokenizer: Tokenizer used to encode ``prompt`` and handed to the
            streamer for decoding.
        use_cache: Forwarded unchanged to ``model.generate`` as ``use_cache``.
        prompt: Text to continue.
        device: Device the encoded prompt is moved to.
        results_queue: Queue that receives exactly one dict with the keys
            ``use_cache``, ``time`` (seconds), ``tokens_generated``,
            ``tokens_per_second`` and ``text`` (every chunk the streamer
            yielded, concatenated).

    Raises:
        Exception: Any exception raised by ``model.generate`` inside the
            worker thread is re-raised here; nothing is queued in that case.
    """
    tokens = tokenizer.encode(prompt, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

    # Filled in by the worker: either {"output": ...} or {"error": ...}.
    outcome = {}

    def _run_generation():
        """Run generate() and make sure the consumer loop is always released."""
        try:
            outcome["output"] = model.generate(
                input_ids=tokens,
                max_new_tokens=100,
                use_cache=use_cache,
                streamer=streamer,
            )
        except Exception as exc:
            outcome["error"] = exc
            # Without the stop signal the `for ... in streamer` loop below
            # would block forever when generation fails part-way.
            streamer.end()

    worker = threading.Thread(target=_run_generation)

    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    worker.start()

    # Drain the streamer; iteration stops once generation signals the end.
    generated_text = [chunk for chunk in streamer]

    worker.join()
    elapsed_time = time.perf_counter() - start_time

    if "error" in outcome:
        raise outcome["error"]

    # Count new tokens from the ids generate() actually returned rather than
    # re-tokenizing the decoded text: decode -> encode is not guaranteed to
    # round-trip, and skipped special tokens would be lost.
    # Assumes a decoder-only model whose output holds prompt + new ids along
    # the last dimension (optionally wrapped in an object with `.sequences`).
    output = outcome["output"]
    sequences = getattr(output, "sequences", output)
    tokens_generated = sequences.shape[-1] - tokens.shape[-1]

    full_text = "".join(generated_text)
    tokens_per_second = tokens_generated / elapsed_time if elapsed_time > 0 else 0

    # Publish the metrics for the caller to collect.
    results_queue.put({
        "use_cache": use_cache,
        "time": elapsed_time,
        "tokens_generated": tokens_generated,
        "tokens_per_second": tokens_per_second,
        "text": full_text
    })

# Execução do benchmark
def run_benchmark(model, tokenizer, prompt, device):
    """Benchmark generation with ``use_cache=True`` versus ``use_cache=False``.

    The two configurations are measured one after the other so they never
    compete for the same model and device; running them concurrently would
    make each timing include the other run's work. A report is printed to
    stdout and nothing is returned.

    Args:
        model: Model forwarded to ``generate_with_timing``.
        tokenizer: Tokenizer forwarded to ``generate_with_timing``.
        prompt: Prompt text used for both runs.
        device: Device forwarded to ``generate_with_timing``.
    """
    results_queue = Queue()

    print("Iniciando benchmark...\n")

    # Sequential on purpose: each run gets the hardware to itself.
    for use_cache in [True, False]:
        generate_with_timing(model, tokenizer, use_cache, prompt, device, results_queue)

    # Collect whatever the runs published.
    results = []
    while not results_queue.empty():
        results.append(results_queue.get())

    # Cached run first (True sorts above False with reverse=True).
    results.sort(key=lambda x: x["use_cache"], reverse=True)

    print("=" * 70)
    print("RESULTADOS")
    print("=" * 70)

    for result in results:
        print(f"\n{'COM CACHE' if result['use_cache'] else 'SEM CACHE'}")
        print(f"  Tempo total: {result['time']:.2f}s")
        print(f"  Tokens gerados: {result['tokens_generated']}")
        print(f"  Velocidade: {result['tokens_per_second']:.2f} tokens/s")
        print(f"  Texto: {result['text'][:100]}...")

    # Speedup of the cache = uncached time / cached time, so a value above 1
    # means the cache made generation faster.
    if len(results) == 2:
        cached, uncached = results
        speedup = uncached['time'] / cached['time'] if cached['time'] > 0 else 0
        print(f"SPEEDUP: {speedup:.2f}x ")

In [None]:
# Short open-ended prompt; both benchmark runs continue this same text.
prompt = "Black Panther was a"
run_benchmark(model=model, tokenizer=tokenizer, prompt=prompt, device=device)