In [3]:
import time
from transformers import pipeline
import torch

In [6]:
# Check if CUDA is available (it won't be on our case, Raspberry Pi)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


## TinyLlama-1.1B

In [7]:
 model='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T'

In [8]:
# Load the model and measure loading time
start_time = time.time()
generator = pipeline('text-generation', 
                    model=model,
                    device=device)
load_time = time.time() - start_time
print(f"Model loading time: {load_time:.2f} seconds")

Device set to use cpu


Model loading time: 1.28 seconds


In [5]:
# Test prompt
test_prompt = "The weather today is"

# Measure inference time
start_time = time.time()
response = generator(test_prompt, 
                    max_length=50,
                    num_return_sequences=1,
                    temperature=0.7)
inference_time = time.time() - start_time

print(f"\nTest prompt: {test_prompt}")
print(f"Generated response: {response[0]['generated_text']}")
print(f"Inference time: {inference_time:.2f} seconds")

# Memory usage
if device == "cuda":
    print(f"\nGPU Memory allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
    print(f"GPU Memory cached: {torch.cuda.memory_reserved()/1024**2:.2f} MB")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Test prompt: The weather today is
Generated response: The weather today is going to be sunny and warm with a high of 80 degrees.
The weather today is going to be sunny and warm with a high of 80 degrees.
The weather today is going to be
Inference time: 199.41 seconds
