In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Ensure we are using GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Print GPU info if available
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))
    print("Memory Usage Before Loading Model:")
    print("Allocated:", round(torch.cuda.memory_allocated(0) / 1024**3, 1), "GB")
    print("Cached:   ", round(torch.cuda.memory_reserved(0) / 1024**3, 1), "GB")

# Clear cache before loading the model to prevent memory fragmentation
torch.cuda.empty_cache()

# Model and tokenizer setup
# This is a very small model just to demonstrate the code. For reference, a model I would recommend:
# https://huggingface.co/neuralmagic/Meta-Llama-3.1-70B-Instruct-quantized.w4a16
# However, it requires a huggingface account and getting approved to access to the model.

# For larger scale operations (e.g. annotating 1000s of texts), look into vllm to make the process more optimized.
model_id = "PY007/TinyLlama-1.1B-step-50K-105b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model explicitly onto GPU
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16
)

# Text generation pipeline with explicit device assignment
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device=device,  # Ensure correct device is used
)


Using device: cuda
Tesla V100-PCIE-32GB
Memory Usage Before Loading Model:
Allocated: 0.0 GB
Cached:    0.0 GB


In [2]:
# Input prompt
prompt = "How do I print banana in Python?"

# Generate text
sequences = text_generator(
    prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    repetition_penalty=1.5,
    eos_token_id=tokenizer.eos_token_id,
    max_length=500,
)

# Print generated text
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Result: How do I print banana in Python? <issue_label="Open source" />
<p align='right'><a href='https://github.com/bartlomiejobson07'>Github</a></p> ---template: post-listingtitle:"Hacktoberfest"---   # About the Post    15 Nov 20, Tue.
#     * Author : <NAME>	[GitHub](https://www.linkedin.com/) | [Link](mailto:`+{name}@gmail.<EMAIL>`)                             -------------------------      ## What is HackerFest?        **What we will have this year**         ![happy](/static/media/imgs/_3x49d86eefacccadfcecbfdecaeaecfafeebabdfgahjheemqrkrlghhnmrzhaimnz).            If you are new to hackers or would like a refresher from one of our amazingly diverse community then check out what they offer at https://techcommunity.microsoft.com/.                The following links were provided by members and volunteer organisations involved with Hacker Fests around North America (some may require registration before viewable):           ### Link for US National Membership         We w