In [None]:
# Install required packages
!pip install transformers torch accelerate -q

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Checkpoint to load: TinyLlama-1.1B-Chat, an instruction-tuned model chosen
# because it is light enough to run on Google Colab
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Fetch the tokenizer that belongs to the checkpoint
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model, choosing precision and quantization to suit the hardware
print("Loading model...")
if device == "cuda":
    # Half precision halves the weight memory on GPU.
    load_kwargs = {"torch_dtype": torch.float16}

    # 8-bit quantization comes from the optional `bitsandbytes` package.
    # The bare `load_in_8bit=` keyword is deprecated in favour of an explicit
    # BitsAndBytesConfig, so build one only when the package is installed and
    # otherwise fall back to plain float16 instead of crashing.
    import importlib.util

    if importlib.util.find_spec("bitsandbytes") is not None:
        from transformers import BitsAndBytesConfig

        load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
    else:
        print("bitsandbytes not installed; loading in float16 without 8-bit quantization.")
else:
    # float16 is slow on CPU (and unsupported for some ops in older PyTorch
    # releases), so keep full precision there.
    load_kwargs = {"torch_dtype": torch.float32}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Let accelerate place the weights on the available device(s)
    **load_kwargs,
)

# Create a text generation pipeline
# NOTE: no `device=` argument is passed. The model was loaded with
# device_map="auto", so accelerate has already placed its weights; asking the
# pipeline to move it again is rejected by transformers, and 8-bit quantized
# models cannot be moved with .to() at all.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Format the prompt according to the model's expected format.
# Build it with the tokenizer's own chat template rather than by hand: the
# template inserts the newlines and end-of-turn tokens between roles that the
# chat model was trained on, and add_generation_prompt=True appends the
# opening of the assistant turn so the model answers instead of continuing
# the user's message.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "How are you?"},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,  # return the formatted string, not token ids
    add_generation_prompt=True,
)

# Generate response
print("\nGenerating response...")
response = generator(
    prompt,
    # Budget for the reply only. `max_length` would also count the prompt's
    # tokens, so a longer prompt would silently shrink (or exhaust) the reply.
    max_new_tokens=200,
    num_return_sequences=1,
    temperature=0.7,
    do_sample=True,  # sample instead of greedy decoding so temperature takes effect
    pad_token_id=tokenizer.eos_token_id,  # reuse the EOS token id for padding
)

# Pull the generated string out of the first returned sequence
generated_text = response[0]['generated_text']
# Keep only what follows the final "<|assistant|>" marker; when the marker is
# absent, rpartition leaves the whole text in its last element
assistant_response = generated_text.rpartition("<|assistant|>")[2].strip()
print("\nResponse:", assistant_response)

# Drop the references to the model and the pipeline, then ask PyTorch to
# release its cached GPU memory
del model, generator
torch.cuda.empty_cache()