In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_gemma(
    model_name: str = "google/gemma-2-9b",
    device="cuda",
    *,
    torch_dtype: torch.dtype = torch.float16,
):
    """
    Load a Gemma causal language model and its tokenizer.

    Both objects are fetched with ``from_pretrained`` using the same
    ``model_name``. The model is switched to evaluation mode before it is
    returned.

    Args:
        model_name: HuggingFace model identifier (or local path) used for
            both the tokenizer and the model.
        device: Forwarded unchanged as ``device_map`` when loading the model.
        torch_dtype: Keyword-only. Precision the weights are loaded in.
            Defaults to ``torch.float16``, the value this function has
            always used, so existing callers are unaffected.
            NOTE(review): Gemma 2 checkpoints appear to be published in
            bfloat16 and fp16 may be numerically less stable; confirm and
            consider passing ``torch_dtype=torch.bfloat16``.

    Returns:
        model: The loaded causal LM, in evaluation mode. No hooks are
            registered here; callers that need activation access must
            attach their own.
        tokenizer: The tokenizer loaded for ``model_name``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch_dtype,
        device_map=device,
    )

    # Inference only: make sure train-time behavior such as dropout is off.
    model.eval()

    return model, tokenizer