In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import notebook_login 

import sys
import os
sys.path.append(os.getcwd() + "/../")
from src.utils.constants import MODELS_DIR, LLAMA_3P3_70B_MODEL_DIR, LLAMA_3P3_70B_MODEL_NAME

# Expose the Hugging Face token through the HF_TOKEN environment variable (no interactive login is performed)
from src.utils.tokens import HF_TOKEN
os.environ["HF_TOKEN"] = HF_TOKEN

In [2]:
def load_model(model_name, cache_dir):
    """
    Load a model and its tokenizer, reusing a previously saved local copy when available.

    On the first call the model and tokenizer are loaded via ``model_name`` and then
    written into ``cache_dir`` with ``save_pretrained``. Later calls load straight from
    ``cache_dir``. A local copy is only trusted when both ``config.json`` and
    ``tokenizer_config.json`` are present, so an empty directory or one left behind by an
    interrupted run triggers a fresh load instead of a failing "cached" load.

    Args:
        model_name (str): The name of the model to load.
        cache_dir (str): The directory used both as the download cache for the model
            and as the target of ``save_pretrained``.

    Returns:
        model: The loaded model.
        tokenizer: The tokenizer for the model, configured for left padding.
    """
    model_config = {
        "cache_dir": cache_dir,
        "device_map": "auto",          # Auto-split across available devices
        "low_cpu_mem_usage": True,     # Avoid materialising a full extra copy in CPU RAM
        "torch_dtype": torch.bfloat16, # Load weights in bfloat16
        # NOTE(review): kept for backward compatibility. This permits running Python code
        # shipped in the model repository; architectures bundled with transformers
        # should not need it -- consider disabling after confirming.
        "trust_remote_code": True,
    }

    # The tokenizer is saved after the model below, so its config file doubles as a
    # "save finished" marker; a bare directory check would accept partial copies.
    saved_markers = ("config.json", "tokenizer_config.json")
    has_local_copy = all(
        os.path.isfile(os.path.join(cache_dir, marker)) for marker in saved_markers
    )

    if has_local_copy:
        print(f"Using cached model in {cache_dir}")
        model = AutoModel.from_pretrained(cache_dir, **model_config)
        tokenizer = AutoTokenizer.from_pretrained(cache_dir)
    else:
        # NOTE(review): downloaded files are cached under cache_dir and save_pretrained
        # then writes a second copy into the same directory -- check disk budget.
        model = AutoModel.from_pretrained(model_name, **model_config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model.save_pretrained(cache_dir)
        tokenizer.save_pretrained(cache_dir)

    # Configure tokenizer for padding; only fall back to EOS when the tokenizer does not
    # already define its own pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    return model, tokenizer


def get_embeddings(model, tokenizer, texts):
    """
    Compute one mean-pooled embedding per input string.

    The texts are tokenized as a padded, truncated batch, run through the model without
    gradient tracking, and the final hidden states are averaged over the sequence
    dimension using the attention mask so that padding positions do not contribute.

    Args:
        texts (list): A list of strings to encode.
        model: The model to use for generating embeddings; its output must expose
            ``last_hidden_state``.
        tokenizer: The tokenizer to use for encoding the text.

    Returns:
        torch.Tensor: One pooled vector per input text.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # Place every input tensor on the device of the model's first parameter
    target_device = next(model.parameters()).device
    batch = {}
    for key, tensor in encoded.items():
        batch[key] = tensor.to(target_device)

    # Inference only: no autograd bookkeeping for the forward pass
    with torch.no_grad():
        hidden_states = model(**batch).last_hidden_state

    # Broadcast the attention mask over the hidden dimension as float32 weights
    weights = batch["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()

    # Masked mean: sum of real-token states divided by the real-token count
    # (clamped so an all-padding row cannot divide by zero)
    token_counts = weights.sum(1).clamp(min=1e-9)
    pooled_sum = (hidden_states * weights).sum(1)
    return pooled_sum / token_counts

In [3]:
# Model identifier and local model directory, both taken from the project constants
model_name, cache_dir = LLAMA_3P3_70B_MODEL_NAME, LLAMA_3P3_70B_MODEL_DIR

In [None]:
# Load the model and tokenizer (fetched and saved locally on first use)
model, tokenizer = load_model(model_name=model_name, cache_dir=cache_dir)

In [None]:
# Sample inputs of different lengths, so the batch needs padding
texts = [
    "Hello, how are you?",
    "Hello, how are you doing?",
    "I'm good. Thanks for asking! I hope you are doing well too.",
]
embeddings = get_embeddings(model=model, tokenizer=tokenizer, texts=texts)
print(embeddings)

In [None]:
# One pooled vector per input text: (number of texts, hidden size)
embeddings.size()
