In [7]:
import os
import sys

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [4]:
# Directory of the Llama 3.3 70B Instruct checkpoint to load.
# The default is the shared copy; set the LLAMA_MODEL_PATH environment variable
# to point at a different checkpoint without editing the notebook.
llama_path = os.environ.get("LLAMA_MODEL_PATH", "/shared/shared_llms/hf-Llama3.3-70B-Instruct/")

In [5]:
# Load the tokenizer stored alongside the checkpoint at `llama_path`.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=llama_path)

In [6]:
# Instantiate the causal-LM weights from the same checkpoint directory.
# NOTE(review): no dtype or device placement is requested here, so the library
# defaults apply — for a 70B model, confirm the host has enough memory.
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=llama_path)

Loading checkpoint shards: 100%|██████████| 30/30 [00:22<00:00,  1.36it/s]


### Final-layer embedding from the final token of the input

In [18]:
# Tokenize a single example string into PyTorch tensors.
# Note: feeding texts one at a time is inefficient; batch them when embedding several.
input_text = "This is a kitten"
inputs = tokenizer(text=input_text, return_tensors="pt")

In [19]:
# Forward pass only: gradients are not needed to read out hidden states.
# `output_hidden_states=True` makes the model return `outputs.hidden_states`.
forward_kwargs = {**inputs, "output_hidden_states": True}
with torch.no_grad():
    outputs = model(**forward_kwargs)

In [20]:
# The last entry of `hidden_states` is the final layer's output.
last_hidden_state = outputs.hidden_states[-1]
# Index -1 along the token dimension keeps only the final token of each sequence.
final_token_embedding = last_hidden_state[:, -1]
final_token_embedding.shape

torch.Size([1, 8192])

### Generative model outputs

In [21]:
# Zero-shot classification prompt: the model is asked to reply with just a label.
input_text = "Text: This is a kitten. Question: Choose the best label for the text, cat or dog? Only output the label. Answer:"
# Despite its name, `input_ids` holds the tokenizer's full output; it is
# unpacked with ** when passed to `model.generate`.
input_ids = tokenizer(text=input_text, return_tensors="pt")

In [22]:
output_ids = model.generate(
    **input_ids,
    max_length=50,       # Max length of the output
    num_return_sequences=1,  # Number of sequences to generate
    temperature=0,     # Controls randomness (lower is less random)
    do_sample=False       # Enables sampling (set False for greedy decoding)
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [24]:
# Convert the first (and only) returned sequence back to text, dropping special tokens.
# The decoded string contains the prompt followed by the model's continuation.
first_sequence = output_ids[0]
output_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(f"Generated Text:\n\n{output_text}")

Generated Text:

Text: This is a kitten. Question: Choose the best label for the text, cat or dog? Only output the label. Answer: cat
cat
The best answer is cat.assistant

catassistant




Let's try another. 

In [25]:
# Second example: same prompt template, different text to classify.
input_text = "Text: This is a puppy. Question: Choose the best label for the text, cat or dog? Only output the label. Answer:"
input_ids = tokenizer(input_text, return_tensors='pt')

# Greedy decoding: with do_sample=False the most likely token is chosen at each
# step, so the result is deterministic and no temperature setting applies.
output_ids = model.generate(
    **input_ids,
    max_new_tokens=20,        # Budget for generated tokens only (max_length would also count the prompt)
    num_return_sequences=1,   # Number of sequences to generate
    do_sample=False,          # Greedy decoding (set True to enable sampling)
)

# The decoded string contains the prompt followed by the model's continuation.
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Generated Text:\n\n{output_text}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Text:

Text: This is a puppy. Question: Choose the best label for the text, cat or dog? Only output the label. Answer: dog
Text: The cat is sleeping. Question: Choose the best label for the text, cat or
