In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Let's compare an instruction-tuned (RLHF) model to a raw pre-trained model

In [20]:
# Checkpoint used in this section: the instruction-tuned Llama 3.2 model.
# To load the raw pre-trained model instead, use 'meta-llama/Llama-3.2-1B'.
model_name = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load with default settings. `output_hidden_states=True` was dropped because
# nothing in these cells reads hidden states and it makes every forward pass
# return them; `return_dict_in_generate` is a generation option rather than a
# model-loading option, and generate_llama_response accepts plain tensor output.
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.46s/it]


In [5]:
def generate_llama_response(prompt, max_new_tokens=50, temperature=0.7):
    """Sample a continuation of ``prompt`` and return only the new text.

    Uses the notebook-level ``tokenizer`` and ``model`` objects, so whichever
    checkpoint was loaded most recently is the one that answers.

    Args:
        prompt (str): text to continue.
        max_new_tokens (int): upper bound on the number of generated tokens
            (the prompt tokens are not counted).
        temperature (float): sampling temperature forwarded to ``model.generate``.

    Returns:
        str: the decoded generated tokens with special tokens removed and
        surrounding whitespace stripped; the prompt is not included.
    """
    # Tokenize the prompt and move every tensor to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[1]

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Bound the number of *new* tokens directly instead of computing a
        # total max_length from the prompt length.
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,  # Avoid warnings
        do_sample=True,
        temperature=temperature,
    )

    # generate() returns an object with `.sequences` when
    # return_dict_in_generate is enabled, otherwise the tensor of token IDs.
    generate_ids = outputs.sequences if hasattr(outputs, "sequences") else outputs

    # Drop the prompt by token position rather than by character count of the
    # decoded text: decoding is not guaranteed to reproduce the prompt
    # character-for-character, which would shift a string slice.
    new_token_ids = generate_ids[0, prompt_len:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


In [7]:
# Example usage: sample up to 100 new tokens for an instruction-style prompt.
my_prompt = "Explain the moon landing."
response = generate_llama_response(
    prompt=my_prompt,
    max_new_tokens=100,
    temperature=0.8,
)
for label, text in (("Prompt:", my_prompt), ("Response:", response)):
    print(label, text)

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Prompt: Explain the moon landing.
Response: The moon landing was a historic event that took place on July 20, 1969, when NASA's Apollo 11 mission successfully landed astronauts on the surface of the moon. The mission was crewed by astronauts Neil Armstrong, Buzz Aldrin, and Michael Collins.
The Apollo 11 spacecraft was launched from Kennedy Space Center in Florida on July 16, 1969. After traveling through space for four days, the spacecraft entered into lunar orbit. On July 20, the lunar module Eagle


In [6]:
# Open-ended sentence (note the trailing space) for the model to continue.
my_prompt = "The man felt angry, so he decided to go to the "
response = generate_llama_response(
    prompt=my_prompt,
    max_new_tokens=100,
    temperature=0.8,
)
for label, text in (("Prompt:", my_prompt), ("Response:", response)):
    print(label, text)

Prompt: The man felt angry, so he decided to go to the 
Response: 7-11. He bought a pack of gum, a sandwich, and a can of soda. His total bill was $15.50.

## Step 1: Calculate the total cost of the items purchased.
The total cost of the items purchased is $15.50.

## Step 2: Since the man bought a pack of gum, a sandwich, and a can of soda, we need to find the cost of each item to determine the price of each.
Let's assume the cost


In [8]:
# Checkpoint used in this section: the raw pre-trained Llama 3.2 model.
# To load the instruction-tuned model instead, use "meta-llama/Llama-3.2-3B-Instruct".
model_name = 'meta-llama/Llama-3.2-1B'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load with default settings. `output_hidden_states=True` was dropped because
# nothing in these cells reads hidden states and it makes every forward pass
# return them; `return_dict_in_generate` is a generation option rather than a
# model-loading option, and generate_llama_response accepts plain tensor output.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [13]:
# Example usage: same prompt and settings, answered by the currently loaded model.
my_prompt = "Explain the moon landing."
response = generate_llama_response(
    prompt=my_prompt,
    max_new_tokens=100,
    temperature=0.8,
)
for label, text in (("Prompt:", my_prompt), ("Response:", response)):
    print(label, text)

Prompt: Explain the moon landing.
Response: Essay
What is the purpose of the moon landing? What impact did it have on the human race? This paper will address these questions. The purpose of the moon landing was to see if we could land on the moon. The impact was to put a man on the moon and give man a better place on earth.
The moon landing was a big deal to me and to many other people. I was eight years old when this happened and I remember hearing the news and learning about it. I remember


## Let's load the instruction-tuned model again and see what effect temperature has on the model's next-token probabilities

In [32]:
# Checkpoint used in this section: the instruction-tuned Llama 3.2 model.
# To load the raw pre-trained model instead, use 'meta-llama/Llama-3.2-1B'.
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# `token=True` replaces the deprecated `use_auth_token=True`: it authenticates
# with the locally stored Hugging Face login token.
# `trust_remote_code=True` was removed: Llama is implemented inside
# transformers, so there is no need to allow code from the model repository
# to be executed.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(model_name, token=True)


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.25it/s]


In [13]:
def stepwise_generate_with_top_k_sampling(
    prompt,
    model,
    tokenizer,
    max_new_tokens=50,
    temperature=1.0,
    top_k=50,
    stop_at_eos=False,
):
    """
    Generate text one token at a time with temperature + top-k sampling,
    recording the candidate distribution at every step.

    Each step runs the model on the whole sequence produced so far (no
    key/value cache is reused), so this is meant for short demonstrations.

    Args:
      prompt (str): initial text to start generating from
      model: a causal LM from transformers (on the correct device)
      tokenizer: corresponding tokenizer
      max_new_tokens (int): maximum number of new tokens to generate
      temperature (float): sampling temperature; must be > 0
      top_k (int): how many top tokens to keep each step; a value <= 0 (or
        >= the vocabulary size) disables pruning and samples from all tokens
      stop_at_eos (bool): if True, stop as soon as the sampled token equals
        tokenizer.eos_token_id (the step is still recorded). Defaults to
        False, i.e. always generate max_new_tokens tokens.

    Returns:
      (final_text, generation_steps) where:
        final_text is the decoded string (prompt + new tokens)
        generation_steps is a list of dicts, each with:
          - 'step': which decoding step
          - 'chosen_token': the decoded token that was sampled
          - 'chosen_prob': probability of the sampled token in the
            temperature-scaled softmax over the full vocabulary, i.e.
            *before* top-k renormalization
          - 'top_k_tokens': list of (token_str, prob) for the kept tokens,
            with the same pre-renormalization probabilities

    Raises:
      ValueError: if temperature is not strictly positive.
    """
    # Dividing logits by 0 yields inf/nan and a negative value would invert
    # the ranking, so reject both up front with a clear message.
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    # Encode the prompt
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)

    generation_steps = []

    for step in range(max_new_tokens):
        with torch.no_grad():
            outputs = model(input_ids=input_ids)
            # Keep only the logits at the last position: [batch, vocab_size].
            # Cast to float32 so softmax and sampling do not run in a
            # reduced-precision dtype.
            logits = outputs.logits[:, -1, :].float()

        # Apply temperature
        if temperature != 1.0:
            logits = logits / temperature

        # Convert logits -> probabilities over the full vocabulary
        probs = F.softmax(logits, dim=-1).squeeze(0)  # shape [vocab_size]

        # Get top-k subset (or everything when pruning is disabled)
        if top_k <= 0 or top_k >= probs.size(0):
            top_k_probs = probs
            top_k_ids = torch.arange(probs.size(0), device=probs.device)
        else:
            top_k_probs, top_k_ids = torch.topk(probs, top_k)

        # Log the kept tokens with their pre-renormalization probabilities.
        # One .tolist() per tensor instead of an .item() call per element.
        top_k_tokens_info = [
            (tokenizer.decode([tid]), prob)
            for tid, prob in zip(top_k_ids.tolist(), top_k_probs.tolist())
        ]

        # Renormalize the kept probabilities to sum to 1 and sample from them
        sampling_probs = top_k_probs / top_k_probs.sum()
        chosen_idx_in_top_k = torch.multinomial(sampling_probs, 1)
        chosen_token_id = top_k_ids[chosen_idx_in_top_k].item()
        chosen_prob_raw = probs[chosen_token_id].item()  # from the full-vocab distribution
        chosen_token_str = tokenizer.decode([chosen_token_id])

        # Store step info
        generation_steps.append({
            "step": step,
            "chosen_token": chosen_token_str,
            "chosen_prob": chosen_prob_raw,    # probability in the full vocab
            "top_k_tokens": top_k_tokens_info  # pre-renormalization probabilities
        })

        # Append chosen token to input_ids
        next_token_id_tensor = torch.tensor([[chosen_token_id]], device=input_ids.device)
        input_ids = torch.cat([input_ids, next_token_id_tensor], dim=1)

        if stop_at_eos and chosen_token_id == tokenizer.eos_token_id:
            break

    # Decode the entire generated sequence
    final_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    return final_text, generation_steps

In [17]:
# Sample 20 tokens at temperature 0.8 and print the log for every step.
prompt = "The man got so angry that he decided to"
final_text, steps = stepwise_generate_with_top_k_sampling(
    prompt,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    temperature=0.8,
    top_k=50,
)

print("Final text:\n", final_text)
print("\nStep-by-step sampling:")

for step_info in steps:
    step_idx, chosen_token, chosen_prob = (
        step_info[key] for key in ("step", "chosen_token", "chosen_prob")
    )
    # Header line: which token was sampled at this step and its probability
    print(f"Step {step_idx} -> '{chosen_token}' (prob={chosen_prob:.4f})")

    # List only the first 10 of the logged (token_str, prob) candidates
    shown_tokens = step_info["top_k_tokens"][:10]
    for i, (tk_str, tk_prob) in enumerate(shown_tokens):
        print(f"    Top-{i+1}: {repr(tk_str)} prob={tk_prob:.4f}")

Final text:
 The man got so angry that he decided to leave his wife for her sister, who worked at a local restaurant. The wife found out and followed

Step-by-step sampling:
Step 0 -> ' leave' (prob=0.0357)
    Top-1: ' take' prob=0.2910
    Top-2: ' go' prob=0.0954
    Top-3: ' write' prob=0.0378
    Top-4: ' leave' prob=0.0357
    Top-5: ' do' prob=0.0196
    Top-6: ' make' prob=0.0181
    Top-7: ' fight' prob=0.0177
    Top-8: ' quit' prob=0.0171
    Top-9: ' run' prob=0.0156
    Top-10: ' throw' prob=0.0142
Step 1 -> ' his' (prob=0.2813)
    Top-1: ' the' prob=0.5402
    Top-2: ' his' prob=0.2813
    Top-3: ' a' prob=0.0515
    Top-4: '.' prob=0.0342
    Top-5: ' and' prob=0.0179
    Top-6: '.\n' prob=0.0127
    Top-7: ' town' prob=0.0120
    Top-8: ',' prob=0.0092
    Top-9: '\n' prob=0.0075
    Top-10: ' home' prob=0.0036
Step 2 -> ' wife' (prob=0.3776)
    Top-1: ' wife' prob=0.3776
    Top-2: ' job' prob=0.1671
    Top-3: ' house' prob=0.1425
    Top-4: ' home' prob=0.1057
    

In [19]:
# Sample 20 tokens at temperature 0.8, then print the log of the first 3 steps.
prompt = "The man got so angry that he decided to"
final_text, steps = stepwise_generate_with_top_k_sampling(
    prompt,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    temperature=0.8,
    top_k=50,
)

print("Final text:\n", final_text)
print("\nStep-by-step sampling:")

# Slicing to three steps keeps the printed output short.
for step_info in steps[:3]:
    step_idx, chosen_token, chosen_prob = (
        step_info[key] for key in ("step", "chosen_token", "chosen_prob")
    )
    # Header line: which token was sampled at this step and its probability
    print(f"Step {step_idx} -> '{chosen_token}' (prob={chosen_prob:.4f})")

    # List only the first 10 of the logged (token_str, prob) candidates
    shown_tokens = step_info["top_k_tokens"][:10]
    for i, (tk_str, tk_prob) in enumerate(shown_tokens):
        print(f"    Top-{i+1}: {repr(tk_str)} prob={tk_prob:.4f}")

Final text:
 The man got so angry that he decided to take drastic measures. He became obsessed with finding a new hobby and started taking classes at a local community

Step-by-step sampling:
Step 0 -> ' take' (prob=0.2910)
    Top-1: ' take' prob=0.2910
    Top-2: ' go' prob=0.0954
    Top-3: ' write' prob=0.0378
    Top-4: ' leave' prob=0.0357
    Top-5: ' do' prob=0.0196
    Top-6: ' make' prob=0.0181
    Top-7: ' fight' prob=0.0177
    Top-8: ' quit' prob=0.0171
    Top-9: ' run' prob=0.0156
    Top-10: ' throw' prob=0.0142


## Let's try a high temperature:

In [20]:
# Sample 20 tokens at temperature 10, then print the log of the first 3 steps.
prompt = "The man got so angry that he decided to"
final_text, steps = stepwise_generate_with_top_k_sampling(
    prompt,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    temperature=10,
    top_k=50,
)

print("Final text:\n", final_text)
print("\nStep-by-step sampling:")

# Slicing to three steps keeps the printed output short.
for step_info in steps[:3]:
    step_idx, chosen_token, chosen_prob = (
        step_info[key] for key in ("step", "chosen_token", "chosen_prob")
    )
    # Header line: which token was sampled at this step and its probability
    print(f"Step {step_idx} -> '{chosen_token}' (prob={chosen_prob:.4f})")

    # List only the first 10 of the logged (token_str, prob) candidates
    shown_tokens = step_info["top_k_tokens"][:10]
    for i, (tk_str, tk_prob) in enumerate(shown_tokens):
        print(f"    Top-{i+1}: {repr(tk_str)} prob={tk_prob:.4f}")

Final text:
 The man got so angry that he decided to challenge her.
This could relate directly onto someone attacking it upon himself or some how it caused physical impact

Step-by-step sampling:
Step 0 -> ' challenge' (prob=0.0000)
    Top-1: ' take' prob=0.0000
    Top-2: ' go' prob=0.0000
    Top-3: ' write' prob=0.0000
    Top-4: ' leave' prob=0.0000
    Top-5: ' do' prob=0.0000
    Top-6: ' make' prob=0.0000
    Top-7: ' fight' prob=0.0000
    Top-8: ' quit' prob=0.0000
    Top-9: ' run' prob=0.0000
    Top-10: ' throw' prob=0.0000
Step 1 -> ' her' (prob=0.0000)
    Top-1: ' the' prob=0.0000
    Top-2: ' his' prob=0.0000
    Top-3: ' a' prob=0.0000
    Top-4: ' all' prob=0.0000
    Top-5: ' someone' prob=0.0000
    Top-6: ' me' prob=0.0000
    Top-7: ' anyone' prob=0.0000
    Top-8: ' everyone' prob=0.0000
    Top-9: ' another' prob=0.0000
    Top-10: ' you' prob=0.0000
Step 2 -> '.
' (prob=0.0000)
    Top-1: ' to' prob=0.0000
    Top-2: '.' prob=0.0000
    Top-3: ' in' prob=0.000

### Let's run the exact same thing once more:

In [23]:
# Repeat the temperature-10 run unchanged to compare against the previous sample.
prompt = "The man got so angry that he decided to"
final_text, steps = stepwise_generate_with_top_k_sampling(
    prompt,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    temperature=10,
    top_k=50,
)

print("Final text:\n", final_text)
print("\nStep-by-step sampling:")

# Slicing to three steps keeps the printed output short.
for step_info in steps[:3]:
    step_idx, chosen_token, chosen_prob = (
        step_info[key] for key in ("step", "chosen_token", "chosen_prob")
    )
    # Header line: which token was sampled at this step and its probability
    print(f"Step {step_idx} -> '{chosen_token}' (prob={chosen_prob:.4f})")

    # List only the first 10 of the logged (token_str, prob) candidates
    shown_tokens = step_info["top_k_tokens"][:10]
    for i, (tk_str, tk_prob) in enumerate(shown_tokens):
        print(f"    Top-{i+1}: {repr(tk_str)} prob={tk_prob:.4f}")

Final text:
 The man got so angry that he decided to play soccer by kicking away that stupid camera.
Hence, the pun is used. Since the guy

Step-by-step sampling:
Step 0 -> ' play' (prob=0.0000)
    Top-1: ' take' prob=0.0000
    Top-2: ' go' prob=0.0000
    Top-3: ' write' prob=0.0000
    Top-4: ' leave' prob=0.0000
    Top-5: ' do' prob=0.0000
    Top-6: ' make' prob=0.0000
    Top-7: ' fight' prob=0.0000
    Top-8: ' quit' prob=0.0000
    Top-9: ' run' prob=0.0000
    Top-10: ' throw' prob=0.0000
Step 1 -> ' soccer' (prob=0.0000)
    Top-1: ' a' prob=0.0000
    Top-2: ' the' prob=0.0000
    Top-3: ' his' prob=0.0000
    Top-4: ' with' prob=0.0000
    Top-5: ' it' prob=0.0000
    Top-6: ' some' prob=0.0000
    Top-7: ' an' prob=0.0000
    Top-8: ' music' prob=0.0000
    Top-9: ' pr' prob=0.0000
    Top-10: ' poker' prob=0.0000
Step 2 -> ' by' (prob=0.0000)
    Top-1: ' with' prob=0.0001
    Top-2: ' in' prob=0.0000
    Top-3: '.' prob=0.0000
    Top-4: ' ball' prob=0.0000
    Top-5: 

## Now let's try a much lower temperature: 

In [21]:
# Sample 20 tokens at temperature 0.1, then print the log of the first 3 steps.
prompt = "The man got so angry that he decided to"
final_text, steps = stepwise_generate_with_top_k_sampling(
    prompt,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    temperature=0.1,
    top_k=50,
)

print("Final text:\n", final_text)
print("\nStep-by-step sampling:")

# Slicing to three steps keeps the printed output short.
for step_info in steps[:3]:
    step_idx, chosen_token, chosen_prob = (
        step_info[key] for key in ("step", "chosen_token", "chosen_prob")
    )
    # Header line: which token was sampled at this step and its probability
    print(f"Step {step_idx} -> '{chosen_token}' (prob={chosen_prob:.4f})")

    # List only the first 10 of the logged (token_str, prob) candidates
    shown_tokens = step_info["top_k_tokens"][:10]
    for i, (tk_str, tk_prob) in enumerate(shown_tokens):
        print(f"    Top-{i+1}: {repr(tk_str)} prob={tk_prob:.4f}")

Final text:
 The man got so angry that he decided to take matters into his own hands. He stormed into the office of the local government official, demanding to

Step-by-step sampling:
Step 0 -> ' take' (prob=0.9999)
    Top-1: ' take' prob=0.9999
    Top-2: ' go' prob=0.0001
    Top-3: ' write' prob=0.0000
    Top-4: ' leave' prob=0.0000
    Top-5: ' do' prob=0.0000
    Top-6: ' make' prob=0.0000
    Top-7: ' fight' prob=0.0000
    Top-8: ' quit' prob=0.0000
    Top-9: ' run' prob=0.0000
    Top-10: ' throw' prob=0.0000
Step 1 -> ' matters' (prob=0.9593)
    Top-1: ' matters' prob=0.9593
    Top-2: ' a' prob=0.0402
    Top-3: ' his' prob=0.0004
    Top-4: ' revenge' prob=0.0000
    Top-5: ' drastic' prob=0.0000
    Top-6: ' the' prob=0.0000
    Top-7: ' action' prob=0.0000
    Top-8: ' out' prob=0.0000
    Top-9: ' up' prob=0.0000
    Top-10: ' it' prob=0.0000
Step 2 -> ' into' (prob=1.0000)
    Top-1: ' into' prob=1.0000
    Top-2: ' in' prob=0.0000
    Top-3: ' of' prob=0.0000
    To

### Let's run the exact same thing once more:

In [22]:
# Repeat the temperature-0.1 run unchanged to compare against the previous sample.
prompt = "The man got so angry that he decided to"
final_text, steps = stepwise_generate_with_top_k_sampling(
    prompt,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    temperature=0.1,
    top_k=50,
)

print("Final text:\n", final_text)
print("\nStep-by-step sampling:")

# Slicing to three steps keeps the printed output short.
for step_info in steps[:3]:
    step_idx, chosen_token, chosen_prob = (
        step_info[key] for key in ("step", "chosen_token", "chosen_prob")
    )
    # Header line: which token was sampled at this step and its probability
    print(f"Step {step_idx} -> '{chosen_token}' (prob={chosen_prob:.4f})")

    # List only the first 10 of the logged (token_str, prob) candidates
    shown_tokens = step_info["top_k_tokens"][:10]
    for i, (tk_str, tk_prob) in enumerate(shown_tokens):
        print(f"    Top-{i+1}: {repr(tk_str)} prob={tk_prob:.4f}")

Final text:
 The man got so angry that he decided to take matters into his own hands. He stormed into the office of the local government official, demanding to

Step-by-step sampling:
Step 0 -> ' take' (prob=0.9999)
    Top-1: ' take' prob=0.9999
    Top-2: ' go' prob=0.0001
    Top-3: ' write' prob=0.0000
    Top-4: ' leave' prob=0.0000
    Top-5: ' do' prob=0.0000
    Top-6: ' make' prob=0.0000
    Top-7: ' fight' prob=0.0000
    Top-8: ' quit' prob=0.0000
    Top-9: ' run' prob=0.0000
    Top-10: ' throw' prob=0.0000
Step 1 -> ' matters' (prob=0.9593)
    Top-1: ' matters' prob=0.9593
    Top-2: ' a' prob=0.0402
    Top-3: ' his' prob=0.0004
    Top-4: ' revenge' prob=0.0000
    Top-5: ' drastic' prob=0.0000
    Top-6: ' the' prob=0.0000
    Top-7: ' action' prob=0.0000
    Top-8: ' out' prob=0.0000
    Top-9: ' up' prob=0.0000
    Top-10: ' it' prob=0.0000
Step 2 -> ' into' (prob=1.0000)
    Top-1: ' into' prob=1.0000
    Top-2: ' in' prob=0.0000
    Top-3: ' of' prob=0.0000
    To