In [22]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
import torch.nn.functional as F
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import os

### Set up

In [2]:
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
# The result is kept as a plain string ("cuda" / "cpu").
backend_name = "cuda" if torch.cuda.is_available() else "cpu"
device = str(torch.device(backend_name))
print(device)

cuda


In [3]:
from huggingface_hub import HfFolder, whoami

# Read the Hugging Face access token from the environment; load_dotenv() first
# merges variables from a local .env file so the token never lives in the notebook.
load_dotenv()
hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    # Fail early with an actionable message instead of an opaque error from
    # save_token(None) further down.
    raise RuntimeError("HF_TOKEN is not set. Define it in the environment or in a .env file.")

# Persist the token for later Hub calls, then confirm it is valid by asking
# the Hub who we are.
HfFolder.save_token(hf_token)
user = whoami()
# Single quotes inside the f-string keep this line valid on Python < 3.12 as well.
print(f"logged in as {user['name']}")

logged in as M00nl8tshad0w


In [4]:
def load_model(model_name, local_dir="./models/llama3_70b"):
    """Load a causal language model and its tokenizer, preferring a local copy.

    If ``local_dir`` contains a saved checkpoint (detected through its
    ``config.json``), both objects are loaded from there. Otherwise they are
    loaded from the Hugging Face Hub under ``model_name`` and then saved into
    ``local_dir`` so the next call can use the local copy.

    Args:
        model_name: Hub identifier used when no local checkpoint is available.
        local_dir: Directory that holds (or will receive) the saved checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # A directory that merely exists is not proof of a usable checkpoint: an
    # interrupted earlier download can leave it empty. Require config.json,
    # which save_pretrained() writes, before trusting the local copy.
    has_local_copy = os.path.isfile(os.path.join(local_dir, "config.json"))

    if has_local_copy:
        print(f"Loading model from local directory: {local_dir}")
        source = local_dir
    else:
        print(f"No saved model found in {local_dir}. Downloading model '{model_name}' from Hugging Face Hub...")
        source = model_name

    # Same loading options for both sources; only the location differs.
    tokenizer = AutoTokenizer.from_pretrained(source)
    model = AutoModelForCausalLM.from_pretrained(source, device_map="auto", torch_dtype="auto")

    if not has_local_copy:
        os.makedirs(local_dir, exist_ok=True)
        tokenizer.save_pretrained(local_dir)
        model.save_pretrained(local_dir)
        print(f"Model downloaded and saved locally to: {local_dir}")

    return tokenizer, model

# Hub identifier and on-disk location of the checkpoint used throughout this notebook.
MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
MODEL_DIR = "/home/max/Studium/Leipzig/Semster6/Math_and_ML/hf_models/llama3_70b/"

tokenizer, model = load_model(model_name=MODEL_NAME, local_dir=MODEL_DIR)

Loading model from local directory: /home/max/Studium/Leipzig/Semster6/Math_and_ML/hf_models/llama3_70b/


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [5]:
def load_hf_dataset(dataset_name, subset="default", local_dir="~/hf_datasets/OpenR1_Math_220k/"):
    """Load a dataset through ``load_dataset`` using ``local_dir`` as its cache.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        subset: Configuration name passed as the second positional argument.
        local_dir: Cache directory; a leading ``~`` is expanded and the
            directory is created if it does not exist yet.

    Returns:
        Whatever ``load_dataset`` returns for the given arguments.
    """
    cache_root = os.path.expanduser(local_dir)
    os.makedirs(cache_root, exist_ok=True)
    dataset = load_dataset(dataset_name, subset, cache_dir=cache_root)
    return dataset

# Dataset identifier and the local cache directory it is stored in.
DATASET_NAME = "open-r1/OpenR1-Math-220k"
DATASET_DIR = "/home/max/Studium/Leipzig/Semster6/Math_and_ML/hf_datasets/open-r1/OpenR1-Math-220k"

math_dataset = load_hf_dataset(dataset_name=DATASET_NAME, local_dir=DATASET_DIR)

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

### Generation

In [6]:
# Reuse the end-of-sequence id as the padding id, then tokenize a one-prompt
# batch into PyTorch tensors and move it to the selected device.
tokenizer.pad_token_id = tokenizer.eos_token_id
prompt_batch = ["Today is"]
inputs = tokenizer(prompt_batch, return_tensors="pt").to(device)

In [7]:
# Example 1: generate two new tokens and keep the per-step scores.
# NOTE(review): no decoding strategy is passed, so generate() follows the model's
# own generation config. The outputs recorded in this notebook (almost all scores
# are -inf, and the second chosen token ' first' at 16.99% is not the most likely
# candidate) point to sampling over a filtered candidate set rather than greedy
# search — confirm against model.generation_config before interpreting the scores
# as raw model logits.
outputs = model.generate(
    **inputs,
    max_new_tokens=2,
    return_dict_in_generate=True,
    output_scores=True,
    # Pass the padding id explicitly so generate() does not have to fall back to
    # the EOS id and warn about it on every call.
    pad_token_id=tokenizer.pad_token_id,
)
print(f"Generated: {outputs}")
# normalize_logits=True applies a log-softmax per step, so the returned values
# are log-probabilities of the tokens that were actually selected.
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, normalize_logits=True
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated: GenerateDecoderOnlyOutput(sequences=tensor([[128000,  15724,    374,    279,   1176]], device='cuda:0'), scores=(tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0')), logits=None, attentions=None, hidden_states=None, past_key_values=<transformers.cache_utils.DynamicCache object at 0x731944cf75f0>)


In [14]:
# Number of leading positions to drop from the generated sequences:
# one for encoder-decoder models, otherwise the length of the prompt.
if model.config.is_encoder_decoder:
    input_length = 1
else:
    input_length = inputs.input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]

print("| token | token string | log probability | probability")
for token_id, step_score in zip(generated_tokens[0], transition_scores[0]):
    # Convert once; the same value feeds both the log-probability and probability columns.
    log_prob = step_score.detach().cpu().numpy()
    print(f"| {token_id:5d} | {tokenizer.decode(token_id):8s} | {log_prob:.3f} | {np.exp(log_prob):.2%}")

| token | token string | log probability | probability
|   279 |  the     | -0.428 | 65.20%
|  1176 |  first   | -1.773 | 16.99%


### Logits

In [9]:
# One score tensor per generated step, as returned by generate(output_scores=True).
logits = outputs.scores
print(f"{len(logits)=}")
print(f"{logits[0].shape=}")
print(f"{logits[0]=}")

# Softmax over the vocabulary axis turns each step's scores into probabilities.
probabilities = [F.softmax(step_scores, dim=-1) for step_scores in logits]
print(f"{probabilities[0].shape=}")
print(f"{probabilities[0]=}")

len(logits)=2
logits[0].shape=torch.Size([1, 128256])
logits[0]=tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0')
probabilities[0].shape=torch.Size([1, 128256])
probabilities[0]=tensor([[0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


In [12]:
def get_token_distributions(probabilites: list) -> list:
    """Collect the tokens with non-zero probability for every generated step.

    Each entry of ``probabilites`` is squeezed along its first dimension and
    then indexed with a single token id, so it is expected to hold one
    sequence (shape ``(1, vocab_size)``). Tokens are decoded with the
    notebook-level ``tokenizer``, and every kept token is printed together
    with its probability.

    Args:
        probabilites: Iterable of probability tensors, one per generated step.

    Returns:
        One dict per step mapping the decoded token string to its probability
        (a 0-dim tensor). Token ids that decode to the same string share a
        key, so the later one overwrites the earlier one.
    """
    distributions = []
    for step_probs in probabilites:
        token_probabilities = {}
        token_tensor = step_probs.squeeze(0)
        # Compare against zeros of the same dtype and device; torch.isclose
        # rejects operands whose dtypes differ.
        is_zero = torch.isclose(token_tensor, torch.zeros_like(token_tensor))
        # Find the surviving token ids in one vectorised call instead of
        # truth-testing every vocabulary entry in a Python loop.
        kept_ids = torch.nonzero(~is_zero).flatten().tolist()
        for token_id in kept_ids:
            actual_token = tokenizer.decode(token_id)
            token_prob = token_tensor[token_id]
            print(f"Probability of '{actual_token}':", end=' ')
            print(f"{token_prob.item():.2%}")
            token_probabilities[actual_token] = token_prob
        print("----")
        distributions.append(token_probabilities)
    return distributions

In [13]:
# Per generated step: decoded token -> probability, keeping only tokens whose
# probability is not numerically zero (each kept token is also printed).
distributions = get_token_distributions(probabilities)

Probability of ' a': 23.01%
Probability of ' the': 65.20%
Probability of ' my': 4.35%
Probability of ' World': 5.35%
Probability of ' National': 2.10%
----
Probability of ' ': 25.77%
Probability of ' first': 16.99%
Probability of ' last': 25.77%
Probability of ' day': 28.59%
Probability of ' birthday': 2.89%
----


### Entropy + Plotting

In [73]:
def plot_token_probabilities(token_prob_dict, path, filename):
    """Save a bar chart of one token distribution, titled with its normalized entropy.

    Args:
        token_prob_dict: Mapping from token string to a probability object
            exposing ``.item()`` (for example a 0-dim tensor).
        path: Directory the figure is written to; created if missing.
        filename: File name of the figure inside ``path``.

    The entropy is computed with the natural logarithm over the strictly
    positive probabilities and divided by ``log(number of tokens)``. With
    fewer than two tokens that normalizer is not positive, so the normalized
    entropy is reported as 0.
    """
    # Spaces are shown as underscores so leading-space tokens stay visible on the axis.
    token_labels = [token.replace(" ", "_") for token in token_prob_dict.keys()]
    probabilities = [p.item() for p in token_prob_dict.values()]

    entropy = -sum(p * np.log(p) for p in probabilities if p > 0)
    # log(1) == 0 and log(0) == -inf: only normalize when there are at least two tokens.
    max_entropy = np.log(len(probabilities)) if len(probabilities) > 1 else 0.0
    normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0.0

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(token_labels, probabilities, color='lightblue', edgecolor='black', linewidth=0.5)

    # Axis labels and title
    ax.set_xlabel('Tokens', fontsize=14, family='sans-serif')
    ax.set_ylabel('P(token)', fontsize=14, family='sans-serif')
    ax.set_title(
        f'Token Probabilities\nNormalized Entropy: {normalized_entropy:.3f}',
        fontsize=16, family='sans-serif', fontweight='bold',
    )

    # Tick styling and a plain white background without a grid
    ax.tick_params(axis='both', labelsize=12)
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(0)
        tick_label.set_ha('right')
    ax.set_facecolor('white')
    ax.grid(False)

    # Keep only thin left/bottom spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(0.5)
    ax.spines['bottom'].set_linewidth(0.5)

    fig.tight_layout()

    # Save into the directory that was requested (and just created), not a fixed one.
    os.makedirs(path, exist_ok=True)
    full_path = os.path.join(path, filename)
    fig.savefig(full_path)
    # Close explicitly so figures do not accumulate when called in a loop.
    plt.close(fig)
    print(f"Plot saved to {full_path}")

In [74]:
# One chart per generated step, named after the step index.
for step_index, step_distribution in enumerate(distributions):
    plot_token_probabilities(step_distribution, path="plots", filename=f"token_{step_index}")

Plot saved to plots/token_0
Plot saved to plots/token_1


In [71]:
def get_entropies(distributions):
    """Compute the normalized entropy of every token distribution.

    Args:
        distributions: Iterable of dicts mapping token string to a 0-dim
            probability tensor.

    Returns:
        A list with one value per distribution: the natural-log entropy of
        its strictly positive probabilities divided by
        ``log(number of tokens)``. Distributions with fewer than two tokens
        have no positive normalizer and yield 0.
    """
    def calculate_entropy(probs, n):
        # log(n) is not positive for n < 2, so there is nothing to normalize by.
        if n < 2:
            return np.float64(0.0)
        # Drop zero entries instead of adding an epsilon inside the log;
        # p * log(p) tends to 0 as p -> 0, so they contribute nothing.
        positive = probs[probs > 0]
        entropy = -torch.sum(positive * torch.log(positive)).item()
        return entropy / np.log(n)

    entropies = []
    for distribution in distributions:
        # torch.stack rejects an empty list, so give empty distributions an empty tensor.
        if distribution:
            probabilities = torch.stack(list(distribution.values()))
        else:
            probabilities = torch.empty(0)
        print(probabilities)
        entropies.append(calculate_entropy(probabilities, len(distribution)))
    print(f"Entropy for eachs token distribution: {entropies}")
    return entropies

In [72]:
# Normalized entropy per generated step, echoed once more as a sanity check.
entropies = get_entropies(distributions)
print("--check--", entropies, sep="\n")

tensor([0.2301, 0.6520, 0.0435, 0.0535, 0.0210], device='cuda:0')
tensor([0.2577, 0.1699, 0.2577, 0.2859, 0.0289], device='cuda:0')
Entropy for eachs token distribution: [np.float64(0.6156799224784946), np.float64(0.9073807156661494)]
--check--
[np.float64(0.6156799224784946), np.float64(0.9073807156661494)]
