In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
import torch.nn.functional as F
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt

In [2]:
device = str(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
print(device)

cuda


In [3]:
from huggingface_hub import HfFolder, whoami
load_dotenv()
hf_token = os.getenv('HF_TOKEN')
HfFolder.save_token(hf_token)
user = whoami()
print(f"logged in as {user["name"]}")

logged in as M00nl8tshad0w


In [4]:
def load_model(model_name, local_dir="./models/llama3_70b"):
    if os.path.exists(local_dir):
        print(f"Loading model from local directory: {local_dir}")
        tokenizer = AutoTokenizer.from_pretrained(local_dir)
        model = AutoModelForCausalLM.from_pretrained(local_dir, device_map="auto", torch_dtype="auto")
    else:
        print(f"Local directory not found. Downloading model '{model_name}' from Hugging Face Hub...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

        os.makedirs(local_dir, exist_ok=True)
        tokenizer.save_pretrained(local_dir)
        model.save_pretrained(local_dir)
        print(f"Model downloaded and saved locally to: {local_dir}")

    return tokenizer, model

tokenizer, model = load_model(model_name="meta-llama/Llama-3.3-70B-Instruct",
                              local_dir="/home/max/Studium/Leipzig/Semster6/Math_and_ML/hf_models/llama3_70b/")

Loading model from local directory: /home/max/Studium/Leipzig/Semster6/Math_and_ML/hf_models/llama3_70b/


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [5]:
def load_hf_dataset(dataset_name, subset="default", local_dir="~/hf_datasets/OpenR1_Math_220k/"):
    local_dir = os.path.expanduser(local_dir)
    os.makedirs(local_dir, exist_ok=True)
    return load_dataset(dataset_name, subset, cache_dir=local_dir)

math_dataset = load_hf_dataset(dataset_name="open-r1/OpenR1-Math-220k",
                               local_dir="/home/max/Studium/Leipzig/Semster6/Math_and_ML/hf_datasets/open-r1/OpenR1-Math-220k")

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer(["Today is"], return_tensors="pt")
inputs = inputs.to(device)

In [7]:
# Example 1: Print the scores for each token generated with Greedy Search
outputs = model.generate(**inputs, max_new_tokens=2, return_dict_in_generate=True, output_scores=True)
print(f"Generated: {outputs}")
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, normalize_logits=True
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated: GenerateDecoderOnlyOutput(sequences=tensor([[128000,  15724,    374,    279,   1566]], device='cuda:0'), scores=(tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0')), logits=None, attentions=None, hidden_states=None, past_key_values=<transformers.cache_utils.DynamicCache object at 0x764d986a4d70>)


In [8]:
# input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for encoder-decoder models, like BART or T5.
input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
print("| token | token string | log probability | probability")
for tok, score in zip(generated_tokens[0], transition_scores[0]):
    # | token | token string | log probability | probability
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.detach().cpu().numpy():.3f} | {np.exp(score.detach().cpu().numpy()):.2%}")

| token | token string | log probability | probability
|   279 |  the     | -0.428 | 65.20%
|  1566 |  last    | -1.356 | 25.77%


In [None]:
logits = outputs.scores  # This is a list of logits for each token generated
# Convert logits to probabilities using softmax
probabilities = [torch.nn.functional.softmax(logit, dim=-1) for logit in logits]

In [23]:
print(logits)
print(logits[0].shape)
print(probabilities[0])

(tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]], device='cuda:0'))
torch.Size([1, 128256])
tensor([[0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


In [27]:
exp = probabilities[0]
print(exp.shape)
exp = exp.squeeze(0)
print(exp.shape)
print(exp[0])

torch.Size([1, 128256])
torch.Size([128256])
tensor(0., device='cuda:0')


In [28]:
prob_values = torch.isclose(exp, torch.tensor(float(0)))
print(prob_values)

tensor([True, True, True,  ..., True, True, True], device='cuda:0')


In [42]:
token_probabilities = {}
sum_probs = 0
for i, bool in enumerate(prob_values):
    if not bool:
        actual_token = tokenizer.decode(i)
        token_prob = exp[i]
        print(f"Probability of '{actual_token}':", end=' ')
        print(f"{token_prob.item():.2%}")
        token_probabilities[actual_token] = token_prob
        sum_probs += token_prob
print(sum_probs)

Probability of ' a': 23.01%
Probability of ' the': 65.20%
Probability of ' my': 4.35%
Probability of ' World': 5.35%
Probability of ' National': 2.10%
tensor(1., device='cuda:0')


In [14]:
def calculate_entropy(probs):
    return -torch.sum(probs * torch.log(probs + 1e-10), dim=-1).item()

In [15]:
# Iterate over the probabilities of each generated token and compute entropy
entropies = []
for prob in probabilities:
    entropy = calculate_entropy(prob)
    entropies.append(entropy)

print("Entropies for each token:", entropies)

Entropies for each token: [0.9908984899520874, 1.4603729248046875]


In [79]:
generated_token_ids = outputs.sequences[0][inputs.input_ids.shape[1]:]

# Decode token IDs into strings
generated_tokens = tokenizer.decode(generated_token_ids, skip_special_tokens=True)

for i, (token, entropy) in enumerate(zip(generated_tokens, entropies)):
    print(f"Token: {token}, Entropy: {entropy:.3f}")

Token:  , Entropy: 0.991
Token: t, Entropy: 1.460


In [93]:
def plot_token_probabilities(token_prob_dict):
    
    sorted_tokens = sorted(token_prob_dict.keys())
    probabilities = [p.item() for p in token_prob_dict.values()]
    #sorted_tokens = sorted(token_prob_dict.items(), key=lambda x: x[1], reverse=True)
    #tokens, probabilities = zip(*sorted_tokens)

    # If probabilities are tensors, detach and move them to numpy
    #probabilities = [p.detach().cpu().numpy() for p in probabilities]

    # Create a figure and axis
    plt.figure(figsize=(12, 6))

    # Create the bar plot
    plt.bar(sorted_tokens, probabilities, color='blue', edgecolor='black', linewidth=0.5)

    # Set axis labels and title
    plt.xlabel('Tokens', fontsize=14, family='sans-serif')
    plt.ylabel('P(token)', fontsize=14, family='sans-serif')
    plt.title('Token Probabilities', fontsize=16, family='sans-serif', fontweight='bold')

    # Customize ticks and axes
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.gca().set_facecolor('white')
    plt.grid(False)

    # Clean up spines
    ax = plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(0.5)
    ax.spines['bottom'].set_linewidth(0.5)

    # Tight layout for better spacing
    plt.tight_layout()

    # Show the plot
    plt.show()

In [None]:
plot_token_probabilities(token_probabilities)