In [None]:
# We want to load all experiments together into one big dataframe of this shape:
# [ "gsm8k" : ["Llama-3.1-8B-Instruct" : { <experiment_data> },
#               "Qwen-3" : { <experiment_data> }],
#    "creative_writing" : ["Llama-3.1-8B-Instruct" : { <experiment_data> }, 
#                           "Qwen-3" : { <experiment_data> }]
# ]

# And each experiment_data has this shape:
# [ "prompt0" : { <prompt_data> },
#    "prompt1" : { <prompt_data> }
# ]

# And each prompt_data has this shape:
# {
#  "top_p_tokens" : [ <list of lists of top p token ids> ],
#  "top_p_probs"  : [ <list of lists of top p token probabilities (after softmax)> ],
#  "top_p_logits" : [ <list of lists of top p token logits (before softmax)> ],
#  "top_p_generated_tokens" : [ <list of lists of top p tokens decoded> ],
#  "top_p_entropies" : [ <list of entropies of top p tokens> ],
#  "full_entropies"  : [ <list of entropies over full distribution> ],
#   "prompt" : "<prompt that was used>",
#  "correct" : "<did the model answer correctly (for math questions)>"
# }
# We need top_p_tokens
# We don't need top_p_probs (they can be calculated from top_p_logits via softmax)
# We don't need top_p_logits (because we don't look at those for now)
# We don't need top_p_generated_tokens (they can be decoded from top_p_tokens)
# We don't need top_p_entropies (we want to explore full entropy!)
# We need full_entropies, because we can't save all 32k logits to calculate it later.
# We need prompt
# We don't need correct (for now we don't care about that)

#dict_keys(['top_p_tokens', 'top_p_probs', 'top_p_logits', 'generated_tokens', 'entropies', 'cosines', 'prompt', 'correct'])

In [13]:
import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM
# Round-trip check of the tokenizer: encode a sample string to token ids, print the
# ids, decode them back to text and print that too.
model = "meta-llama/Llama-3.1-8B-Instruct"  # model id string; only its tokenizer is loaded in this cell
tokenizer = AutoTokenizer.from_pretrained(model)

# The sample repeats one sentence six times. Some repeats are joined with ", " and
# others with "," (no space), so the literal must be kept exactly as written.
tokens = tokenizer.encode("Hello world, my name is Julian. How are you?, Hello world, my name is Julian. How are you?, Hello world, my name is Julian. How are you?,Hello world, my name is Julian. How are you?, Hello world, my name is Julian. How are you?,Hello world, my name is Julian. How are you?")
print(tokens)
# Decode with special tokens skipped so only the text content is printed.
text = tokenizer.decode(tokens, skip_special_tokens=True)
print(text)

[128000, 9906, 1917, 11, 856, 836, 374, 38897, 13, 2650, 527, 499, 12909, 22691, 1917, 11, 856, 836, 374, 38897, 13, 2650, 527, 499, 12909, 22691, 1917, 11, 856, 836, 374, 38897, 13, 2650, 527, 499, 12909, 9906, 1917, 11, 856, 836, 374, 38897, 13, 2650, 527, 499, 12909, 22691, 1917, 11, 856, 836, 374, 38897, 13, 2650, 527, 499, 12909, 9906, 1917, 11, 856, 836, 374, 38897, 13, 2650, 527, 499, 30]
Hello world, my name is Julian. How are you?, Hello world, my name is Julian. How are you?, Hello world, my name is Julian. How are you?,Hello world, my name is Julian. How are you?, Hello world, my name is Julian. How are you?,Hello world, my name is Julian. How are you?


In [2]:
from data_prep import prepare_dataset_for_correlation_analysis

# Build the "writingprompts" dataset via the project helper.
# NOTE(review): the second argument is presumably the number of prompts to keep
# (len(dataset) prints 100) — confirm against data_prep.
dataset = prepare_dataset_for_correlation_analysis("writingprompts", 100)

print(len(dataset))
print(dataset[0])

# Print every prompt. The loop index was never used, so iterate over the
# "prompt" column directly instead of going through enumerate().
for prompt in dataset["prompt"]:
    print(prompt)

100
{'prompt': "[ WP ] Every person in the world undergoes a `` goodness '' test . It 's designed to give a score from 1 to 200 , where 1 is pure evil , and 200 is an angel in human body . Then the world is divided into 200 zones , where people can live among their own kind .\n"}
[ WP ] Every person in the world undergoes a `` goodness '' test . It 's designed to give a score from 1 to 200 , where 1 is pure evil , and 200 is an angel in human body . Then the world is divided into 200 zones , where people can live among their own kind .

[ WP ] Space mining is on the rise . The Space tanker Exxon Valdez 2.0 crash and spill its cargo . Write a news story covering the event .

[ WP ] `` I wo n't have time to explain all of this to them . '' Start or end your story with this sentence .

[ CW ] Write about a song . Each sentence must start with the next letter in the song title .

[ EU ] You live in Skyrim . It is your job to keep lit all the candles in the abandoned caves and dungeons and 

In [None]:
# Dict[str, Dict[str, List[Dict[str, Tensor]]]]
# dataset_name -> model_name -> list of entries

import os
import torch

def load_results(results_root="results", map_location=None):
    """Load every saved experiment file under ``results_root`` into a nested dict.

    Expected layout: ``<results_root>/<dataset_name>/<model_name>/<file>.pt``
    with at most one ``.pt`` file per model folder. Non-directory entries at
    the dataset and model level are ignored, and model folders without a
    ``.pt`` file are skipped.

    Args:
        results_root: Directory that holds one sub-directory per dataset.
        map_location: Forwarded unchanged to ``torch.load``. ``None`` (the
            default) keeps the behavior of a plain ``torch.load(path)``; pass
            e.g. ``"cpu"`` to load results on a machine without the device
            they were saved from.

    Returns:
        ``{dataset_name: {model_name: data}}`` where ``data`` is whatever
        ``torch.load`` returns for that model's ``.pt`` file. Dataset and
        model names are inserted in sorted order so the result is
        deterministic across runs and file systems. A dataset folder with no
        loadable model still appears, mapped to an empty dict.

    Raises:
        FileNotFoundError: If ``results_root`` does not exist.
        ValueError: If a model folder contains more than one ``.pt`` file.
    """
    if not os.path.exists(results_root):
        import errno

        # Report the resolved absolute path: a relative root depends on the
        # notebook's working directory, which is the usual cause of this error.
        raise FileNotFoundError(
            errno.ENOENT,
            "results_root does not exist",
            os.path.abspath(results_root),
        )

    results = {}

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for dataset_name in sorted(os.listdir(results_root)):
        dataset_path = os.path.join(results_root, dataset_name)
        if not os.path.isdir(dataset_path):
            continue

        results[dataset_name] = {}

        for model_name in sorted(os.listdir(dataset_path)):
            model_path = os.path.join(dataset_path, model_name)
            if not os.path.isdir(model_path):
                continue

            # Expect one .pt file per model folder
            pt_files = [f for f in os.listdir(model_path) if f.endswith(".pt")]
            if not pt_files:
                continue
            if len(pt_files) > 1:
                raise ValueError(f"Multiple .pt files in {model_path}, expected only one.")

            file_path = os.path.join(model_path, pt_files[0])
            # SECURITY: torch.load unpickles the file; only load results you
            # produced yourself or otherwise trust.
            results[dataset_name][model_name] = torch.load(
                file_path, map_location=map_location
            )

    return results


In [2]:
results = load_results()
# `results` is a dict keyed by dataset name (dataset -> model -> entries), so it
# cannot be indexed with 0, and `.keys` must be called to list the keys.
# Show which models were loaded for each dataset.
print({dataset_name: list(models.keys()) for dataset_name, models in results.items()})

FileNotFoundError: [Errno 2] No such file or directory: '/results'