In [1]:
import torch
import json
import argparse
import os
import pandas as pd


# Name of the experiment folder (below 'experiments/') to evaluate.
experiment_name = "llama2_test_cot"
# String flag: when set to "yes", the rerun*.pt tensors found in the
# experiment folder are merged over the original output tensor.
rerun = "no"

experiment_path = os.path.join('experiments', experiment_name)

#==========
# Metadata
#==========
# JSON is UTF-8 encoded; pass the encoding explicitly so decoding does not
# depend on the platform's locale default.
metadata_path = os.path.join(experiment_path, "metadata.json")
with open(metadata_path, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Experiment settings recorded in metadata.json (a KeyError here means the
# metadata file lacks the expected entry).
model_name = metadata["model"]
dataset = metadata["dataset"]
prompting_technique = metadata['prompting_technique']

#==========
# Result Tensor
#==========
# Scan the experiment folder once and collect the tensor files.
# os.listdir returns entries in arbitrary order, so sort them to make the
# selection below (and the order in which reruns are applied) deterministic.
output_candidates = []
reruns = []
for filename in sorted(os.listdir(experiment_path)):
    if not filename.endswith(".pt"):
        continue
    if filename.startswith("output"):
        output_candidates.append(os.path.join(experiment_path, filename))
    elif filename.startswith("rerun"):
        reruns.append(os.path.join(experiment_path, filename))

# Fail with a clear message instead of a NameError further down when the
# experiment folder contains no output tensor.
if not output_candidates:
    raise FileNotFoundError(
        f"No 'output*.pt' file found in {experiment_path!r}"
    )

# When several output files exist, the last one in sorted order is used.
output_tensor_path = output_candidates[-1]

# NOTE(review): torch.load unpickles the file content -- only load tensors
# produced by trusted runs.
results = torch.load(output_tensor_path)
print(f"{output_tensor_path=}")

if rerun == "yes":
    # Load and merge rerun results
    for rerun_path in reruns:
        rerun_tensor = torch.load(rerun_path)
        results.update(rerun_tensor)  # overwrite buggy samples with rerun results
    print(f"Rerun_paths = {reruns}")

#==========
# Checking for duplicates
#==========
from evaluation_utils import check_for_duplicate_questions

# Each reported entry is unpacked as (question, first key, second key).
duplicate_entries = check_for_duplicate_questions(exp_tensor=results)
if not duplicate_entries:
    print("No duplicate questions found.")
else:
    print("\nDUPLICATE QUESTIONS DETECTED:")
    for question, key1, key2 in duplicate_entries:
        print(f"Question: {question}\nFound in: {key1} and {key2}\n")


#==========
# Evaluation
#==========
from evaluation_utils import (
    calculate_accuracy,
    compute_entropy,
    get_latency,
    get_tokens_per_prompt,
    compute_logtoku_uncertainty,
    plot_logtoku_quadrants,
    plot_cosine_violin,
    plot_entropy_violin,
)


def _metric_frame(metric_by_prompt, value_column):
    """Turn a {prompt_id: value} mapping into a two-column DataFrame."""
    return pd.DataFrame(
        list(metric_by_prompt.items()),
        columns=["prompt_id", value_column],
    )


# Per-prompt metrics computed from the loaded result tensor.
accuracy, correctness_dict, answer_dict = calculate_accuracy(
    exp_tensor=results,
    prompting_technique=prompting_technique,
)
entropy = compute_entropy(
    exp_tensor=results,
    prompting_technique=prompting_technique,
    normalize=True,
)
latency_per_prompt = get_latency(exp_tensor=results)
tokens_per_prompt = get_tokens_per_prompt(exp_tensor=results)
logtoku_results = compute_logtoku_uncertainty(
    exp_tensor=results,
    prompting_technique=prompting_technique,
)

# One DataFrame per metric, all keyed by 'prompt_id'.
answer_rows = [
    (prompt_id, answers[0], answers[1])
    for prompt_id, answers in answer_dict.items()
]
df_answers = pd.DataFrame(
    answer_rows,
    columns=["prompt_id", "llm_answer", "ground_truth"],
)
df_correct = _metric_frame(correctness_dict, "correct")
df_entropy = _metric_frame(entropy, "entropy")
df_latency = _metric_frame(latency_per_prompt, "latency")
df_tokens = _metric_frame(tokens_per_prompt, "tokens_used")

df_logtoku = pd.DataFrame.from_dict(logtoku_results, orient='index')
df_logtoku = df_logtoku.reset_index()
df_logtoku = df_logtoku.rename(columns={'index': 'prompt_id'})

# Join every metric frame onto the entropy frame via 'prompt_id'
# (default merge, i.e. only prompt ids present in all frames are kept).
df_merged = df_entropy
for metric_df in (df_latency, df_tokens, df_correct, df_answers, df_logtoku):
    df_merged = df_merged.merge(metric_df, on="prompt_id")
df_merged.to_csv(f"{experiment_path}/evaluation_results.csv", index=False)

# Render the LogTokU quadrant plot next to the CSV export.
plot_path = f"{experiment_path}/logtoku_quadrants.png"
plot_logtoku_quadrants(df_merged, output_path=plot_path)
print(f"Saved LogTokU quadrant plot to: {plot_path}")

# Write out the ids of all samples marked "buggy" so they can be rerun later.
# The id is the correctness_dict key with every "prompt" substring removed.
buggy_samples_indices = [
    prompt_key.replace("prompt", "")
    for prompt_key, status in correctness_dict.items()
    if status == "buggy"
]
df_buggy_indices = pd.DataFrame(
    buggy_samples_indices,
    columns=["buggy_prompt_ids"],
)
df_buggy_indices.to_csv(f"{experiment_path}/buggy_prompts_to_rerun.csv")

#==========
# Compute average values
#==========

# ===== Entropy over all samples except buggy ones =====
# Entries whose entropy is None are excluded from the average
# (presumably these are the buggy samples -- confirm in compute_entropy).
entropies_list = list(entropy.values())
cleaned_list = [x for x in entropies_list if x is not None]
if cleaned_list:
    average_entropy = sum(cleaned_list) / len(cleaned_list)
else:
    # No usable entropy value at all; sentinel string kept unchanged.
    average_entropy = "Bug occured."

# ===== Entropy over all correct answered prompts =====
# NOTE: from here on df_correct is the subset of df_merged with correct
# answers; it replaces the per-prompt correctness frame of the same name.
df_correct = df_merged[df_merged["correct"] == "yes"]
if len(df_correct) > 0:
    average_entropy_correct = df_correct["entropy"].mean()
else:
    average_entropy_correct = "no correct samples"

# ===== Entropy over all incorrect answered prompts =====
df_incorrect = df_merged[df_merged["correct"] == "no"]
if len(df_incorrect) > 0:
    average_entropy_incorrect = df_incorrect["entropy"].mean()
else:
    # Was "no correct samples" (copy-paste slip); this branch is reached
    # when there are no *incorrect* samples.
    average_entropy_incorrect = "no incorrect samples"

# Compare the entropy distributions of correct and incorrect answers.
plot_entropy_violin(df_correct, df_incorrect)


output_tensor_path='experiments/llama2_test_cot/output_2025-06-11_11-50.pt'
No duplicate questions found.
Saved LogTokU quadrant plot to: experiments/llama2_test_cot/logtoku_quadrants.png


  plt.show()
