In [1]:
import sys
import os
import random
import gc
import time
import torch
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
from scipy.sparse import csr_matrix
import itertools
from scipy.stats import spearmanr, pearsonr, kendalltau, rankdata
from sklearn.metrics import ndcg_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator
import nltk
# Fetch the Punkt tokenizer data; nltk.sent_tokenize is used later to split paragraphs.
nltk.download('punkt')
# Restrict CUDA device visibility for this process to GPU index 2.
os.environ["CUDA_VISIBLE_DEVICES"] = "2" 
# Put the parent of the working directory on sys.path before the SHapRAG import
# (presumably that is where the SHapRAG package lives — confirm).
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)
# NOTE(review): the star import hides which names the notebook relies on
# (ContextAttribution is used later); consider importing them explicitly.
from SHapRAG import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the MuSiQue training file; lines=True parses it as JSON Lines (one record per line).
df=pd.read_json("../data/musique/musique_ans_v1.0_train.jsonl", lines=True)

In [3]:
def get_titles(lst, max_paragraphs=10):
    """Return paragraph texts with the supporting paragraphs listed first.

    Despite its name, this collects each entry's ``'paragraph_text'``, not a title.

    Args:
        lst: List of dicts, each with a ``'paragraph_text'`` key and an
            optional ``'is_supporting'`` flag.
        max_paragraphs: Maximum number of texts to return (default 10).

    Returns:
        A list of at most ``max_paragraphs`` texts: every text whose entry has
        ``is_supporting == True`` (input order), followed by the remaining
        texts (input order) that do not also occur among the supporting ones.

    Raises:
        KeyError: If an inspected entry has no ``'paragraph_text'`` key.
    """
    # Texts whose entry is flagged as supporting.
    supporting = [d['paragraph_text'] for d in lst if d.get('is_supporting') == True]
    # Set lookup avoids rescanning the supporting list for every other entry.
    supporting_texts = set(supporting)
    # Entries that are unflagged (or flagged False) and not duplicates of a supporting text.
    others = [
        d['paragraph_text']
        for d in lst
        if d.get('is_supporting') != True and d['paragraph_text'] not in supporting_texts
    ]
    # Supporting texts take priority when the combined list is truncated.
    return (supporting + others)[:max_paragraphs]

# Replace each row's paragraph records with the ordered, truncated text list.
# NOTE: not idempotent — after this runs the column holds plain strings, so
# re-running the cell makes get_titles fail on `d.get(...)`.
df["paragraphs"] = df["paragraphs"].apply(get_titles)

In [4]:
def _split_into_sentences(para_list):
    """Tokenize every paragraph in ``para_list`` and return one flat sentence list."""
    sentences = []
    for para in para_list:
        sentences.extend(nltk.sent_tokenize(para))
    return sentences


# One flat list of sentences per row, in paragraph order.
df['Sentences'] = df['paragraphs'].apply(_split_into_sentences)

In [None]:
# Spot-check the sentence list produced for the row labelled 24.
df.Sentences[24]

In [None]:
# Build the frame to persist by dropping the columns that are not saved.
# NOTE(review): the original read from `df_save` before any cell defined it
# (NameError on a fresh kernel). `df` is the only frame in scope that carries
# these columns — confirm no intermediate labelling step is missing.
df_save = df.drop(columns=['id', 'question_decomposition', 'paragraphs', 'answer_aliases', 'answerable'])

In [None]:
# Write df_save to disk. The original had the path outside the call
# parentheses, which is a syntax error.
# NOTE(review): this overwrites any existing file at the path, and to_csv
# writes the index as an unnamed first column by default — pass index=False
# if that column is unwanted when the file is read back.
df_save.to_csv('../data/musique/sen_labeled.csv')

In [None]:
# Reload the CSV at this path into df_save.
df_save=pd.read_csv('../data/musique/sen_labeled.csv')

In [None]:
# df["paragraphs"] = df["paragraphs"].apply(lambda p: p[:5]+ [p[1]] + p[5:])

In [5]:
SEED = 42
# Seed the global Python, NumPy and PyTorch RNGs so that any sampling that is
# not handed SEED explicitly is still repeatable after a kernel restart.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize Accelerator
accelerator_main = Accelerator(mixed_precision="fp16")

# Load Model
if accelerator_main.is_main_process:
    print("Main Script: Loading model...")
# model_path = "mistralai/Mistral-7B-Instruct-v0.3"
model_path = "meta-llama/Llama-3.1-8B-Instruct"
# model_path = "Qwen/Qwen2.5-3B-Instruct"

model_cpu = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# If the tokenizer has no pad token, reuse EOS and mirror the resulting id into
# the model config and (when present) the generation config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model_cpu.config.pad_token_id = tokenizer.pad_token_id
    if hasattr(model_cpu, 'generation_config') and model_cpu.generation_config is not None:
        model_cpu.generation_config.pad_token_id = tokenizer.pad_token_id

if accelerator_main.is_main_process:
    print("Main Script: Preparing model with Accelerator...")
prepared_model = accelerator_main.prepare(model_cpu)
# eval() is called on the unwrapped module; prepared_model is what gets passed
# to the attribution harness later.
unwrapped_prepared_model = accelerator_main.unwrap_model(prepared_model)
unwrapped_prepared_model.eval()
if accelerator_main.is_main_process:
    print("Main Script: Model prepared and set to eval.")

# Barrier: wait until every process has finished preparing the model.
accelerator_main.wait_for_everyone()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Main Script: Loading model...


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.58it/s]


Main Script: Preparing model with Accelerator...
Main Script: Model prepared and set to eval.


In [8]:
# Run every attribution method on the first `num_questions_to_run` questions
# and collect one result dict per question in `all_results`.
# num_questions_to_run=len(df.question)
num_questions_to_run=50
k_values = [1,2,3,4,5]
all_results=[]
LDSs=[]
r2_fm=[]
r2_cc=[]

# The cache directory depends only on the model name, so build and create it
# once instead of on every iteration.
utility_cache_base_dir = f"../Experiment_data/musique/{model_path.split('/')[1]}/sentence"
if accelerator_main.is_main_process: # Only main process creates directories
    os.makedirs(utility_cache_base_dir, exist_ok=True)

for i in tqdm(range(num_questions_to_run), disable=not accelerator_main.is_main_process):
    query = df.question[i]
    if accelerator_main.is_main_process:
        print(f"\n--- Question {i+1}/{num_questions_to_run}: {query[:60]}... ---")

    docs=df.Sentences[i]
    utility_cache_filename = f"utilities_q_idx{i}.pkl" # More robust naming
    current_utility_path = os.path.join(utility_cache_base_dir, utility_cache_filename)

    # Initialize Harness
    harness = ContextAttribution(
        items=docs,
        query=query,
        prepared_model=prepared_model,
        prepared_tokenizer=tokenizer,
        accelerator=accelerator_main,
        utility_cache_path=current_utility_path
    )

    print(f'Response: {harness.target_response}')
    print(f'GT: {df.answer[i]}')
    # Compute metrics
    results_for_query = {}
    if accelerator_main.is_main_process:
        m_samples_map = {"L": 364}
        # m_samples_map = {"L": 128, "XL":256, "XXL":512}
        T_iterations_map = {"L":40, "XL":50, "XXL":60}

        for size_key, num_s in m_samples_map.items():
            # For budgets other than "L", cap the sample count when it exceeds
            # the number of possible subsets of the documents.
            if 2**len(docs) < num_s and size_key != "L":
                actual_samples = max(1, 2**len(docs) - 1)
            else:
                actual_samples = num_s

            if actual_samples > 0:
                results_for_query[f"ContextCite{actual_samples}"], model_cc = harness.compute_contextcite(num_samples=actual_samples, seed=SEED)
                attributions, ints=harness.compute_spex(sample_budget=actual_samples,max_order=2)
                results_for_query[f"FBII{actual_samples}"]=attributions['fbii']
                results_for_query[f"Spex{actual_samples}"]=attributions['fourier']
                results_for_query[f"FSII{actual_samples}"]=attributions['fsii']
                results_for_query[f"FM_WeightsD{actual_samples}"], F, modelfm = harness.compute_wss(num_samples=actual_samples, seed=SEED, sampling="kernelshap",sur_type="fm", utility_mode="divergence_utility")
                # NOTE: this second call rebinds F and modelfm, so from here on
                # modelfm is the surrogate of the non-divergence fit.
                results_for_query[f"FM_Weights{actual_samples}"], F, modelfm = harness.compute_wss(num_samples=actual_samples, seed=SEED, sampling="kernelshap",sur_type="fm")
                # results_for_query[f"BetaShap{actual_samples}"] = harness.compute_beta_shap(num_iterations_max=T_iterations_map[size_key], beta_a=16, beta_b=1, max_unique_lookups=actual_samples, seed=SEED)
                # results_for_query[f"TMC{actual_samples}"] = harness.compute_tmc_shap(num_iterations_max=T_iterations_map[size_key], performance_tolerance=0.001, max_unique_lookups=actual_samples, seed=SEED)

        results_for_query["LOO"] = harness.compute_loo()
        results_for_query["ARC-JSD"] = harness.compute_arc_jsd()

        prob_topk = harness.evaluate_topk_performance(
            results_for_query,
            k_values,
            utility_type="probability"
        )

        div_topk = harness.evaluate_topk_performance(
            results_for_query,
            k_values,
            utility_type="divergence"
        )

        r2_fm.append([harness.r2_mse(30, modelfm, method='fm')])
        r2_cc.append([harness.r2_mse(30, model_cc, method='cc')])

        # One single-key dict per attribution method, computed before the
        # summary keys below are added to results_for_query.
        # NOTE(review): an earlier loop here built a dict that used
        # `harness.lds(..., 30, utl=True, model=modelfm)` for keys containing
        # "FM", but its result was unconditionally overwritten by this line and
        # its loop variable clobbered the question index `i`. It was removed as
        # dead work; restore the FM-specific call here if it was the intended one.
        LDS = [
            {method_name: harness.lds(results_for_query[method_name], 30)}
            for method_name in results_for_query
        ]

        results_for_query["topk_probability"] = prob_topk
        results_for_query["topk_divergence"] = div_topk
        results_for_query["LDS"] = LDS
        harness.save_utility_cache(current_utility_path)

        all_results.append(results_for_query)

  0%|          | 0/50 [00:00<?, ?it/s]


--- Question 1/50: When was the institute that owned The Collegian founded?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx0.pkl...
Successfully loaded 992 cached utility entries.
Response: Houston Baptist University was founded in 1960.
GT: 1960


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 230797.68it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 298/298 [00:00<00:00, 190243.93it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 298/298 [00:00<00:00, 215463.30it/s]
LOO Calls (logit-prob): 100%|██████████| 25/25 [00:00<00:00, 83819.02it/s]
LOO Calls (divergence_utility): 100%|██████████| 25/25 [00:00<00:00, 102700.88it/s]
  2%|▏         | 1/50 [00:18<15:22, 18.82s/it]

Main Process: Saving 992 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx0.pkl...
Save complete.

--- Question 2/50: What year saw the creation of the region where the county of... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx1.pkl...
Successfully loaded 1026 cached utility entries.
Response: 1994.
GT: 1994


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 302861.86it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 244872.43it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 271011.34it/s]
LOO Calls (logit-prob): 100%|██████████| 36/36 [00:00<00:00, 110619.01it/s]
LOO Calls (divergence_utility): 100%|██████████| 36/36 [00:00<00:00, 115175.40it/s]
  4%|▍         | 2/50 [00:45<18:57, 23.69s/it]

Main Process: Saving 1026 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx1.pkl...
Save complete.

--- Question 3/50: When was the abolishment of the studio that distributed The ... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx2.pkl...
Successfully loaded 1026 cached utility entries.
Response: 1999.
GT: 1999


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 294734.88it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 227664.28it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 266902.65it/s]
LOO Calls (logit-prob): 100%|██████████| 36/36 [00:00<00:00, 113274.53it/s]
LOO Calls (divergence_utility): 100%|██████████| 36/36 [00:00<00:00, 56807.73it/s]
  6%|▌         | 3/50 [01:13<19:58, 25.51s/it]

Main Process: Saving 1026 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx2.pkl...
Save complete.

--- Question 4/50: When was the publisher of Crux launched?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx3.pkl...
Successfully loaded 1010 cached utility entries.
Response: May 2001.
GT: 1998


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 305895.94it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 310/310 [00:00<00:00, 268144.82it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 310/310 [00:00<00:00, 315820.80it/s]
LOO Calls (logit-prob): 100%|██████████| 28/28 [00:00<00:00, 110897.56it/s]
LOO Calls (divergence_utility): 100%|██████████| 28/28 [00:00<00:00, 130634.61it/s]
  8%|▊         | 4/50 [01:34<18:02, 23.54s/it]

Main Process: Saving 1010 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx3.pkl...
Save complete.

--- Question 5/50: Jan Šindel's was born in what country?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx4.pkl...
Successfully loaded 1025 cached utility entries.
Response: Czech Republic.
GT: Czech Republic


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 301010.78it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 317/317 [00:00<00:00, 232853.65it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 317/317 [00:00<00:00, 156736.34it/s]
LOO Calls (logit-prob): 100%|██████████| 33/33 [00:00<00:00, 89877.94it/s]
LOO Calls (divergence_utility): 100%|██████████| 33/33 [00:00<00:00, 109503.19it/s]
 10%|█         | 5/50 [01:58<18:00, 24.02s/it]

Main Process: Saving 1025 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx4.pkl...
Save complete.

--- Question 6/50: What city is the person who broadened the doctrine of philos... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx5.pkl...
Successfully loaded 1024 cached utility entries.
Response: Copenhagen.
GT: Copenhagen


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 276480.74it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 269482.98it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 284737.81it/s]
LOO Calls (logit-prob): 100%|██████████| 36/36 [00:00<00:00, 112766.95it/s]
LOO Calls (divergence_utility): 100%|██████████| 36/36 [00:00<00:00, 124071.44it/s]
 12%|█▏        | 6/50 [02:31<19:43, 26.90s/it]

Main Process: Saving 1024 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx5.pkl...
Save complete.

--- Question 7/50: When was the baseball team winning the world series in 2015 ... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx6.pkl...
Successfully loaded 1034 cached utility entries.
Response: The Kansas City Royals were founded in 1969.
GT: 1969


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 241379.71it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 328/328 [00:00<00:00, 185734.00it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 328/328 [00:00<00:00, 196673.58it/s]
LOO Calls (logit-prob): 100%|██████████| 46/46 [00:00<00:00, 95608.52it/s]
LOO Calls (divergence_utility): 100%|██████████| 46/46 [00:00<00:00, 101386.22it/s]
 14%|█▍        | 7/50 [03:08<21:43, 30.31s/it]

Main Process: Saving 1034 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx6.pkl...
Save complete.

--- Question 8/50: Where did the Baldevins bryllup director die?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx7.pkl...
Successfully loaded 1005 cached utility entries.
Response: There is no information about the director's death in the provided context.
GT: Copenhagen


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 328752.51it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 306/306 [00:00<00:00, 312885.67it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 306/306 [00:00<00:00, 333885.80it/s]
LOO Calls (logit-prob): 100%|██████████| 27/27 [00:00<00:00, 92977.18it/s]
LOO Calls (divergence_utility): 100%|██████████| 27/27 [00:00<00:00, 125133.93it/s]
 16%|█▌        | 8/50 [03:30<19:17, 27.55s/it]

Main Process: Saving 1005 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx7.pkl...
Save complete.

--- Question 9/50: Who was thee first president of the association that wrote t... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx8.pkl...
Successfully loaded 1039 cached utility entries.
Response: G. Stanley Hall.
GT: G. Stanley Hall


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 244471.84it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 240199.20it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 263706.48it/s]
LOO Calls (logit-prob): 100%|██████████| 43/43 [00:00<00:00, 116058.60it/s]
LOO Calls (divergence_utility): 100%|██████████| 43/43 [00:00<00:00, 122274.63it/s]
 18%|█▊        | 9/50 [04:07<20:45, 30.39s/it]

Main Process: Saving 1039 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx8.pkl...
Save complete.

--- Question 10/50: Which major Russian city borders the body of water in which ... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx9.pkl...
Successfully loaded 1023 cached utility entries.
Response: Saint Petersburg.
GT: Saint Petersburg


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 270169.29it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 316/316 [00:00<00:00, 216852.10it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 316/316 [00:00<00:00, 280329.96it/s]
LOO Calls (logit-prob): 100%|██████████| 37/37 [00:00<00:00, 114700.11it/s]
LOO Calls (divergence_utility): 100%|██████████| 37/37 [00:00<00:00, 112537.53it/s]
 20%|██        | 10/50 [04:33<19:24, 29.11s/it]

Main Process: Saving 1023 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx9.pkl...
Save complete.

--- Question 11/50: When was the employer of John J. Collins established?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx10.pkl...
Successfully loaded 1027 cached utility entries.
Response: Yale Divinity School.
GT: 1822


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 279773.99it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 318/318 [00:00<00:00, 219156.86it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 318/318 [00:00<00:00, 261578.48it/s]
LOO Calls (logit-prob): 100%|██████████| 38/38 [00:00<00:00, 94198.32it/s]
LOO Calls (divergence_utility): 100%|██████████| 38/38 [00:00<00:00, 44187.29it/s]
 22%|██▏       | 11/50 [05:06<19:47, 30.44s/it]

Main Process: Saving 1027 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx10.pkl...
Save complete.

--- Question 12/50: When did Bush declare the war causing Kerry to criticize him... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx11.pkl...
Successfully loaded 1094 cached utility entries.
Response: The text does not explicitly state when Bush declared the war. However, it mentions that Bush relied on the resolution Kerry voted for in October 2002 to order the 2003 invasion of Iraq.
GT: 2003


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 209111.99it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 332/332 [00:00<00:00, 202017.83it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 332/332 [00:00<00:00, 226240.28it/s]
LOO Calls (logit-prob): 100%|██████████| 55/55 [00:00<00:00, 99605.66it/s]
LOO Calls (divergence_utility): 100%|██████████| 55/55 [00:00<00:00, 107898.37it/s]
 24%|██▍       | 12/50 [05:56<23:05, 36.46s/it]

Main Process: Saving 1094 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx11.pkl...
Save complete.

--- Question 13/50: What is the college Francis Walsingham attended an instance ... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx12.pkl...
Successfully loaded 1008 cached utility entries.
Response: King's College.
GT: college of the University of Cambridge


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 314659.24it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 296079.65it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 310946.74it/s]
LOO Calls (logit-prob): 100%|██████████| 29/29 [00:00<00:00, 110076.76it/s]
LOO Calls (divergence_utility): 100%|██████████| 29/29 [00:00<00:00, 112833.78it/s]
 26%|██▌       | 13/50 [06:20<19:59, 32.41s/it]

Main Process: Saving 1008 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx12.pkl...
Save complete.

--- Question 14/50: What type of university is the college Kyeon Mi-ri attended?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx13.pkl...
Successfully loaded 1009 cached utility entries.
Response: Sejong University.
GT: private university


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 350729.76it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 302911.20it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 323672.87it/s]
LOO Calls (logit-prob): 100%|██████████| 29/29 [00:00<00:00, 122123.31it/s]
LOO Calls (divergence_utility): 100%|██████████| 29/29 [00:00<00:00, 115403.05it/s]
 28%|██▊       | 14/50 [06:40<17:18, 28.86s/it]

Main Process: Saving 1009 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx13.pkl...
Save complete.

--- Question 15/50: In what year was the author of The Insider's Guide to the Co... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx14.pkl...
Successfully loaded 1028 cached utility entries.
Response: The text does not mention the author of The Insider's Guide to the Colleges.
GT: 1878


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 290362.62it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 234281.73it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 270245.00it/s]
LOO Calls (logit-prob): 100%|██████████| 36/36 [00:00<00:00, 107930.62it/s]
LOO Calls (divergence_utility): 100%|██████████| 36/36 [00:00<00:00, 115527.88it/s]
 30%|███       | 15/50 [07:06<16:17, 27.93s/it]

Main Process: Saving 1028 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx14.pkl...
Save complete.

--- Question 16/50: When was the territory covered by RIBA's Cambridge branch of... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx15.pkl...
Successfully loaded 1019 cached utility entries.
Response: 1966.
GT: 1994


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 298830.82it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 312/312 [00:00<00:00, 287672.64it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 312/312 [00:00<00:00, 304685.18it/s]
LOO Calls (logit-prob): 100%|██████████| 31/31 [00:00<00:00, 113260.82it/s]
LOO Calls (divergence_utility): 100%|██████████| 31/31 [00:00<00:00, 110846.91it/s]
 32%|███▏      | 16/50 [07:28<14:50, 26.19s/it]

Main Process: Saving 1019 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx15.pkl...
Save complete.

--- Question 17/50: What's the meaning of the name of the school that does not i... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx16.pkl...
Successfully loaded 1029 cached utility entries.
Response: Theravada.
GT: The School of the Elders


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 247363.36it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 235105.07it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 266530.47it/s]
LOO Calls (logit-prob): 100%|██████████| 36/36 [00:00<00:00, 107699.67it/s]
LOO Calls (divergence_utility): 100%|██████████| 36/36 [00:00<00:00, 109814.50it/s]
 34%|███▍      | 17/50 [08:12<17:15, 31.37s/it]

Main Process: Saving 1029 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx16.pkl...
Save complete.

--- Question 18/50: Where did the director who provided the lyrics to A Time for... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx17.pkl...
Successfully loaded 1049 cached utility entries.
Response: University College London.
GT: Northwestern's School of Communication


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 227090.09it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 334/334 [00:00<00:00, 188571.48it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 334/334 [00:00<00:00, 226976.27it/s]
LOO Calls (logit-prob): 100%|██████████| 49/49 [00:00<00:00, 97035.36it/s]
LOO Calls (divergence_utility): 100%|██████████| 49/49 [00:00<00:00, 108111.99it/s]
 36%|███▌      | 18/50 [08:53<18:22, 34.46s/it]

Main Process: Saving 1049 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx17.pkl...
Save complete.

--- Question 19/50: When did the country formerly known as Zaire become independ... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx18.pkl...
Successfully loaded 1038 cached utility entries.
Response: 1960.
GT: 1960


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 247003.18it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 322/322 [00:00<00:00, 175352.62it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 322/322 [00:00<00:00, 205503.03it/s]
LOO Calls (logit-prob): 100%|██████████| 42/42 [00:00<00:00, 109963.03it/s]
LOO Calls (divergence_utility): 100%|██████████| 42/42 [00:00<00:00, 50045.67it/s]
 38%|███▊      | 19/50 [09:27<17:45, 34.37s/it]

Main Process: Saving 1038 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx18.pkl...
Save complete.

--- Question 20/50: Where did Peter and Paul Fortress' designer die?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx19.pkl...
Successfully loaded 1016 cached utility entries.
Response: Domenico Trezzini's death location is not mentioned in the provided context.
GT: Saint Petersburg


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 314076.66it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 306017.98it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 262301.13it/s]
LOO Calls (logit-prob): 100%|██████████| 29/29 [00:00<00:00, 102732.11it/s]
LOO Calls (divergence_utility): 100%|██████████| 29/29 [00:00<00:00, 111285.28it/s]
 40%|████      | 20/50 [09:52<15:45, 31.53s/it]

Main Process: Saving 1016 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx19.pkl...
Save complete.

--- Question 21/50: When did the network which airs Alt for Norge start?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx20.pkl...
Successfully loaded 1017 cached utility entries.
Response: 5 December 1988.
GT: 1988


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 299299.48it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 312/312 [00:00<00:00, 283681.52it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 312/312 [00:00<00:00, 256794.12it/s]
LOO Calls (logit-prob): 100%|██████████| 31/31 [00:00<00:00, 99558.52it/s]
LOO Calls (divergence_utility): 100%|██████████| 31/31 [00:00<00:00, 124068.15it/s]
 42%|████▏     | 21/50 [10:18<14:21, 29.70s/it]

Main Process: Saving 1017 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx20.pkl...
Save complete.

--- Question 22/50: Who failed to take back what the French believed instrumenta... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx21.pkl...
Successfully loaded 1018 cached utility entries.
Response: The Russians failed to retake the Malakoff.
GT: the Russian defences


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 276180.65it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 316/316 [00:00<00:00, 157860.89it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 316/316 [00:00<00:00, 273503.93it/s]
LOO Calls (logit-prob): 100%|██████████| 37/37 [00:00<00:00, 51557.89it/s]
LOO Calls (divergence_utility): 100%|██████████| 37/37 [00:00<00:00, 116421.04it/s]
 44%|████▍     | 22/50 [10:50<14:11, 30.42s/it]

Main Process: Saving 1018 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx21.pkl...
Save complete.

--- Question 23/50: What is the field of work of the proposer of the modern synt... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx22.pkl...
Successfully loaded 1044 cached utility entries.
Response: Evolutionary biology.
GT: bio


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 236555.11it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 329/329 [00:00<00:00, 239487.33it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 329/329 [00:00<00:00, 280643.89it/s]
LOO Calls (logit-prob): 100%|██████████| 45/45 [00:00<00:00, 92295.20it/s]
LOO Calls (divergence_utility): 100%|██████████| 45/45 [00:00<00:00, 99130.08it/s]
 46%|████▌     | 23/50 [11:29<14:51, 33.00s/it]

Main Process: Saving 1044 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx22.pkl...
Save complete.

--- Question 24/50: When was the season of Greys Anatomy when Derek died filmed?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx23.pkl...
Successfully loaded 1044 cached utility entries.
Response: July 25, 2014.
GT: filming for the eleventh season would begin on July 25, 2014


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 237511.93it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 232463.97it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 244219.62it/s]
LOO Calls (logit-prob): 100%|██████████| 44/44 [00:00<00:00, 88640.43it/s]
LOO Calls (divergence_utility): 100%|██████████| 44/44 [00:00<00:00, 101400.76it/s]
 48%|████▊     | 24/50 [12:06<14:47, 34.15s/it]

Main Process: Saving 1044 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx23.pkl...
Save complete.

--- Question 25/50: When did the manufacturer of a pedometer accessory for the i... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx24.pkl...
Successfully loaded 1037 cached utility entries.
Response: The text does not mention when the manufacturer of the Nike+iPod pedometer became a publicly traded company.
GT: 1980


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 258723.38it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 325/325 [00:00<00:00, 244598.74it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 325/325 [00:00<00:00, 263105.35it/s]
LOO Calls (logit-prob): 100%|██████████| 40/40 [00:00<00:00, 108379.95it/s]
LOO Calls (divergence_utility): 100%|██████████| 40/40 [00:00<00:00, 122461.43it/s]
 50%|█████     | 25/50 [12:35<13:39, 32.78s/it]

Main Process: Saving 1037 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx24.pkl...
Save complete.

--- Question 26/50: What is the record label for the person who sang Beauty and ... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx25.pkl...
Successfully loaded 1114 cached utility entries.
Response: Peabo Bryson.
GT: Capitol Records


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 188554.61it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 333/333 [00:00<00:00, 175377.10it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 333/333 [00:00<00:00, 192781.67it/s]
LOO Calls (logit-prob): 100%|██████████| 65/65 [00:00<00:00, 108574.18it/s]
LOO Calls (divergence_utility): 100%|██████████| 65/65 [00:00<00:00, 91058.70it/s]
 52%|█████▏    | 26/50 [13:33<16:06, 40.25s/it]

Main Process: Saving 1114 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx25.pkl...
Save complete.

--- Question 27/50: Who is the employer of the Iranian scientist who co-invented... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx26.pkl...
Successfully loaded 1039 cached utility entries.
Response: MIT.
GT: M.I.T.


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 248976.95it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 214738.91it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 210616.92it/s]
LOO Calls (logit-prob): 100%|██████████| 43/43 [00:00<00:00, 110647.28it/s]
LOO Calls (divergence_utility): 100%|██████████| 43/43 [00:00<00:00, 110241.49it/s]
 54%|█████▍    | 27/50 [14:05<14:26, 37.67s/it]

Main Process: Saving 1039 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx26.pkl...
Save complete.

--- Question 28/50: How many championships in a row were won by the person who p... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx27.pkl...
Successfully loaded 1099 cached utility entries.
Response: 8 consecutive championships from 1959 to 1966.
GT: eight


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 207802.73it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 332/332 [00:00<00:00, 165992.24it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 332/332 [00:00<00:00, 122536.86it/s]
LOO Calls (logit-prob): 100%|██████████| 55/55 [00:00<00:00, 26598.26it/s]
LOO Calls (divergence_utility): 100%|██████████| 55/55 [00:00<00:00, 30013.88it/s]
 56%|█████▌    | 28/50 [14:45<14:05, 38.45s/it]

Main Process: Saving 1099 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx27.pkl...
Save complete.

--- Question 29/50: In what language is the star of Koyelaanchal fluent?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx28.pkl...
Successfully loaded 1043 cached utility entries.
Response: Hindi.
GT: Hindi


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 237364.22it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 221788.07it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 195347.87it/s]
LOO Calls (logit-prob): 100%|██████████| 44/44 [00:00<00:00, 100956.99it/s]
LOO Calls (divergence_utility): 100%|██████████| 44/44 [00:00<00:00, 117323.19it/s]
 58%|█████▊    | 29/50 [15:13<12:22, 35.34s/it]

Main Process: Saving 1043 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx28.pkl...
Save complete.

--- Question 30/50: What instrument did the artiste for Vi skall fara bortom mån... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx29.pkl...
Successfully loaded 995 cached utility entries.
Response: Guitar.
GT: violin


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 345491.44it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 298/298 [00:00<00:00, 275490.98it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 298/298 [00:00<00:00, 332597.82it/s]
LOO Calls (logit-prob): 100%|██████████| 25/25 [00:00<00:00, 119291.92it/s]
LOO Calls (divergence_utility): 100%|██████████| 25/25 [00:00<00:00, 126792.74it/s]
 60%|██████    | 30/50 [15:31<10:02, 30.13s/it]

Main Process: Saving 995 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx29.pkl...
Save complete.

--- Question 31/50: What year did the council which was seated in 1949 adopt the... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx30.pkl...
Successfully loaded 1015 cached utility entries.
Response: The year is not specified in the text.
GT: 2001


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 311895.13it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 308/308 [00:00<00:00, 230275.51it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 308/308 [00:00<00:00, 255861.68it/s]
LOO Calls (logit-prob): 100%|██████████| 30/30 [00:00<00:00, 52980.68it/s]
LOO Calls (divergence_utility): 100%|██████████| 30/30 [00:00<00:00, 44858.87it/s]
 62%|██████▏   | 31/50 [15:56<09:05, 28.71s/it]

Main Process: Saving 1015 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx30.pkl...
Save complete.

--- Question 32/50: When did the author of season 7 of game of thrones begin wri... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx31.pkl...
Successfully loaded 1052 cached utility entries.
Response: By the middle of 2010.
GT: 2010


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 233945.24it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 329/329 [00:00<00:00, 170109.22it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 329/329 [00:00<00:00, 256539.51it/s]
LOO Calls (logit-prob): 100%|██████████| 45/45 [00:00<00:00, 109100.39it/s]
LOO Calls (divergence_utility): 100%|██████████| 45/45 [00:00<00:00, 125161.59it/s]
 64%|██████▍   | 32/50 [16:39<09:54, 33.04s/it]

Main Process: Saving 1052 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx31.pkl...
Save complete.

--- Question 33/50: What languages are spoken, written or signed by the person t... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx32.pkl...
Successfully loaded 1049 cached utility entries.
Response: German.
GT: Italian


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 216987.87it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 330/330 [00:00<00:00, 223065.32it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 330/330 [00:00<00:00, 204418.89it/s]
LOO Calls (logit-prob): 100%|██████████| 47/47 [00:00<00:00, 94412.02it/s]
LOO Calls (divergence_utility): 100%|██████████| 47/47 [00:00<00:00, 100834.93it/s]
 66%|██████▌   | 33/50 [17:15<09:32, 33.70s/it]

Main Process: Saving 1049 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx32.pkl...
Save complete.

--- Question 34/50: What language did the leader of the least radicalized factio... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx33.pkl...
Successfully loaded 1031 cached utility entries.
Response: Mandarin.
GT: Chinese


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 253188.50it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 322/322 [00:00<00:00, 205066.18it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 322/322 [00:00<00:00, 245736.15it/s]
LOO Calls (logit-prob): 100%|██████████| 42/42 [00:00<00:00, 71061.22it/s]
LOO Calls (divergence_utility): 100%|██████████| 42/42 [00:00<00:00, 113652.11it/s]
 68%|██████▊   | 34/50 [17:51<09:11, 34.49s/it]

Main Process: Saving 1031 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx33.pkl...
Save complete.

--- Question 35/50: When did the majority of the patents held by the company the... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx34.pkl...
Successfully loaded 1004 cached utility entries.
Response: The text does not specify the exact date when the majority of the patents held by the Italian patents firm representing Sisvel expired. However, it mentions that the majority of MP3 patents expired in the US between 2007 and 2015.
GT: 2015


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 315830.92it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 310/310 [00:00<00:00, 296383.46it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 310/310 [00:00<00:00, 316204.82it/s]
LOO Calls (logit-prob): 100%|██████████| 28/28 [00:00<00:00, 115024.99it/s]
LOO Calls (divergence_utility): 100%|██████████| 28/28 [00:00<00:00, 118626.78it/s]
 70%|███████   | 35/50 [18:18<08:02, 32.15s/it]

Main Process: Saving 1004 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx34.pkl...
Save complete.

--- Question 36/50: What language is used by the person after whom the Panizzi l... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx35.pkl...
Successfully loaded 1041 cached utility entries.
Response: Italian.
GT: Italian


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 254836.70it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 330/330 [00:00<00:00, 260319.79it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 330/330 [00:00<00:00, 273325.50it/s]
LOO Calls (logit-prob): 100%|██████████| 41/41 [00:00<00:00, 99460.07it/s]
LOO Calls (divergence_utility): 100%|██████████| 41/41 [00:00<00:00, 107077.50it/s]
 72%|███████▏  | 36/50 [18:44<07:05, 30.41s/it]

Main Process: Saving 1041 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx35.pkl...
Save complete.

--- Question 37/50: What is the language for the person who translated Plato's w... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx36.pkl...
Successfully loaded 1024 cached utility entries.
Response: Latin.
GT: Italian


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 281996.06it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 317/317 [00:00<00:00, 246815.36it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 317/317 [00:00<00:00, 271069.19it/s]
LOO Calls (logit-prob): 100%|██████████| 33/33 [00:00<00:00, 111174.32it/s]
LOO Calls (divergence_utility): 100%|██████████| 33/33 [00:00<00:00, 119217.94it/s]
 74%|███████▍  | 37/50 [19:10<06:18, 29.09s/it]

Main Process: Saving 1024 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx36.pkl...
Save complete.

--- Question 38/50: What did Goring believe the person whom he refused to work w... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx37.pkl...
Successfully loaded 1025 cached utility entries.
Response: Kriegsmarine would gain control of more Luftwaffe units.
GT: control of more Luftwaffe units


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 269692.04it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 203805.48it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 244603.83it/s]
LOO Calls (logit-prob): 100%|██████████| 36/36 [00:00<00:00, 108240.10it/s]
LOO Calls (divergence_utility): 100%|██████████| 36/36 [00:00<00:00, 112431.08it/s]
 76%|███████▌  | 38/50 [19:45<06:09, 30.81s/it]

Main Process: Saving 1025 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx37.pkl...
Save complete.

--- Question 39/50: Who was the president of the body that petitioned the king o... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx38.pkl...
Successfully loaded 1034 cached utility entries.
Response: Continental Congress.
GT: President John Hancock


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 262685.25it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 325/325 [00:00<00:00, 221398.21it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 325/325 [00:00<00:00, 260441.12it/s]
LOO Calls (logit-prob): 100%|██████████| 40/40 [00:00<00:00, 101557.00it/s]
LOO Calls (divergence_utility): 100%|██████████| 40/40 [00:00<00:00, 108100.62it/s]
 78%|███████▊  | 39/50 [20:19<05:49, 31.73s/it]

Main Process: Saving 1034 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx38.pkl...
Save complete.

--- Question 40/50: What city is the place Kara Vâsıf Bey died located in?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx39.pkl...
Successfully loaded 1011 cached utility entries.
Response: Kadıköy.
GT: Istanbul


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 322638.77it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 310/310 [00:00<00:00, 280283.30it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 310/310 [00:00<00:00, 299455.15it/s]
LOO Calls (logit-prob): 100%|██████████| 28/28 [00:00<00:00, 99105.92it/s]
LOO Calls (divergence_utility): 100%|██████████| 28/28 [00:00<00:00, 102389.29it/s]
 80%|████████  | 40/50 [20:41<04:49, 28.96s/it]

Main Process: Saving 1011 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx39.pkl...
Save complete.

--- Question 41/50: What languages can the star of Accident on Hill Road speak, ... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx40.pkl...
Successfully loaded 1016 cached utility entries.
Response: The star of Accident on Hill Road, Farooq Sheikh, is a Hindi film actor.
GT: Hindi


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 300359.37it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 286641.30it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 302283.48it/s]
LOO Calls (logit-prob): 100%|██████████| 29/29 [00:00<00:00, 91180.52it/s]
LOO Calls (divergence_utility): 100%|██████████| 29/29 [00:00<00:00, 125396.72it/s]
 82%|████████▏ | 41/50 [21:07<04:12, 28.08s/it]

Main Process: Saving 1016 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx40.pkl...
Save complete.

--- Question 42/50: In what year did France host the finals of the competition f... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx41.pkl...
Successfully loaded 1046 cached utility entries.
Response: 1991.
GT: 2007


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 214037.10it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 331/331 [00:00<00:00, 184960.65it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 331/331 [00:00<00:00, 213226.02it/s]
LOO Calls (logit-prob): 100%|██████████| 50/50 [00:00<00:00, 86946.60it/s]
LOO Calls (divergence_utility): 100%|██████████| 50/50 [00:00<00:00, 89967.91it/s]
 84%|████████▍ | 42/50 [21:44<04:04, 30.55s/it]

Main Process: Saving 1046 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx41.pkl...
Save complete.

--- Question 43/50: In which year did the publisher of Roald Dahl's Guide to Rai... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx42.pkl...
Successfully loaded 1000 cached utility entries.
Response: 2001.
GT: 2001


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 326014.66it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 308/308 [00:00<00:00, 327713.25it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 308/308 [00:00<00:00, 364207.96it/s]
LOO Calls (logit-prob): 100%|██████████| 26/26 [00:00<00:00, 124916.27it/s]
LOO Calls (divergence_utility): 100%|██████████| 26/26 [00:00<00:00, 137518.16it/s]
 86%|████████▌ | 43/50 [22:05<03:15, 27.87s/it]

Main Process: Saving 1000 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx42.pkl...
Save complete.

--- Question 44/50: When was the inception of the company that released Poptropi... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx43.pkl...
Successfully loaded 1003 cached utility entries.
Response: July 1998
GT: 1998


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 300951.44it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 310/310 [00:00<00:00, 255148.01it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 310/310 [00:00<00:00, 306514.44it/s]
LOO Calls (logit-prob): 100%|██████████| 28/28 [00:00<00:00, 33689.19it/s]
LOO Calls (divergence_utility): 100%|██████████| 28/28 [00:00<00:00, 38130.04it/s]
 88%|████████▊ | 44/50 [22:27<02:36, 26.09s/it]

Main Process: Saving 1003 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx43.pkl...
Save complete.

--- Question 45/50: When did the distributer of Poptropica start?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx44.pkl...
Successfully loaded 1006 cached utility entries.
Response: 2007
GT: 1998


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 359398.93it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 306/306 [00:00<00:00, 249214.96it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 306/306 [00:00<00:00, 264357.78it/s]
LOO Calls (logit-prob): 100%|██████████| 27/27 [00:00<00:00, 102578.09it/s]
LOO Calls (divergence_utility): 100%|██████████| 27/27 [00:00<00:00, 123631.23it/s]
 90%|█████████ | 45/50 [22:51<02:07, 25.40s/it]

Main Process: Saving 1006 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx44.pkl...
Save complete.

--- Question 46/50: What is the source of the river where Southern California ge... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx45.pkl...
Successfully loaded 1009 cached utility entries.
Response: The Rocky Mountains of Colorado and Wyoming.
GT: La Poudre Pass


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 297028.53it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 308/308 [00:00<00:00, 240075.38it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 308/308 [00:00<00:00, 239009.37it/s]
LOO Calls (logit-prob): 100%|██████████| 30/30 [00:00<00:00, 47215.43it/s]
LOO Calls (divergence_utility): 100%|██████████| 30/30 [00:00<00:00, 104945.05it/s]
 92%|█████████▏| 46/50 [23:25<01:52, 28.06s/it]

Main Process: Saving 1009 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx45.pkl...
Save complete.

--- Question 47/50: When was the university Elizabeth Harwood attended formed?... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx46.pkl...
Successfully loaded 1038 cached utility entries.
Response: The text does not mention Elizabeth Harwood attending a university.
GT: 1972


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 269454.05it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 213530.64it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 319/319 [00:00<00:00, 252449.62it/s]
LOO Calls (logit-prob): 100%|██████████| 36/36 [00:00<00:00, 87787.76it/s]
LOO Calls (divergence_utility): 100%|██████████| 36/36 [00:00<00:00, 105003.44it/s]
 94%|█████████▍| 47/50 [23:51<01:21, 27.28s/it]

Main Process: Saving 1038 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx46.pkl...
Save complete.

--- Question 48/50: What is the population of the city where the torch event was... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx47.pkl...
Successfully loaded 1020 cached utility entries.
Response: 8,426,100
GT: 8,426,100


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 266584.02it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 316/316 [00:00<00:00, 215757.78it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 316/316 [00:00<00:00, 259933.33it/s]
LOO Calls (logit-prob): 100%|██████████| 34/34 [00:00<00:00, 102447.08it/s]
LOO Calls (divergence_utility): 100%|██████████| 34/34 [00:00<00:00, 122094.47it/s]
 96%|█████████▌| 48/50 [24:22<00:57, 28.52s/it]

Main Process: Saving 1020 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx47.pkl...
Save complete.

--- Question 49/50: Who is the spouse of one of the artist who have had concerts... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx48.pkl...
Successfully loaded 1044 cached utility entries.
Response: David Furnish is the spouse of Elton John.
GT: David Furnish


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 242222.22it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 230471.75it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 327/327 [00:00<00:00, 199119.83it/s]
LOO Calls (logit-prob): 100%|██████████| 43/43 [00:00<00:00, 77806.33it/s]
LOO Calls (divergence_utility): 100%|██████████| 43/43 [00:00<00:00, 82054.17it/s]
 98%|█████████▊| 49/50 [24:54<00:29, 29.43s/it]

Main Process: Saving 1044 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx48.pkl...
Save complete.

--- Question 50/50: What type of community is the municipality where Norbert Pfr... ---
Main Process: Attempting to load utility cache from ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx49.pkl...
Successfully loaded 1010 cached utility entries.
Response: Lana an der Etsch is a "comune" (municipality).
GT: comune


Computing utilities for ContextCite: 100%|██████████| 364/364 [00:00<00:00, 302262.26it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 235651.97it/s]
Computing utilities for WSS (kernelshap): 100%|██████████| 313/313 [00:00<00:00, 261830.31it/s]
LOO Calls (logit-prob): 100%|██████████| 29/29 [00:00<00:00, 113571.26it/s]
LOO Calls (divergence_utility): 100%|██████████| 29/29 [00:00<00:00, 110980.67it/s]
100%|██████████| 50/50 [25:19<00:00, 30.39s/it]

Main Process: Saving 1010 utility entries to ../Experiment_data/musique/Llama-3.1-8B-Instruct/sentence/utilities_q_idx49.pkl...
Save complete.





In [None]:
# Precision of every attribution method on every evaluated question.
# df_save.labels[j] is stored as a Python-literal string and parsed with
# ast.literal_eval — presumably the labelled sentence indices for question j
# (TODO confirm against the labelling step).

# Question indices left out of the precision evaluation.
# NOTE(review): the reason these questions are excluded is not recorded here — confirm.
EXCLUDED_QUESTIONS = {3, 6, 7, 10, 13, 14, 15, 16, 17, 19, 20}

# NaN marks "not evaluated". Initialising with zeros made every excluded
# question count as precision 0.0 in the column means below.
precs = np.full((len(all_results), len(methods)), np.nan)
for j, result in enumerate(all_results):
    if j in EXCLUDED_QUESTIONS:
        continue
    for n, m in enumerate(methods):
        precs[j, n] = precision(ast.literal_eval(df_save.labels[j]), result[m])

# Mean precision per method over the evaluated questions only.
np.nanmean(precs, axis=0)

In [7]:
# Result keys of the attribution methods being compared; the sampling-based
# ones carry the sample budget (actual_samples) as a suffix.
sampled_methods = ['ContextCite', 'FM_Weights', 'FM_WeightsD', 'Spex', 'FBII', 'FSII']
methods = [f'{name}{actual_samples}' for name in sampled_methods] + ['LOO', 'ARC-JSD']

# One independent {method: [per-question value, ...]} accumulator per metric.
topk_probs, topk_divs, LDSs = ({name: [] for name in methods} for _ in range(3))

for result in all_results:
    for name in methods:
        # Position 2 of each method's top-k series.
        # NOTE(review): which k this position corresponds to is decided where
        # all_results is built — confirm.
        topk_probs[name].append(result['topk_probability'][name][2])
        topk_divs[name].append(result['topk_divergence'][name][2])

        # result['LDS'] is scanned for the first record that mentions this
        # method; when none does, the question contributes no LDS value.
        lds_record = next((rec for rec in result['LDS'] if name in rec), None)
        if lds_record is not None:
            LDSs[name].append(lds_record[name])

# Average every metric over the questions, per method.
mean_topk_probs = {name: np.mean(scores) for name, scores in topk_probs.items()}
mean_topk_divs = {name: np.mean(scores) for name, scores in topk_divs.items()}
mean_LDSs = {name: np.mean(scores) for name, scores in LDSs.items()}

for label, means in (
    ("Mean topk_probability:", mean_topk_probs),
    ("Mean topk_divergence:", mean_topk_divs),
    ("Mean LDS:", mean_LDSs),
):
    print(label, means)

Mean topk_probability: {'ContextCite364': 12.571211284399032, 'FM_Weights364': 12.364087232351302, 'FM_WeightsD364': 12.71926569879055, 'Spex364': 13.031725627183913, 'FBII364': 12.251579568088054, 'FSII364': 12.251579568088054, 'LOO': 12.105467292070388, 'ARC-JSD': 11.805027627348899}
Mean topk_divergence: {'ContextCite364': 1.3486342845000832, 'FM_Weights364': 1.361668637958353, 'FM_WeightsD364': 1.4080414159643897, 'Spex364': 1.4066534375193316, 'FBII364': 1.295260504515863, 'FSII364': 1.295260504515863, 'LOO': 1.244968577786883, 'ARC-JSD': 1.2697375269033706}
Mean LDS: {'ContextCite364': 0.8208676307007786, 'FM_Weights364': 0.716360400444939, 'FM_WeightsD364': 0.7853615127919911, 'Spex364': 0.7577306932599748, 'FBII364': 0.7252641083609697, 'FSII364': 0.7252641083609697, 'LOO': 0.640400444938821, 'ARC-JSD': 0.7898998887652948}


In [None]:
# Scratch call of harness.precision with the list [0, 1] as first argument.
# NOTE(review): `all` is the Python builtin unless it was rebound in an earlier
# cell (not visible here) — the second argument was probably meant to be a
# score vector; confirm before relying on this cell.
harness.precision([0,1], all)

In [None]:
import matplotlib.pyplot as plt
def plot_metric(metric_dict, title, ylabel):
    """Draw a bar chart with one bar per method and highlight the best one.

    Args:
        metric_dict: Mapping of method name -> scalar metric value. Insertion
            order determines the bar order.
        title: Figure title.
        ylabel: Label of the y axis.

    Raises:
        ValueError: If ``metric_dict`` is empty.

    The bar with the largest value is drawn in gold (a higher value is assumed
    to be better); all others are gray. NaN values are never selected as the
    best bar; they are annotated as ``nan`` at the baseline.
    """
    names = list(metric_dict.keys())
    if not names:
        raise ValueError("plot_metric: metric_dict is empty, nothing to plot")
    values = np.array([metric_dict[m] for m in names], dtype=float)

    colors = ['gray'] * len(names)
    # np.argmax returns the position of the first NaN when one is present, so
    # NaNs are masked out before looking for the best method.
    valid = ~np.isnan(values)
    if valid.any():
        best_index = int(np.argmax(np.where(valid, values, -np.inf)))
        colors[best_index] = 'gold'  # Highlight best method

    plt.figure(figsize=(10, 5))
    bars = plt.bar(names, values, color=colors)
    plt.xticks(rotation=45, ha='right')
    plt.ylabel(ylabel)
    plt.title(title)

    # Annotate values on top of bars (NaN bars have no height: label at y=0).
    for i, bar in enumerate(bars):
        height = values[i] if valid[i] else 0.0
        plt.text(bar.get_x() + bar.get_width()/2., height,
                 f'{values[i]:.4f}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()


In [None]:
# One bar chart per aggregated metric.
for metric_means, chart_title, axis_label in (
    (mean_topk_probs, "Mean Top-k Probability", "topk_probability"),
    (mean_topk_divs, "Mean Top-k Divergence", "topk_divergence"),
    (mean_LDSs, "Mean LDS", "LDS"),
):
    plot_metric(metric_means, chart_title, axis_label)

In [None]:
# Inspect the FM_WeightsD entry stored for the first question. The key is
# derived from actual_samples (as the `methods` list does) instead of
# hard-coding the sample count, so the cell keeps working when the sample
# budget changes.
all_results[0][f'FM_WeightsD{actual_samples}']

In [None]:
# (method, ascending argsort of its stored values) for the second question.
[
    (method, np.array(all_results[1][method]).argsort())
    for method in methods
]

In [None]:
# Paragraph list kept for the row labelled 1.
df["paragraphs"][1]

In [None]:
# Show the complete result record of the second question (index 1).
all_results[1]

In [None]:
# Second sentence (position 1) of the row labelled 10.
df["Sentences"][10][1]

In [None]:
# Ascending argsort of every method's stored values for the first question.
# NOTE(review): unlike the similar listing for question 1, the values are not
# wrapped in np.array here, so they must already provide .argsort() — confirm.
[
    all_results[0][method].argsort()
    for method in methods
]

In [None]:
# Show the 'LOO' entry of the fifth question (index 4).
all_results[4]['LOO']

In [None]:
# Total of the values in r2_cc.
# NOTE(review): r2_cc is not defined anywhere in the visible part of this
# notebook — this cell relies on leftover kernel state; confirm or remove.
sum(r2_cc)

In [None]:
# Print the r2_cc / r2_fm values side by side, one pair per line.
# Index-based on purpose: r2_fm is read at every index of r2_cc.
for idx in range(len(r2_cc)):
    print(r2_cc[idx], r2_fm[idx])