In [1]:
import sys
import os
import random
import gc
import time
import torch
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
from scipy.sparse import csr_matrix
import itertools
from scipy.stats import spearmanr, pearsonr, kendalltau, rankdata
from sklearn.metrics import ndcg_score
from sklearn.metrics import root_mean_squared_error
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)
from SHapRAG import *
#os.environ["CUDA_VISIBLE_DEVICES"] = "0" 

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/ulb/code_wit/ekuzmenk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#df=pd.read_json("../data/musique/musique_ans_v1.0_train.jsonl", lines=True)
df= pd.read_csv("../scripts/nq_2_positives.csv",index_col=False)

In [3]:
from os import getenv
from dotenv import load_dotenv

load_dotenv()
HF_TOKEN = os.getenv('HF_TOKEN')

In [4]:
SEED = 42
# Initialize Accelerator
accelerator_main = Accelerator(mixed_precision="fp16")

# Load Model
if accelerator_main.is_main_process:
    print("Main Script: Loading model...")
# model_path = "mistralai/Mistral-7B-Instruct-v0.3"
model_path = "meta-llama/Llama-3.1-8B-Instruct"
# model_path = "Qwen/Qwen2.5-3B-Instruct"


model_cpu = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    token=HF_TOKEN
)
tokenizer = AutoTokenizer.from_pretrained(model_path, token=HF_TOKEN)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model_cpu.config.pad_token_id = tokenizer.pad_token_id
    if hasattr(model_cpu, 'generation_config') and model_cpu.generation_config is not None:
        model_cpu.generation_config.pad_token_id = tokenizer.pad_token_id

if accelerator_main.is_main_process:
    print("Main Script: Preparing model with Accelerator...")
prepared_model = accelerator_main.prepare(model_cpu)
unwrapped_prepared_model = accelerator_main.unwrap_model(prepared_model)
unwrapped_prepared_model.eval()
if accelerator_main.is_main_process:
    print("Main Script: Model prepared and set to eval.")

# Define utility cache

accelerator_main.wait_for_everyone()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Main Script: Loading model...


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████| 4/4 [00:08<00:00,  2.15s/it]


Main Script: Preparing model with Accelerator...
Main Script: Model prepared and set to eval.


In [5]:
all_sents = []
for i in range(len(df.question)):
    n = 0
    docs=ast.literal_eval(df.context[i])
    doc_sents = []
    for j in range(len(docs)):
        sents = nltk.sent_tokenize(docs[j])
        new_sents = []
        for s in range(len(sents)):
            #new_sents.append(str(n + s) + '-' + str(j) + '-' + sents[s])
            new_sents.append(sents[s])
        n += len(sents)
        doc_sents.append(new_sents)
    flat_doc_sents = [
    x
    for xs in doc_sents
    for x in xs
]
    all_sents.append(flat_doc_sents)
df['Sentences'] = all_sents

In [6]:
df.head(10)

Unnamed: 0,question,context,answer,Sentences
0,total number of death row inmates in the us,['on death row in the United States on January...,2718,[on death row in the United States on January ...
1,big little lies season 2 how many episodes,['series garnered several accolades. It receiv...,seven,"[series garnered several accolades., It receiv..."
2,who sang waiting for a girl like you,"['Waiting for a Girl Like You ""Waiting for a G...",Foreigner,"[Waiting for a Girl Like You ""Waiting for a Gi..."
3,where do you cross the arctic circle in norway,['Arctic Norway Arctic Norway () comprises the...,Saltfjellet,[Arctic Norway Arctic Norway () comprises the ...
4,who is the main character in green eggs and ham,['Green Eggs and Ham Green Eggs and Ham is a c...,Sam-I-am,[Green Eggs and Ham Green Eggs and Ham is a ch...
5,do veins carry blood to the heart or away,['Vein Veins are blood vessels that carry bloo...,to,[Vein Veins are blood vessels that carry blood...
6,who played charlie bucket in the original char...,['Peter Ostrum Peter Gardner Ostrum (; born No...,Peter Gardner[4] Ostrum,[Peter Ostrum Peter Gardner Ostrum (; born Nov...
7,what is 1 radian in terms of pi,['and the integral computes the area between t...,1/2π,[and the integral computes the area between th...
8,when does season 5 of bates motel come out,['Bates Motel (season 5) The fifth and final s...,"February 20, 2017",[Bates Motel (season 5) The fifth and final se...
9,how many episodes are in series 7 game of thrones,['Game of Thrones (season 7) The seventh and p...,seven,[Game of Thrones (season 7) The seventh and pe...


In [6]:
# NEW VERSION
num_questions_to_run=50
k_values = [1,2,3,4,5]
all_results=[]
LDSs=[]
r2_fm=[]
r2_cc=[]

for i in tqdm(range(num_questions_to_run), disable=not accelerator_main.is_main_process):
    query = df.question[i]
    if accelerator_main.is_main_process:
        print(f"\n--- Question {i+1}/{num_questions_to_run}: {query[:60]}... ---")

    docs=df.Sentences[i]
    #docs=ast.literal_eval(df.context[i])
    utility_cache_base_dir = f"../Experiment_data/NQ/{model_path.split('/')[1]}/sentences"
    #utility_cache_base_dir = f"../Experiment_data/musique"
    utility_cache_filename = f"utilities_q_idx{i}.pkl" # More robust naming
    current_utility_path = os.path.join(utility_cache_base_dir, utility_cache_filename)
    
    if accelerator_main.is_main_process: # Only main process creates directories
        os.makedirs(os.path.dirname(current_utility_path), exist_ok=True)
    
    # Initialize Harness
    print('Initializing harness...')
    harness = ContextAttribution(
        items=docs,
        query=query,
        prepared_model=prepared_model,
        prepared_tokenizer=tokenizer,
        accelerator=accelerator_main,
        utility_cache_path=current_utility_path
    )
    print(f'Response: {harness.target_response}')
    print(f'GT: {df.answer[i]}')
    # Compute metrics
    results_for_query = {}
    if accelerator_main.is_main_process:
        m_samples_map = {"L": 364} 
        # m_samples_map = {"L": 128, "XL":256, "XXL":512} 
        T_iterations_map = {"L":40, "XL":50, "XXL":60} 

        for size_key, num_s in m_samples_map.items():
            if 2**len(docs) < num_s and size_key != "L":
                actual_samples = max(1, 2**len(docs)-1 if 2**len(docs)>0 else 1)
            else:
                actual_samples = num_s

            if actual_samples > 0:
                results_for_query[f"ContextCite{actual_samples}"], model_cc = harness.compute_contextcite(num_samples=actual_samples, seed=SEED)
                attributions, ints=harness.compute_spex(sample_budget=actual_samples,max_order=2)
                results_for_query[f"FBII{actual_samples}"]=attributions['fbii']
                results_for_query[f"Spex{actual_samples}"]=attributions['fourier']
                results_for_query[f"FSII{actual_samples}"]=attributions['fsii']
                results_for_query[f"FM_WeightsD{actual_samples}"], F, modelfm = harness.compute_wss(num_samples=actual_samples, seed=SEED, sampling="kernelshap",sur_type="fm", utility_mode="divergence_utility")
                results_for_query[f"FM_Weights{actual_samples}"], F, modelfm = harness.compute_wss(num_samples=actual_samples, seed=SEED, sampling="kernelshap",sur_type="fm")
                # results_for_query[f"BetaShap{actual_samples}"] = harness.compute_beta_shap(num_iterations_max=T_iterations_map[size_key], beta_a=16, beta_b=1, max_unique_lookups=actual_samples, seed=SEED)
                # results_for_query[f"TMC{actual_samples}"] = harness.compute_tmc_shap(num_iterations_max=T_iterations_map[size_key], performance_tolerance=0.001, max_unique_lookups=actual_samples, seed=SEED)

        results_for_query["LOO"] = harness.compute_loo()
        results_for_query["ARC-JSD"] = harness.compute_arc_jsd()

        prob_topk = harness.evaluate_topk_performance(
                                                results_for_query, 
                                                k_values, 
                                                utility_type="probability"
                                            )

        div_topk = harness.evaluate_topk_performance(
                                            results_for_query, 
                                            k_values, 
                                            utility_type="divergence"
                                        )
        # for doc level for now, needs annotation for sentences
        cc_precision = harness.precision([0, 1], results_for_query[f"ContextCite{actual_samples}"])
        fm_precision = harness.precision([0, 1], results_for_query[f"FM_Weights{actual_samples}"])
        fmw_precision = harness.precision([0, 1], results_for_query[f"FM_WeightsD{actual_samples}"])
        fbi_precision = harness.precision([0, 1], results_for_query[f"FBII{actual_samples}"])


        LDS = {}
        for i in results_for_query:
            if "FM" in i:
                calculate_LDS = {i:harness.lds(results_for_query[i], 30, utl=True, model=modelfm)}
                LDS.update(calculate_LDS)
            else:
                calculate_LDS = {i:harness.lds(results_for_query[i], 30)}
                LDS.update(calculate_LDS)
        LDS = [{i:harness.lds(results_for_query[i], 30)} for i in results_for_query]

        results_for_query["topk_probability"] = prob_topk
        results_for_query["topk_divergence"] = div_topk
        results_for_query["LDS"] = LDS
        harness.save_utility_cache(current_utility_path)
        
        all_results.append(results_for_query)

  0%|                                                                                                                   | 0/50 [00:00<?, ?it/s]


--- Question 1/50: total number of death row inmates in the us... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx0.pkl...
Successfully loaded 779 cached utility entries.
Response: As of November 1999, there are 956 death convicts at the National Bilibid Prisons and at the Correctional Institute for Women.
GT: 2,718



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 375209.30it/s][A




100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 406.53it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:50<00:00,  1.12s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 338/338 [00:00<00:00, 315951.58it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 338/338 [00:00<00:00, 354773.46it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 57/57 [00:00<00:00, 164200.09it/s][A

LOO Calls (divergence_utility): 100%|██████████████████████████████████████████████████████████████████████| 57/57 [00:00<00:00, 206099.42it/s][A
  2%|██                    

Main Process: Saving 779 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx0.pkl...
Save complete.

--- Question 2/50: big little lies season 2 how many episodes... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx1.pkl...
Successfully loaded 791 cached utility entries.
Response: 7
GT: seven



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 328187.16it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 392.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:51<00:00,  1.14s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 341/341 [00:00<00:00, 255631.40it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 341/341 [00:00<00:00, 336848.25it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 173677.18it/s][A

LOO Calls (divergence_utili

Main Process: Saving 791 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx1.pkl...
Save complete.

--- Question 3/50: who sang waiting for a girl like you... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx2.pkl...
Successfully loaded 776 cached utility entries.
Response: Foreigner.
GT: Foreigner



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 351456.41it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 415.12it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:50<00:00,  1.13s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 256301.83it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 339377.29it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 165176.53it/s][A

LOO Calls (divergence_utili

Main Process: Saving 776 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx2.pkl...
Save complete.

--- Question 4/50: where do you cross the arctic circle in norway... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx3.pkl...
Successfully loaded 761 cached utility entries.
Response: Saltfjellet.
GT: Saltfjellet



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 374106.02it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 459.99it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:42<00:00,  1.10s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 331/331 [00:00<00:00, 321965.36it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 331/331 [00:00<00:00, 390634.39it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 136800.52it/s][A

LOO Calls (divergence_utili

Main Process: Saving 761 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx3.pkl...
Save complete.

--- Question 5/50: who is the main character in green eggs and ham... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx4.pkl...
Successfully loaded 772 cached utility entries.
Response: The main character in Green Eggs and Ham is a strange creature.
GT: Sam-I-am



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 351861.41it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 420.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:51<00:00,  1.13s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 355503.94it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 389621.97it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 187550.18it/s][A

LOO Calls (divergence_utili

Main Process: Saving 772 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx4.pkl...
Save complete.

--- Question 6/50: do veins carry blood to the heart or away... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx5.pkl...
Successfully loaded 774 cached utility entries.
Response: Veins carry blood toward the heart.
GT: to



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 341855.50it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 418.31it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:48<00:00,  1.07s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 336/336 [00:00<00:00, 271381.89it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 336/336 [00:00<00:00, 379146.12it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 182361.04it/s][A

LOO Calls (divergence_utili

Main Process: Saving 774 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx5.pkl...
Save complete.

--- Question 7/50: who played charlie bucket in the original charlie and the ch... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx6.pkl...
Successfully loaded 772 cached utility entries.
Response: Peter Ostrum.
GT: Peter Gardner[4] Ostrum



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 358639.10it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 434.15it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:53<00:00,  1.18s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 336/336 [00:00<00:00, 262681.48it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 336/336 [00:00<00:00, 370280.12it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 176258.69it/s][A

LOO Calls (divergence_utili

Main Process: Saving 772 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx6.pkl...
Save complete.

--- Question 8/50: what is 1 radian in terms of pi... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx7.pkl...
Successfully loaded 764 cached utility entries.
Response: 1° = π/180 radians.
GT: 1/2π



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 365858.29it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 444.26it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:43<00:00,  1.12s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 334/334 [00:00<00:00, 287010.35it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 334/334 [00:00<00:00, 390330.88it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 49/49 [00:00<00:00, 180123.48it/s][A

LOO Calls (divergence_utili

Main Process: Saving 764 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx7.pkl...
Save complete.

--- Question 9/50: when does season 5 of bates motel come out... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx8.pkl...
Successfully loaded 768 cached utility entries.
Response: September 19, 2017.
GT: February 20, 2017



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 367620.19it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 432.43it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:52<00:00,  1.16s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 333/333 [00:00<00:00, 276246.68it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 333/333 [00:00<00:00, 367456.78it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 52/52 [00:00<00:00, 180102.24it/s][A

LOO Calls (divergence_utili

Main Process: Saving 768 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx8.pkl...
Save complete.

--- Question 10/50: how many episodes are in series 7 game of thrones... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx9.pkl...
Successfully loaded 776 cached utility entries.
Response: 7
GT: seven



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 336951.37it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 424.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:51<00:00,  1.13s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 336/336 [00:00<00:00, 365478.77it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 336/336 [00:00<00:00, 364062.55it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 189374.93it/s][A

LOO Calls (divergence_utili

Main Process: Saving 776 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx9.pkl...
Save complete.

--- Question 11/50: who is next in line to be the monarch of england... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx10.pkl...
Successfully loaded 779 cached utility entries.
Response: Prince William, Duke of Cambridge.
GT: Charles, Prince of Wales



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 349845.70it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 425.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:51<00:00,  1.14s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 336/336 [00:00<00:00, 341231.51it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 336/336 [00:00<00:00, 354626.61it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 182950.26it/s][A

LOO Calls (divergence_utili

Main Process: Saving 779 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx10.pkl...
Save complete.

--- Question 12/50: who is in charge of enforcing the pendleton act of 1883... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx11.pkl...
Successfully loaded 770 cached utility entries.
Response: The Civil Service Commission.
GT: United States Civil Service Commission



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 348329.15it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 430.58it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:47<00:00,  1.06s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 333/333 [00:00<00:00, 348826.98it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 333/333 [00:00<00:00, 369987.61it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 52/52 [00:00<00:00, 180102.24it/s][A

LOO Calls (divergence_utili

Main Process: Saving 770 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx11.pkl...
Save complete.

--- Question 13/50: what is the name of latest version of android... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx12.pkl...
Successfully loaded 775 cached utility entries.
Response: Android Pie.
GT: Oreo



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 334368.52it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 402.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:52<00:00,  1.16s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 339619.76it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 346974.05it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 177805.47it/s][A

LOO Calls (divergence_utili

Main Process: Saving 775 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx12.pkl...
Save complete.

--- Question 14/50: why was there so much interest in cuba both before and after... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx13.pkl...
Successfully loaded 763 cached utility entries.
Response: Historians have debated America's intentions in Cuba, with some believing it was due to selfless humanitarian interest in the Cuban people, while others think it was driven by a desire to prevent the spread of communism, particularly after the 1917 Russian Revolution and Cuba's subsequent alliance with the Soviet Union.
GT: sugar markets



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 350006.11it/s][A
  model = cd_fast.enet_coordinate_descent(




100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 432.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:46<00:00,  1.18s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 369169.92it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 367999.19it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 182672.51it/s][A

LOO Calls (divergence_utility): 100%|██████████████████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 223754.71it/s][A
 28%|███████████████████████

Main Process: Saving 763 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx13.pkl...
Save complete.

--- Question 15/50: when did veterans day start being called veterans day... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx14.pkl...
Successfully loaded 773 cached utility entries.
Response: May 26, 1954.
GT: June 1, 1954



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 329817.81it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 426.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:51<00:00,  1.13s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 344936.57it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 365775.92it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 180224.00it/s][A

LOO Calls (divergence_utili

Main Process: Saving 773 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx14.pkl...
Save complete.

--- Question 16/50: when did big air snowboarding become an olympic sport... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx15.pkl...
Successfully loaded 763 cached utility entries.
Response: 2018.
GT: 2018



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 356378.77it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 437.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:42<00:00,  1.10s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 334/334 [00:00<00:00, 362082.59it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 334/334 [00:00<00:00, 389571.06it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 49/49 [00:00<00:00, 173581.84it/s][A

LOO Calls (divergence_utili

Main Process: Saving 763 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx15.pkl...
Save complete.

--- Question 17/50: who played in the most world series games... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx16.pkl...
Successfully loaded 756 cached utility entries.
Response: The New York Yankees.
GT: the New York Yankees



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 385829.33it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 490.79it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:45<00:00,  1.16s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 327/327 [00:00<00:00, 312993.48it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 327/327 [00:00<00:00, 410885.98it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 198192.39it/s][A

LOO Calls (divergence_utili

Main Process: Saving 756 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx16.pkl...
Save complete.

--- Question 18/50: who sings i can't stop this feeling anymore... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx17.pkl...
Successfully loaded 789 cached utility entries.
Response: Justin Timberlake.
GT: American rock band REO Speedwagon



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 297375.66it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 389.33it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:50<00:00,  1.13s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 341/341 [00:00<00:00, 238217.47it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 341/341 [00:00<00:00, 327439.94it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 62/62 [00:00<00:00, 172903.49it/s][A

LOO Calls (divergence_utili

Main Process: Saving 789 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx17.pkl...
Save complete.

--- Question 19/50: who is the month of may named after... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx18.pkl...
Successfully loaded 771 cached utility entries.
Response: The month of May is named after the goddess Maia, a Greek and Roman goddess of fertility.
GT: the Greek Goddess Maia



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 335618.08it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 428.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:50<00:00,  1.13s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 333/333 [00:00<00:00, 274995.71it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 333/333 [00:00<00:00, 385936.23it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<00:00, 175590.93it/s][A

LOO Calls (divergence_utili

Main Process: Saving 771 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx18.pkl...
Save complete.

--- Question 20/50: who has the most petroleum in the world... ---
Initializing harness...
Main Process: Attempting to load utility cache from ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx19.pkl...
Successfully loaded 762 cached utility entries.
Response: Venezuela.
GT: Venezuela



Computing utilities for ContextCite: 100%|███████████████████████████████████████████████████████████████| 364/364 [00:00<00:00, 334148.97it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 464.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:44<00:00,  1.13s/it]

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

0it [00:00, ?it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 286053.60it/s][A

Computing utilities for WSS (kernelshap): 100%|██████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 389512.99it/s][A

LOO Calls (logit-prob): 100%|██████████████████████████████████████████████████████████████████████████████| 48/48 [00:00<00:00, 172368.66it/s][A

LOO Calls (divergence_utili

Main Process: Saving 762 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx19.pkl...
Save complete.

--- Question 21/50: who is the sister of for king and country... ---
Initializing harness...
Response: Rebecca St. James.
GT: Rebecca St. James



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<01:10,  5.12it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:54,  6.60it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:50,  7.19it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:49,  7.29it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:50,  7.08it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/332 [00:00<01:37,  3.39it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/332 [00:00<00:31, 10.49it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/332 [00:00<00:37,  8.78it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/332 [00:00<00:29, 10.83it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/332 [00:01<00:37,  8.56it/s][A
Computing utilities for WSS (kernelshap):   4%|██▍                                                            | 13/332 [00:01<00:37,  8.57it/s][A
Computing utilities for WSS (kernelshap):   5%|██▊                                                            | 15/33

Main Process: Saving 769 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx20.pkl...
Save complete.

--- Question 22/50: who developed the first periodic table with 8 columns... ---
Initializing harness...
Response: Gilbert N. Lewis and Irving Langmuir.
GT: Dmitri Mendeleev



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:43,  8.33it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:46,  7.75it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:47,  7.56it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:47,  7.64it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:51,  6.92it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/332 [00:00<01:38,  3.36it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/332 [00:00<00:31, 10.45it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/332 [00:00<00:37,  8.72it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/332 [00:00<00:29, 10.98it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/332 [00:01<00:37,  8.69it/s][A
Computing utilities for WSS (kernelshap):   4%|██▍                                                            | 13/332 [00:01<00:36,  8.63it/s][A
Computing utilities for WSS (kernelshap):   5%|██▊                                                            | 15/33

Main Process: Saving 773 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx21.pkl...
Save complete.

--- Question 23/50: who plays skyler on lab rats elite force... ---
Initializing harness...
Response: I couldn't find any information about a character named Skyler on Lab Rats: Elite Force.
GT: Paris Berelc



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:48,  7.53it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:50,  7.11it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:46,  7.79it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:43,  8.37it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:47,  7.63it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/332 [00:00<01:35,  3.46it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/332 [00:00<01:27,  3.78it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/332 [00:00<01:06,  4.93it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/332 [00:01<00:55,  5.85it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/332 [00:01<00:52,  6.14it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/332 [00:01<01:01,  5.25it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/33

Main Process: Saving 767 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx22.pkl...
Save complete.

--- Question 24/50: when is season seven of game of thrones coming out... ---
Initializing harness...
Response: July 16, 2017.
GT: July 16, 2017



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:37,  9.63it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:47,  7.67it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:43,  8.25it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:47,  7.53it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:48,  7.45it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/340 [00:00<01:55,  2.93it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/340 [00:00<01:16,  4.41it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/340 [00:00<00:48,  6.93it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/340 [00:00<00:59,  5.66it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/340 [00:01<00:54,  6.14it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/340 [00:01<01:02,  5.31it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/34

Main Process: Saving 783 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx23.pkl...
Save complete.

--- Question 25/50: who went home on rupaul's drag race season 10 episode 4... ---
Initializing harness...
Response: Dusty Ray Bottoms.
GT: Dusty Ray Bottoms



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:42,  8.44it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:43,  8.27it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:50,  7.10it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:51,  6.97it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/364 [00:00<00:52,  6.88it/s][A
Computing utilities for ContextCite:   2%|█▎                                                                   | 7/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/336 [00:00<02:07,  2.64it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/336 [00:00<01:41,  3.29it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/336 [00:00<01:28,  3.77it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/336 [00:01<01:30,  3.68it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/336 [00:01<01:28,  3.73it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/336 [00:01<01:20,  4.09it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/33

Main Process: Saving 786 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx24.pkl...
Save complete.

--- Question 26/50: when did nigeria become a federation of 19 states... ---
Initializing harness...
Response: 1976.
GT: 1976



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:40,  9.00it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:48,  7.44it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:48,  7.38it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:50,  7.15it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:51,  7.03it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/332 [00:00<01:21,  4.04it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/332 [00:00<00:55,  5.94it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/332 [00:00<00:52,  6.27it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/332 [00:00<00:34,  9.35it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/332 [00:01<00:41,  7.80it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/332 [00:01<00:40,  7.90it/s][A
Computing utilities for WSS (kernelshap):   3%|██                                                             | 11/33

Main Process: Saving 768 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx25.pkl...
Save complete.

--- Question 27/50: when did roller derby first appear in the press... ---
Initializing harness...
Response: 1922.
GT: 1922



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:57,  6.32it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:55,  6.58it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:54,  6.61it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:53,  6.69it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:52,  6.78it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/333 [00:00<02:48,  1.97it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/333 [00:00<01:56,  2.83it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/333 [00:00<01:36,  3.44it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/333 [00:01<01:17,  4.27it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/333 [00:01<01:13,  4.44it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/333 [00:01<01:10,  4.65it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/33

Main Process: Saving 774 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx26.pkl...
Save complete.

--- Question 28/50: what nba team has the highest winning percentage... ---
Initializing harness...
Response: The Boston Celtics have the highest winning percentage (61.9%).
GT: San Antonio Spurs



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:43,  8.42it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:42,  8.50it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:43,  8.26it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:49,  7.27it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/364 [00:00<00:45,  7.89it/s][A
Computing utilities for ContextCite:   2%|█▎                                                                   | 7/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/332 [00:00<01:30,  3.67it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/332 [00:00<01:16,  4.29it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/332 [00:00<01:01,  5.36it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/332 [00:01<00:48,  6.68it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/332 [00:01<00:47,  6.85it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/332 [00:01<00:56,  5.75it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/33

Main Process: Saving 760 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx27.pkl...
Save complete.

--- Question 29/50: if there is a random change in the genetics of a small popul... ---
Initializing harness...
Response: Genetic drift.
GT: genetic drift



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:55,  6.56it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:54,  6.65it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:53,  6.74it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:53,  6.72it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:53,  6.68it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/333 [00:00<02:51,  1.94it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/333 [00:00<01:58,  2.79it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/333 [00:00<01:34,  3.49it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/333 [00:01<01:15,  4.34it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/333 [00:01<01:14,  4.43it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/333 [00:01<01:09,  4.72it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/33

Main Process: Saving 774 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx28.pkl...
Save complete.

--- Question 30/50: who played vincent in nanny mcphee and the big bang... ---
Initializing harness...
Response: The information is not provided in the given context.
GT: Oscar Steer



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:54,  6.64it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:45,  7.96it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:52,  6.86it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:53,  6.72it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:49,  7.31it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/331 [00:00<01:51,  2.97it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/331 [00:00<01:33,  3.53it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/331 [00:00<01:07,  4.82it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/331 [00:01<01:08,  4.76it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/331 [00:01<00:54,  5.98it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/331 [00:01<01:02,  5.19it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/33

Main Process: Saving 766 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx29.pkl...
Save complete.

--- Question 31/50: who played the eldest daughter in the sound of music... ---
Initializing harness...
Response: Charmian Carr and Ariane Rinehart both played the eldest daughter, Liesl, in different versions of "The Sound of Music".
GT: Charmian Carr



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:39,  9.22it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:51,  7.02it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:51,  7.06it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:56,  6.41it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:55,  6.48it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/340 [00:00<01:51,  3.04it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/340 [00:00<01:13,  4.60it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/340 [00:00<00:45,  7.36it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/340 [00:00<00:59,  5.65it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/340 [00:01<00:54,  6.07it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/340 [00:01<01:03,  5.21it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/34

Main Process: Saving 771 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx30.pkl...
Save complete.

--- Question 32/50: who plays andy's teacher in parks and rec... ---
Initializing harness...
Response: Fred Willard
GT: Danielle Bisutti



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:46,  7.86it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:50,  7.22it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:45,  7.97it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:43,  8.22it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:47,  7.59it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/332 [00:00<01:39,  3.33it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/332 [00:00<01:28,  3.72it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/332 [00:00<01:07,  4.88it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/332 [00:01<00:53,  6.13it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/332 [00:01<00:50,  6.38it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/332 [00:01<01:00,  5.34it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/33

Main Process: Saving 756 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx31.pkl...
Save complete.

--- Question 33/50: in which channel fifa world cup will be broadcasted... ---
Initializing harness...
Response: Optus Sport, Sony SIX, Sony TEN 1, Sony TEN 2, Sony TEN 3, Sony ESPN, and SonyLIV.
GT: NBCTelemundo



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:38,  9.32it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:44,  8.05it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:48,  7.42it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:47,  7.50it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:48,  7.33it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/328 [00:00<02:49,  1.93it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/328 [00:00<01:55,  2.82it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/328 [00:01<01:16,  4.21it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/328 [00:01<01:17,  4.19it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/328 [00:01<01:18,  4.10it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/328 [00:01<01:19,  4.06it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/32

Main Process: Saving 760 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx32.pkl...
Save complete.

--- Question 34/50: what is a large body of water that flows through land... ---
Initializing harness...
Response: River.
GT: River



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:39,  9.22it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:45,  8.03it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:41,  8.68it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:46,  7.74it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:49,  7.28it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/336 [00:00<02:04,  2.70it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/336 [00:00<01:39,  3.35it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/336 [00:00<01:27,  3.83it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/336 [00:01<01:27,  3.78it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/336 [00:01<01:26,  3.84it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/336 [00:01<01:21,  4.06it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/33

Main Process: Saving 790 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx33.pkl...
Save complete.

--- Question 35/50: who played all the carly's on general hospital... ---
Initializing harness...
Response: Laura Wright and Tamara Braun played Carly Corinthos on General Hospital.
GT: Jennifer BransfordTamara BraunSarah Joy BrownLaura Wright



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:38,  9.49it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:47,  7.67it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:43,  8.21it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:50,  7.12it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:47,  7.62it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/334 [00:00<02:54,  1.91it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/334 [00:00<01:15,  4.37it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/334 [00:01<01:22,  4.00it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/334 [00:01<01:16,  4.29it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/334 [00:01<01:20,  4.09it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/334 [00:01<01:21,  3.99it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/33

Main Process: Saving 752 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx34.pkl...
Save complete.

--- Question 36/50: an object that has mass and occupies space is known as... ---
Initializing harness...
Response: Matter.
GT: matter



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:44,  8.24it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:49,  7.28it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:51,  7.07it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:54,  6.55it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:57,  6.22it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/330 [00:00<01:31,  3.58it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/330 [00:00<01:09,  4.73it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/330 [00:00<00:56,  5.82it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/330 [00:01<01:05,  5.00it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/330 [00:01<00:55,  5.79it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/330 [00:01<00:56,  5.67it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/33

Main Process: Saving 757 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx35.pkl...
Save complete.

--- Question 37/50: who sang the original blinded by the light... ---
Initializing harness...
Response: Bruce Springsteen.
GT: Bruce Springsteen



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:42,  8.63it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:39,  9.26it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:44,  8.03it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:46,  7.70it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:46,  7.77it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/331 [00:00<01:44,  3.17it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/331 [00:00<01:24,  3.90it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/331 [00:00<01:01,  5.29it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/331 [00:01<01:02,  5.20it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/331 [00:01<00:50,  6.40it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/331 [00:01<00:57,  5.61it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/33

Main Process: Saving 762 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx36.pkl...
Save complete.

--- Question 38/50: who is the yellow rose of texas named after... ---
Initializing harness...
Response: Emily Morgan.
GT: Emily D. West



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:43,  8.44it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:41,  8.63it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:41,  8.75it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:45,  7.85it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:43,  8.19it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/334 [00:00<02:41,  2.06it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/334 [00:00<01:09,  4.78it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/334 [00:00<01:14,  4.41it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/334 [00:01<01:08,  4.78it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/334 [00:01<01:13,  4.49it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/334 [00:01<01:15,  4.35it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/33

Main Process: Saving 768 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx37.pkl...
Save complete.

--- Question 39/50: who sings the song i can see clearly now the rain is gone... ---
Initializing harness...
Response: Johnny Nash.
GT: Johnny Nash



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:44,  8.20it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:44,  8.06it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:46,  7.73it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:48,  7.36it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:48,  7.35it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/336 [00:00<02:44,  2.04it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/336 [00:00<01:19,  4.21it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/336 [00:01<01:20,  4.15it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/336 [00:01<01:09,  4.73it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/336 [00:01<01:15,  4.39it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/336 [00:01<01:01,  5.32it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/33

Main Process: Saving 777 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx38.pkl...
Save complete.

--- Question 40/50: who was the rfc editor until 1998 just provide the family na... ---
Initializing harness...
Response: Postel
GT: Postel



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:55,  6.49it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:51,  6.98it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:52,  6.82it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:53,  6.74it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:52,  6.88it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/333 [00:00<02:51,  1.94it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/333 [00:00<01:59,  2.78it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/333 [00:00<01:38,  3.35it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/333 [00:01<01:15,  4.34it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/333 [00:01<01:14,  4.42it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/333 [00:01<01:11,  4.57it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/33

Main Process: Saving 772 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx39.pkl...
Save complete.

--- Question 41/50: what is the number 1 sport in the usa... ---
Initializing harness...
Response: American football.
GT: American football



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:48,  7.53it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:49,  7.38it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:51,  7.04it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:54,  6.61it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:56,  6.30it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/330 [00:00<01:29,  3.69it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/330 [00:00<01:08,  4.81it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/330 [00:00<00:53,  6.10it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/330 [00:00<01:02,  5.23it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/330 [00:01<00:53,  6.09it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/330 [00:01<00:55,  5.78it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/33

Main Process: Saving 758 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx40.pkl...
Save complete.

--- Question 42/50: when was the flight of the bumblebee written... ---
Initializing harness...
Response: 1899-1900.
GT: in 1899–1900



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:46,  7.82it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:45,  7.97it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:45,  7.97it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:51,  6.95it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:49,  7.24it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/334 [00:00<03:00,  1.84it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/334 [00:00<01:17,  4.26it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/334 [00:01<01:22,  4.00it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/334 [00:01<01:19,  4.16it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/334 [00:01<01:22,  3.96it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/334 [00:01<01:24,  3.87it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/33

Main Process: Saving 761 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx41.pkl...
Save complete.

--- Question 43/50: who plays cullen bohannon in hell on wheels... ---
Initializing harness...
Response: Anson Mount.
GT: Anson Adams Mount IV



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:42,  8.48it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:48,  7.42it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:47,  7.66it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:49,  7.30it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:50,  7.06it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/332 [00:00<01:39,  3.32it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/332 [00:00<00:31, 10.28it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/332 [00:00<00:36,  9.03it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/332 [00:00<00:29, 11.17it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/332 [00:01<00:37,  8.64it/s][A
Computing utilities for WSS (kernelshap):   4%|██▎                                                            | 12/332 [00:01<00:30, 10.60it/s][A
Computing utilities for WSS (kernelshap):   4%|██▋                                                            | 14/33

Main Process: Saving 770 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx42.pkl...
Save complete.

--- Question 44/50: when does the new pokemon movie come out... ---
Initializing harness...
Response: Pokémon the Movie: Volcanion and the Mechanical Marvel was released in Japan on July 16, 2016.
GT: July 15, 2017



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:38,  9.41it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:54,  6.63it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:50,  7.14it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:52,  6.87it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:53,  6.77it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/340 [00:00<02:01,  2.79it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/340 [00:00<01:22,  4.08it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/340 [00:00<00:50,  6.60it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/340 [00:00<01:02,  5.32it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/340 [00:01<00:56,  5.87it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/340 [00:01<01:05,  5.10it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/34

Main Process: Saving 782 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx43.pkl...
Save complete.

--- Question 45/50: how many oar athletes are in the olympics... ---
Initializing harness...
Response: 28
GT: 168



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:50,  7.21it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:45,  8.04it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:47,  7.66it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:49,  7.21it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:48,  7.42it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/331 [00:00<01:37,  3.40it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/331 [00:00<01:20,  4.08it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/331 [00:00<00:59,  5.46it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/331 [00:01<01:01,  5.27it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/331 [00:01<00:49,  6.50it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/331 [00:01<00:56,  5.69it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/33

Main Process: Saving 761 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx44.pkl...
Save complete.

--- Question 46/50: where is the honda classic played this year... ---
Initializing harness...
Response: PGA National Golf Club's Champion Course in Palm Beach Gardens.
GT: Palm Beach Gardens, Florida



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:45,  8.04it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:50,  7.14it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:55,  6.51it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:57,  6.28it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:58,  6.13it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/330 [00:00<01:39,  3.30it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/330 [00:00<01:14,  4.40it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/330 [00:00<00:57,  5.66it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/330 [00:01<01:05,  4.97it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/330 [00:01<00:55,  5.78it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/330 [00:01<00:50,  6.43it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/33

Main Process: Saving 760 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx45.pkl...
Save complete.

--- Question 47/50: who won the nrl grand final in 2015... ---
Initializing harness...
Response: North Queensland Cowboys.
GT: North Queensland Cowboys



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:47,  7.61it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:42,  8.41it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:46,  7.74it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:43,  8.25it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/364 [00:00<00:46,  7.66it/s][A
Computing utilities for ContextCite:   2%|█▎                                                                   | 7/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/331 [00:00<01:33,  3.54it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/331 [00:00<01:19,  4.12it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/331 [00:00<00:59,  5.47it/s][A
Computing utilities for WSS (kernelshap):   2%|▉                                                               | 5/331 [00:00<01:01,  5.26it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/331 [00:01<00:49,  6.52it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/331 [00:01<00:56,  5.67it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/33

Main Process: Saving 760 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx46.pkl...
Save complete.

--- Question 48/50: where was charlie and the choclate factory filmed... ---
Initializing harness...
Response: Pinewood Studios in England, and various locations in Munich, Germany.
GT: Pinewood Studios in England



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:46,  7.80it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:50,  7.21it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:49,  7.27it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:48,  7.48it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:52,  6.84it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/340 [00:00<02:14,  2.52it/s][A
Computing utilities for WSS (kernelshap):   1%|▍                                                               | 2/340 [00:00<01:44,  3.23it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/340 [00:00<01:33,  3.59it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/340 [00:01<01:08,  4.89it/s][A
Computing utilities for WSS (kernelshap):   2%|█▎                                                              | 7/340 [00:01<01:04,  5.14it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/340 [00:01<01:06,  5.00it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/34

Main Process: Saving 775 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx47.pkl...
Save complete.

--- Question 49/50: who owns the rights to the song happy birthday... ---
Initializing harness...
Response: The song "Happy Birthday" is in the public domain.
GT: Warner/Chappell Music



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:41,  8.77it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:42,  8.47it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:47,  7.65it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:49,  7.29it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/364 [00:00<00:50,  7.14it/s][A
Computing utilities for ContextCite:   2%|█▎                                                                   | 7/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/332 [00:00<01:25,  3.86it/s][A
Computing utilities for WSS (kernelshap):   1%|▊                                                               | 4/332 [00:00<00:29, 11.02it/s][A
Computing utilities for WSS (kernelshap):   2%|█▏                                                              | 6/332 [00:00<00:33,  9.62it/s][A
Computing utilities for WSS (kernelshap):   2%|█▌                                                              | 8/332 [00:00<00:28, 11.57it/s][A
Computing utilities for WSS (kernelshap):   3%|█▉                                                             | 10/332 [00:01<00:35,  8.98it/s][A
Computing utilities for WSS (kernelshap):   4%|██▍                                                            | 13/332 [00:01<00:35,  8.90it/s][A
Computing utilities for WSS (kernelshap):   5%|██▊                                                            | 15/33

Main Process: Saving 773 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx48.pkl...
Save complete.

--- Question 50/50: who sang the song every breath you take... ---
Initializing harness...
Response: The Police.
GT: The Police



Computing utilities for ContextCite:   0%|                                                                             | 0/364 [00:00<?, ?it/s][A
Computing utilities for ContextCite:   0%|▏                                                                    | 1/364 [00:00<00:47,  7.72it/s][A
Computing utilities for ContextCite:   1%|▍                                                                    | 2/364 [00:00<00:55,  6.51it/s][A
Computing utilities for ContextCite:   1%|▌                                                                    | 3/364 [00:00<00:54,  6.65it/s][A
Computing utilities for ContextCite:   1%|▊                                                                    | 4/364 [00:00<00:53,  6.72it/s][A
Computing utilities for ContextCite:   1%|▉                                                                    | 5/364 [00:00<00:53,  6.73it/s][A
Computing utilities for ContextCite:   2%|█▏                                                                   | 6/36

  (Divergence Utility) Caching baseline token distributions for full context...



Computing utilities for WSS (kernelshap):   0%|▏                                                               | 1/344 [00:00<01:34,  3.62it/s][A
Computing utilities for WSS (kernelshap):   1%|▌                                                               | 3/344 [00:00<01:02,  5.45it/s][A
Computing utilities for WSS (kernelshap):   1%|▋                                                               | 4/344 [00:00<00:57,  5.90it/s][A
Computing utilities for WSS (kernelshap):   1%|▉                                                               | 5/344 [00:00<01:06,  5.11it/s][A
Computing utilities for WSS (kernelshap):   2%|█                                                               | 6/344 [00:01<01:00,  5.55it/s][A
Computing utilities for WSS (kernelshap):   2%|█▍                                                              | 8/344 [00:01<00:55,  6.03it/s][A
Computing utilities for WSS (kernelshap):   3%|█▋                                                              | 9/34

Main Process: Saving 794 utility entries to ../Experiment_data/NQ/Llama-3.1-8B-Instruct/sentences/utilities_q_idx49.pkl...
Save complete.





In [7]:
methods = [f'ContextCite{actual_samples}', f'FM_Weights{actual_samples}',f'FM_WeightsD{actual_samples}',f'Spex{actual_samples}',f'FBII{actual_samples}',f'FSII{actual_samples}', 'LOO', 'ARC-JSD']

# Initialize lists
topk_probs = {method: [] for method in methods}
topk_divs = {method: [] for method in methods}
LDSs = {method: [] for method in methods}

# Collect values
for ind, entry in enumerate(all_results):
    for method in methods:
        topk_probs[method].append(entry['topk_probability'][method][3])
        topk_divs[method].append(entry['topk_divergence'][method][3])
        for d in entry['LDS']:
            if method in d and not np.isnan(d[method]):
                LDSs[method].append(d[method])
                break
        

# Compute means
mean_topk_probs = {method: np.mean(topk_probs[method]) for method in methods}
mean_topk_divs = {method: np.mean(topk_divs[method]) for method in methods}
mean_LDSs = {method: np.mean(LDSs[method]) for method in methods}

print("Mean topk_probability:", mean_topk_probs)
print("Mean topk_divergence:", mean_topk_divs)
print("Mean LDS:", mean_LDSs)

Mean topk_probability: {'ContextCite364': 11.87266467353329, 'FM_Weights364': 11.232420057263225, 'FM_WeightsD364': 10.670906675923616, 'Spex364': 12.551235888209193, 'FBII364': 12.43670495113358, 'FSII364': 12.43670495113358, 'LOO': 10.209360985066741, 'ARC-JSD': 11.18596002439037}
Mean topk_divergence: {'ContextCite364': 1.4231881811443678, 'FM_Weights364': 1.3669730568168712, 'FM_WeightsD364': 1.477624880192605, 'Spex364': 1.5508743182194544, 'FBII364': 1.5715239935201728, 'FSII364': 1.5715239935201728, 'LOO': 1.2295310488616673, 'ARC-JSD': 1.4662313626771435}
Mean LDS: {'ContextCite364': 0.7137408912396994, 'FM_Weights364': 0.5916351501668521, 'FM_WeightsD364': 0.636182424916574, 'Spex364': 0.7221959372590794, 'FBII364': 0.7041221685444364, 'FSII364': 0.7041221685444364, 'LOO': 0.44809425438696054, 'ARC-JSD': 0.7362402669632925}


In [21]:
type(mean_LDSs['ContextCite364'])

numpy.float64

In [69]:
query_to_explore = 10
print('Question:', df.question[query_to_explore])
print('Answer:', df.answer[query_to_explore])
docs = ast.literal_eval(df.context[query_to_explore])
for doc in docs:
    print('--------------------')
    print(doc)

Question: who is next in line to be the monarch of england
Answer: Charles, Prince of Wales
--------------------
Catholics are eligible. Queen Elizabeth II is the sovereign, and her heir apparent is her eldest son, Charles, Prince of Wales. Next in line after him is Prince William, Duke of Cambridge, the Prince of Wales's elder son. Third in line is Prince George, the eldest child of the Duke of Cambridge, followed by his sister, Princess Charlotte and younger brother, Prince Louis. Sixth in line is Prince Harry, Duke of Sussex, the younger son of the Prince of Wales. Under the Perth Agreement, which came into effect in 2015, only the first six in line of succession require the
--------------------
The Act required that the regent should be the next person in the line of succession who was: The Counsellors of State were to consist of: Thus, at the time of the passing of the Act, Prince Henry, Duke of Gloucester would have become Regent in the event that King George VI died while The Pr

In [70]:
result_to_observe = all_results[query_to_explore]
fm_weights = result_to_observe['FM_Weights512']
cc_weights = result_to_observe['ContextCite512']
loo_weights = result_to_observe['LOO']
jsd_weights = result_to_observe['ARC-JSD']
print('FM ranking: ', np.argsort(fm_weights)[::-1])
print('CC ranking: ', np.argsort(cc_weights)[::-1])
print('LOO ranking: ', np.argsort(loo_weights)[::-1])
print('JSD ranking: ', np.argsort(jsd_weights)[::-1])

FM ranking:  [0 6 5 9 7 3 8 2 4 1]
CC ranking:  [0 6 7 5 2 9 3 8 4 1]
LOO ranking:  [0 9 8 6 7 5 2 4 3 1]
JSD ranking:  [0 8 1 2 7 6 5 9 4 3]


In [24]:
import json
import copy
all_results_json = copy.deepcopy(all_results)

    # Recursively convert NumPy arrays to lists
def convert_numpy_to_list(obj):
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, (np.integer, np.floating, np.bool_)):
        # Convert NumPy scalar types to Python native types
        return obj.item()
    elif isinstance(obj, dict):
        return {k: convert_numpy_to_list(v) for k, v in obj.items()}
    elif isinstance(obj, list):
            return [convert_numpy_to_list(elem) for elem in obj]
    else:
        return obj

converted_all_results = convert_numpy_to_list(all_results_json)

with open('NQ_sents_25_FM4_512.json', 'w') as f:
    json.dump(converted_all_results, f, indent=4)

In [12]:
import matplotlib.pyplot as plt
def plot_bar_chart(data_dict, title="Averaged R2 for NQ sent-level, FM20",
                   x_label="Methods", y_label="R2"):

    plot_data = []
    for key, value in data_dict.items():
        if not math.isnan(value):
            plot_data.append((key, value))
        else:
            print(f"Skipping '{key}' for plotting as its value is NaN.")

    if not plot_data:
        print("No valid numeric data to plot after filtering NaNs.")
        return

    # --- NEW: Sort the data ---
    # Sort by the value (index 1 of the tuple), in descending order
    plot_data.sort(key=lambda item: item[1], reverse=True)

    # Unpack sorted data into separate lists for plotting
    labels = [item[0] for item in plot_data]
    values = [item[1] for item in plot_data]
    
    # --- NEW: Define a list of colors ---
    # You can customize this list with any valid matplotlib color names or hex codes.
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
    # If you have more bars than colors, you can use matplotlib.cm for a colormap:
    # import matplotlib.cm as cm
    # colors = cm.viridis(np.linspace(0, 1, len(labels)))

    plt.figure(figsize=(12, 7))
    # --- NEW: Pass the list of colors to plt.bar ---
    bars = plt.bar(labels, values, color=colors[:len(labels)]) # Slice to match number of bars

    # Add titles and labels
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.title(title, fontsize=14)

    # Rotate x-axis labels if they are long to prevent overlap
    plt.xticks(rotation=45, ha='right', fontsize=10) # 'ha' is horizontal alignment

    # Add value labels on top of the bars
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, # Position text slightly above bar
                 round(yval, 4), # Format value to 4 decimal places
                 ha='center', va='bottom', fontsize=9)

    # Add a grid for easier reading of values
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Adjust layout to prevent labels from being cut off
    plt.tight_layout()

    # Display the plot
    #plt.show()
    plt.savefig(f'nq_sents_100_fm20_R2_2307.png')

plot_bar_chart(averaged_R2)

In [16]:
import matplotlib.pyplot as plt
import numpy as np
import math # For handling potential NaN values if they were in the input

# --- Plotting Function for Grouped Bar Chart ---
def plot_grouped_bar_chart(data_for_plotting, title="Divergence Climb",
                           x_label="Methods", y_label="Averaged Divergence Climb"):
    """
    Draws a grouped bar chart for the given nested dictionary data.

    Args:
        data_for_plotting (dict): A dictionary where keys are metric names,
                                  and values are dictionaries containing numeric
                                  values for categories like '1' and '2'.
                                  e.g., {'MetricA': {'1': 0.1, '2': 0.2}}
        title (str): The title of the plot.
        x_label (str): Label for the x-axis.
        y_label (str): Label for the y-axis.
    """
    if not data_for_plotting:
        print("No data to plot. The input dictionary is empty.")
        return

    # Extract metric names and ensure a consistent order (alphabetical for clarity)
    metric_names = sorted(data_for_plotting.keys())

    # Prepare data for plotting
    values_cat1 = []
    values_cat2 = []
    
    # Store labels for the actual metrics being plotted (in case some have NaNs)
    plot_metric_labels = []

    for metric in metric_names:
        cat_data = data_for_plotting[metric]
        val1 = cat_data.get('1', float('nan')) # Use .get() to handle missing keys
        val2 = cat_data.get('2', float('nan'))

        # Only include metrics where at least one category has a valid number
        if (isinstance(val1, (int, float)) and not math.isnan(val1)) or \
           (isinstance(val2, (int, float)) and not math.isnan(val2)):
            values_cat1.append(val1 if (isinstance(val1, (int, float)) and not math.isnan(val1)) else 0)
            values_cat2.append(val2 if (isinstance(val2, (int, float)) and not math.isnan(val2)) else 0)
            plot_metric_labels.append(metric)
        else:
            print(f"Skipping metric '{metric}' as both '1' and '2' values are NaN or non-numeric.")


    if not plot_metric_labels:
        print("No valid data points found to plot after processing categories.")
        return

    # Set up positions for the bars
    bar_width = 0.35
    index = np.arange(len(plot_metric_labels)) # The x locations for the groups

    plt.figure(figsize=(14, 8)) # Adjust figure size

    # Plotting the bars
    bar1 = plt.bar(index - bar_width/2, values_cat1, bar_width, label='1', color='skyblue')
    bar2 = plt.bar(index + bar_width/2, values_cat2, bar_width, label='2', color='lightcoral')

    # Add labels, title, and legend
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.title(title, fontsize=16)
    plt.xticks(index, plot_metric_labels, rotation=45, ha='right', fontsize=12) # Set metric labels at group center
    plt.yticks(fontsize=12)
    plt.legend(fontsize=12)

    # Add value labels on top of the bars
    def autolabel(bars):
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2, height + 0.005, # Position text slightly above bar
                     f'{height:.4f}', # Format value to 4 decimal places
                     ha='center', va='bottom', fontsize=9, rotation=0)

    autolabel(bar1)
    autolabel(bar2)

    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout() # Adjust layout to prevent labels from being cut off
    #plt.show()
    plt.savefig(f'nq_sents_100_fm20_div_drop_2307.png')

plot_grouped_bar_chart(div_drops)

In [9]:
[i for i in results_for_query]

['ContextCite100',
 'FM_Shap100',
 'FM_Weights100',
 'BetaShap100',
 'TMC100',
 'LOO',
 'ARC-JSD',
 'topk_probability',
 'topk_divergence']

In [10]:
df.question[9]

'how many episodes are in series 7 game of thrones'

In [11]:
harness._generate_sampled_ablations(4, sampling_method='uniform', seed=2)

In [12]:
df.context[9]

'[\'Game of Thrones (season 7) The seventh and penultimate season of the fantasy drama television series "Game of Thrones" premiered on HBO on July 16, 2017, and concluded on August 27, 2017. Unlike previous seasons that consisted of ten episodes each, the seventh season consisted of only seven. Like the previous season, it largely consisted of original content not found in George R. R. Martin\\\'s "A Song of Ice and Fire" series, while also incorporating material Martin revealed to showrunners about the upcoming novels in the series. The series was adapted for television by David Benioff and D. B. Weiss.\', \'Bender, who worked on the show\\\'s sixth season, said that the seventh season would consist of seven episodes. Benioff and Weiss stated that they were unable to produce 10 episodes in the show\\\'s usual 12 to 14 month time frame, as Weiss said "It\\\'s crossing out of a television schedule into more of a mid-range movie schedule." HBO confirmed on July 18, 2016, that the sevent

In [37]:
docs[6]

'genuine, though very obscure, saying, "only fools and horses work for a living", which had its origins in 19th-century American vaudeville. "Only Fools and Horses" had also been the title of an episode of "Citizen Smith", and Sullivan liked the expression and thought it was suited to the new sitcom. He also thought longer titles would attract attention. He was first overruled on the grounds that the audience would not understand the title, but he eventually got his way. Filming of the first series began in May 1981, and the first episode, "Big Brother", was transmitted on BBC One at\''

In [15]:
all_results[0]

{'ContextCite100': array([ 9.42799874e+00,  4.95333848e+00,  1.54510382e-01, -1.18733185e+00,
        -1.97362536e-01, -1.63601493e-03, -4.20977587e-01, -1.71711992e+00,
         5.13743546e-01, -3.61054749e-01]),
 'FM_Shap100': array([10.32068284,  5.69851447,  0.11906173, -0.44636073, -0.88331575,
        -1.00641049, -0.22007863, -0.79765982,  0.29558806, -0.81720759]),
 'FM_Weights100': array([10.32068284,  5.69851447,  0.11906173, -0.44636073, -0.88331575,
        -1.00641049, -0.22007863, -0.79765982,  0.29558806, -0.81720759]),
 'BetaShap100': array([12.18496471,  6.09588892,  0.22350156,  0.12756782, -0.06228456,
         0.05961801,  0.48444461, -0.42519495,  0.84019263, -0.43572434]),
 'TMC100': array([ 9.96529419,  5.98631053, -0.16527917, -0.53574083, -0.08729146,
        -0.09466684,  0.01608808, -0.76350622,  0.2977114 , -0.34037049]),
 'LOO': array([12.21575832,  8.61024857,  1.24591446,  0.25884056, -0.06205273,
         0.06636238,  0.56908131, -0.40280247,  0.83529949

In [16]:
F

array([[ 0.        , -2.71718673,  2.64412498,  0.42945254,  0.83703511,
         0.81296934,  2.79770344, -0.70547725, -1.78606628, -1.45755979],
       [-2.71718673,  0.        , -1.55238252,  0.47399397,  0.30587813,
        -1.0472584 , -3.32381804, -0.22556415,  1.52053671,  2.70219414],
       [ 2.64412498, -1.55238252,  0.        , -0.05849546,  0.24429583,
         1.2087218 , -1.96988449,  0.43088249,  1.2081562 ,  1.73971653],
       [ 0.42945254,  0.47399397, -0.05849546,  0.        ,  0.45773645,
         0.56765182,  0.01962721, -0.01922445,  0.25747797, -0.84770228],
       [ 0.83703511,  0.30587813,  0.24429583,  0.45773645,  0.        ,
         0.23349889,  0.07346147, -0.04195382,  0.03793556, -0.31164835],
       [ 0.81296934, -1.0472584 ,  1.2087218 ,  0.56765182,  0.23349889,
         0.        ,  0.17034947,  0.01708509,  0.06055724, -0.52206366],
       [ 2.79770344, -3.32381804, -1.96988449,  0.01962721,  0.07346147,
         0.17034947,  0.        , -0.15618031

In [17]:
all_subsets = list(itertools.product([0, 1], repeat=4))

In [18]:
sampled_tuples = harness._generate_sampled_ablations(10, sampling_method='uniform', seed=2)

In [17]:
d = {}
type(d)

dict

In [18]:
import matplotlib.pyplot as plt

for result in range(len(all_results)):
    method_scores = {}
    for method, scores in all_results[result].items():
        if scores is not None and type(scores) is not dict:
            print(method, scores)
            method_scores[method] = np.round(scores, 4)

    for method, scores in method_scores.items():
        plt.figure(figsize=(10, 4))
        plt.bar(range(len(scores)), scores, color='skyblue')
        plt.title(f"Approximate Scores: {method}")
        plt.xlabel("Index")
        plt.ylabel("Score")
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.xticks(range(len(scores)))
        plt.tight_layout()
        plt.savefig(f'nq_doc_plots/{result}_{method}.png')

ContextCite100 [ 9.42799874e+00  4.95333848e+00  1.54510382e-01 -1.18733185e+00
 -1.97362536e-01 -1.63601493e-03 -4.20977587e-01 -1.71711992e+00
  5.13743546e-01 -3.61054749e-01]
FM_Shap100 [10.32068284  5.69851447  0.11906173 -0.44636073 -0.88331575 -1.00641049
 -0.22007863 -0.79765982  0.29558806 -0.81720759]
FM_Weights100 [10.32068284  5.69851447  0.11906173 -0.44636073 -0.88331575 -1.00641049
 -0.22007863 -0.79765982  0.29558806 -0.81720759]
BetaShap100 [12.18172271  7.73213164  0.78913852  0.1642193  -0.06000717  0.05865822
  0.56100273 -0.43111163  0.83715638 -0.46782496]
TMC100 [ 9.96529419  5.98631053 -0.16527917 -0.53574083 -0.08729146 -0.09466684
  0.01608808 -0.76350622  0.2977114  -0.34037049]
LOO [12.21575832  8.61024857  1.24591446  0.25884056 -0.06205273  0.06636238
  0.56908131 -0.40280247  0.83529949 -0.49150467]
ARC-JSD [ 0.34588716 10.75895722  0.03822068  0.01981782  0.03414572  0.03501194
  0.05906658  0.02726513  0.04695243  0.04338138]
ContextCite100 [ 0.53418168

  plt.figure(figsize=(10, 4))
