In [1]:
import sys
import os
import random
import gc
import time
import torch
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
from scipy.sparse import csr_matrix
import itertools
from scipy.stats import spearmanr, pearsonr, kendalltau, rankdata
from sklearn.metrics import ndcg_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator

current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)
from SHapRAG import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the MuSiQue (answerable, v1.0) training split; the file is JSON Lines,
# i.e. one JSON record per line, hence lines=True.
df = pd.read_json(
    "../data/musique/musique_ans_v1.0_train.jsonl",
    lines=True,
)

In [3]:
def get_titles(lst, max_paragraphs=10):
    """Return paragraph texts for one question, supporting paragraphs first.

    Despite the name, this returns each record's ``'paragraph_text'`` value,
    not its title.

    Args:
        lst: Iterable of paragraph dicts. Each dict must have a
            ``'paragraph_text'`` key and may have an ``'is_supporting'`` key.
        max_paragraphs: Maximum number of texts to return (default 10, the
            value that used to be hard-coded).

    Returns:
        A list of at most ``max_paragraphs`` texts: every supporting paragraph
        in input order, followed by non-supporting paragraphs (in input order)
        whose text is not identical to a supporting paragraph's text.

    Raises:
        ValueError: If ``max_paragraphs`` is negative.
        KeyError: If a record has no ``'paragraph_text'`` key.
    """
    if max_paragraphs < 0:
        raise ValueError("max_paragraphs must be non-negative")

    # `== True` (rather than truthiness) is kept deliberately so that only an
    # explicit True flag marks a paragraph as supporting.
    supporting = [d['paragraph_text'] for d in lst if d.get('is_supporting') == True]

    # Set membership replaces the original list scan (quadratic in the number
    # of paragraphs) without changing which paragraphs are kept.
    supporting_texts = set(supporting)
    others = [
        d['paragraph_text']
        for d in lst
        if d.get('is_supporting') != True and d['paragraph_text'] not in supporting_texts
    ]

    # Supporting paragraphs take priority; distractors only fill what is left.
    return (supporting + others)[:max_paragraphs]

# Replace each question's raw paragraph records with the list of paragraph
# texts produced by get_titles.
# NOTE: this overwrites the column in place, so the cell is not idempotent --
# running it a second time calls get_titles on lists of strings, which have
# no .get() method.
df["paragraphs"] = df["paragraphs"].apply(get_titles)

In [4]:
SEED = 42
# Seed the global RNGs as well. SEED is also passed explicitly to the
# attribution methods later on, but anything that draws from the global
# Python / NumPy / torch generators would otherwise vary between kernel runs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize Accelerator
accelerator_main = Accelerator(mixed_precision="fp16")

# Load Model
if accelerator_main.is_main_process:
    print("Main Script: Loading model...")
# model_path = "mistralai/Mistral-7B-Instruct-v0.3"
model_path = "meta-llama/Llama-3.1-8B-Instruct"
# model_path = "Qwen/Qwen2.5-3B-Instruct"

# Weights are loaded in half precision before being handed to the Accelerator.
model_cpu = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# If the tokenizer defines no pad token, reuse EOS and mirror the id into the
# model config and (when present) the generation config so all three agree.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model_cpu.config.pad_token_id = tokenizer.pad_token_id
    if hasattr(model_cpu, 'generation_config') and model_cpu.generation_config is not None:
        model_cpu.generation_config.pad_token_id = tokenizer.pad_token_id

if accelerator_main.is_main_process:
    print("Main Script: Preparing model with Accelerator...")
prepared_model = accelerator_main.prepare(model_cpu)
# eval() is called on the unwrapped module; `prepared_model` is what later
# cells pass to the attribution harness.
unwrapped_prepared_model = accelerator_main.unwrap_model(prepared_model)
unwrapped_prepared_model.eval()
if accelerator_main.is_main_process:
    print("Main Script: Model prepared and set to eval.")

# Synchronize all processes before any experiment cell runs.
accelerator_main.wait_for_everyone()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Main Script: Loading model...


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.74it/s]


Main Script: Preparing model with Accelerator...
Main Script: Model prepared and set to eval.


In [11]:
# Run every attribution method on the first `num_questions_to_run` questions
# and collect per-question scores, top-k removal metrics and the
# surrogate-fit statistics returned by harness.lds_and_faithfulness.

# num_questions_to_run=len(df.question)
num_questions_to_run=2
k_values = [1, 2]
all_metrics_data = []
all_results=[]
spearman_fms=[]
spearman_ccs=[]
r2_ccs=[]
r2_fms=[]

# Loop-invariant settings, hoisted out of the per-question loop.
utility_cache_base_dir = "../Experiment_data/musique"
m_samples_map = {"L": 100}    # size key -> sample / lookup budget
T_iterations_map = {"L": 40}  # size key -> max iterations for Beta-Shapley / TMC

if accelerator_main.is_main_process: # Only main process creates directories
    os.makedirs(utility_cache_base_dir, exist_ok=True)

for i in tqdm(range(num_questions_to_run), disable=not accelerator_main.is_main_process):
    query = df.question[i]
    if accelerator_main.is_main_process:
        print(f"\n--- Question {i+1}/{num_questions_to_run}: {query[:60]}... ---")

    docs=df.paragraphs[i]

    # One utility cache file per question index.
    utility_cache_filename = f"utilities_q_idx{i}.pkl" # More robust naming
    current_utility_path = os.path.join(utility_cache_base_dir, utility_cache_filename)

    # Initialize Harness
    harness = ContextAttribution(
        items=docs,
        query=query,
        prepared_model_for_harness=prepared_model,
        tokenizer_for_harness=tokenizer,
        accelerator_for_harness=accelerator_main,
        utility_cache_path=current_utility_path
    )

    # Guarded like the other progress prints so that a multi-process launch
    # does not repeat the line once per process.
    if accelerator_main.is_main_process:
        print(f'Response: {harness.target_response}')

    # Compute metrics
    results_for_query = {}
    if accelerator_main.is_main_process:
        for size_key, num_s in m_samples_map.items():
            # For non-"L" budgets, cap the sample count at the number of
            # non-empty subsets of the documents.
            num_subsets = 2**len(docs)
            if num_subsets < num_s and size_key != "L":
                actual_samples = max(1, num_subsets - 1)
            else:
                actual_samples = num_s

            if actual_samples > 0:
                results_for_query[f"ContextCite{actual_samples}"], model_cc = harness.compute_contextcite(num_samples=actual_samples, seed=SEED)
                results_for_query[f"FM_Shap{actual_samples}"], results_for_query[f"FM_Weights{actual_samples}"], F, modelfm = harness.compute_wss(num_samples=actual_samples, seed=SEED, sampling="kernelshap",sur_type="fm")
                results_for_query[f"BetaShap{actual_samples}"] = harness.compute_beta_shap(num_iterations_max=T_iterations_map[size_key], beta_a=16, beta_b=1, max_unique_lookups=actual_samples, seed=SEED)
                results_for_query[f"TMC{actual_samples}"] = harness.compute_tmc_shap(num_iterations_max=T_iterations_map[size_key], performance_tolerance=0.001, max_unique_lookups=actual_samples, seed=SEED)

        results_for_query["LOO"] = harness.compute_loo()
        results_for_query["ARC-JSD"] = harness.compute_arc_jsd()

        # The top-k evaluations are computed before their own results are
        # stored, so they only see the attribution scores gathered above.
        prob_topk = harness.evaluate_topk_performance(
            results_for_query,
            k_values,
            utility_type="probability"
        )

        div_topk = harness.evaluate_topk_performance(
            results_for_query,
            k_values,
            utility_type="divergence"
        )

        results_for_query["topk_probability"] = prob_topk
        results_for_query["topk_divergence"] = div_topk
        harness.save_utility_cache(current_utility_path)

        # Record each question exactly once. The original appended the same
        # dict a second time after the faithfulness step, which duplicated
        # every entry in all_results.
        all_results.append(results_for_query)

        spearman_cc, spearman_fm, r2_cc, r2_fm =harness.lds_and_faithfulness(modelfm, model_cc, 30)
        spearman_ccs.append(spearman_cc)
        spearman_fms.append(spearman_fm)
        r2_ccs.append(r2_cc)
        r2_fms.append(r2_fm)

  0%|          | 0/2 [00:00<?, ?it/s]


--- Question 1/2: When was the institute that owned The Collegian founded?... ---
Loading existing utility cache from ../Experiment_data/musique/utilities_q_idx0.pkl...
Successfully loaded 346 cached utilities.
Response: Houston Baptist University was founded in 1960.


100%|██████████| 10/10 [00:01<00:00,  5.03it/s]
 50%|█████     | 1/2 [00:12<00:12, 12.31s/it]


--- Question 2/2: What year saw the creation of the region where the county of... ---
Loading existing utility cache from ../Experiment_data/musique/utilities_q_idx1.pkl...
Successfully loaded 346 cached utilities.
Response: 1994.


100%|██████████| 10/10 [00:02<00:00,  3.44it/s]
100%|██████████| 2/2 [00:29<00:00, 14.79s/it]


In [10]:
import matplotlib.pyplot as plt
# defaultdict is used below but is not imported by the notebook's import cell;
# import it here so this cell does not rely on a name leaking out of
# `from SHapRAG import *`.
from collections import defaultdict

# method -> k -> list of values; only the commented-out plotting code below
# reads this container.
all_drops = defaultdict(lambda: defaultdict(list))

# NOTE: prob_topk and div_topk are the variables left behind by the LAST
# iteration of the experiment loop, so this report covers only the final
# question that was processed.

# Probability-based results
print("\nProbability-based Top-k Performance:")
for method, drops in prob_topk.items():
    print(f"  {method}:")
    for k, drop in drops.items():
        print(f"    k={k}: Drop = {drop:.4f}")

# Divergence-based results
print("\nDivergence-based Top-k Performance:")
for method, jsds in div_topk.items():
    print(f"  {method}:")
    for k, jsd in jsds.items():
        print(f"    k={k}: JSD = {jsd:.4f}")

# methods = list(all_drops.keys())
# k_values = sorted(list(all_drops[methods[0]].keys()))

# for method in methods:
#     means = [np.nanmean(all_drops[method][k]) for k in k_values]
#     plt.plot(k_values, means, label=method, marker='o')

# plt.xlabel('k')
# plt.ylabel('Mean Utility Drop')
# plt.title('Top-k Removal Probability Drop')
# plt.legend()
# plt.show()


Probability-based Top-k Performance:
  ContextCite100:
    k=1: Drop = 11.4530
    k=2: Drop = 11.7507
  FM_Shap100:
    k=1: Drop = 11.4530
    k=2: Drop = 11.7507
  FM_Weights100:
    k=1: Drop = 11.4530
    k=2: Drop = 11.7507
  BetaShap100:
    k=1: Drop = 11.4530
    k=2: Drop = 12.3379
  TMC100:
    k=1: Drop = 11.4530
    k=2: Drop = 11.7507
  LOO:
    k=1: Drop = 11.4530
    k=2: Drop = 12.3379
  ARC-JSD:
    k=1: Drop = 11.4530
    k=2: Drop = 11.9030

Divergence-based Top-k Performance:
  ContextCite100:
    k=1: JSD = 1.1366
    k=2: JSD = 1.0900
  FM_Shap100:
    k=1: JSD = 1.1366
    k=2: JSD = 1.0900
  FM_Weights100:
    k=1: JSD = 1.1366
    k=2: JSD = 1.0900
  BetaShap100:
    k=1: JSD = 1.1366
    k=2: JSD = 1.0931
  TMC100:
    k=1: JSD = 1.1366
    k=2: JSD = 1.0900
  LOO:
    k=1: JSD = 1.1366
    k=2: JSD = 1.0931
  ARC-JSD:
    k=1: JSD = 1.1366
    k=2: JSD = 1.0704


In [None]:
# Spot-check: reference answer stored for the question at index 2.
df["answer"][2]

In [None]:
# Spot-check: response held by the most recently constructed harness, i.e. the
# one built for the last question processed by the experiment loop.
harness.target_response

In [None]:
# Spot-check: paragraph texts kept for the first question after the
# get_titles transformation.
df["paragraphs"][0]

In [None]:
# Display the per-question result dicts collected by the experiment loop.
all_results

In [None]:
# Inspect F, the third value returned by harness.compute_wss in the experiment
# loop (left over from the last question processed).
F

In [None]:
# Enumerate every 0/1 inclusion mask over four items (2**4 = 16 tuples), in
# the order produced by itertools.product.
all_subsets = [mask for mask in itertools.product([0, 1], repeat=4)]

In [None]:
# Scratch check of the harness's private ablation sampler with the 'uniform'
# method and a fixed seed.
# NOTE(review): the first positional argument (10) is presumably the number of
# samples to draw -- confirm against the SHapRAG implementation.
sampled_tuples = harness._generate_sampled_ablations(
    10,
    sampling_method='uniform',
    seed=2,
)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the attribution scores per method.
# NOTE: method_scores is keyed by method name only, so when all_results holds
# several questions the scores of the LAST one win for each method.
method_scores = {}

for result in all_results:
    for method, scores in result.items():
        # Each result dict also carries the "topk_probability" and
        # "topk_divergence" entries, which are dicts rather than score vectors.
        # np.round cannot handle a dict and raised TypeError here, so skip any
        # dict-valued entry along with missing (None) results.
        if scores is None or isinstance(scores, dict):
            continue
        method_scores[method] = np.round(scores, 4)

for method, scores in method_scores.items():
    positions = range(len(scores))
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.bar(positions, scores, color='skyblue')
    ax.set_title(f"Approximate Scores: {method}")
    ax.set_xlabel("Index")
    ax.set_ylabel("Score")
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    ax.set_xticks(positions)
    fig.tight_layout()
    plt.show()
    # Release the figure so that plotting many methods does not pile up
    # open figures in the kernel.
    plt.close(fig)

In [None]:
# Inspect F again: the third value returned by harness.compute_wss in the
# experiment loop (left over from the last question processed).
F