In [1]:
import sys
import os

# Pin the visible GPU *before* torch / transformers / accelerate are imported.
# CUDA reads CUDA_VISIBLE_DEVICES when it is first initialised; setting it after
# those imports only works if none of them has touched CUDA yet, which is fragile.
# NOTE(review): the device index "2" is machine-specific.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import random
import gc
import time
import torch
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
from scipy.sparse import csr_matrix
import itertools
from scipy.stats import spearmanr, pearsonr, kendalltau, rankdata
from sklearn.metrics import ndcg_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator
import nltk

# Tokenizer data used by nltk.sent_tokenize in the sentence-splitting cell.
nltk.download('punkt')

# Make the repository root (parent of the notebook directory) importable so the
# local SHapRAG package resolves.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)
# Star import: ContextAttribution (used in the evaluation cell) comes from here.
from SHapRAG import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the MuSiQue train file; it is JSON Lines (one record per line).
musique_train_path = "../data/musique/musique_ans_v1.0_train.jsonl"
df = pd.read_json(musique_train_path, lines=True)

In [3]:
def get_titles(lst):
    """Return up to 10 paragraph texts, supporting paragraphs first.

    Despite the name, this collects each entry's ``'paragraph_text'``, not its title.

    Args:
        lst: list of dicts, each with a ``'paragraph_text'`` key and an optional
            ``'is_supporting'`` flag.

    Returns:
        A list of at most 10 strings: every text whose ``is_supporting`` is True
        (in input order), followed by the remaining texts that do not also occur
        among the supporting ones (in input order).
    """
    supporting = []
    for entry in lst:
        if entry.get('is_supporting') == True:
            supporting.append(entry['paragraph_text'])

    others = []
    for entry in lst:
        if entry.get('is_supporting') != True:
            text = entry['paragraph_text']
            # Skip distractors whose text duplicates a supporting paragraph.
            if text not in supporting:
                others.append(text)

    return (supporting + others)[:10]

# Replace each question's paragraph dicts with the (at most 10) selected paragraph texts.
df["paragraphs"] = df["paragraphs"].apply(get_titles)

In [None]:
def _insert_duplicate_of_second(paragraphs):
    """Return a copy of `paragraphs` with a copy of element 1 inserted at position 5."""
    head, tail = paragraphs[:5], paragraphs[5:]
    return head + [paragraphs[1]] + tail


# Re-running this cell inserts one more duplicate each time.
df["paragraphs"] = df["paragraphs"].apply(_insert_duplicate_of_second)

In [4]:
# Split every paragraph into sentences and tag each sentence as
# "<running sentence index within the question>-<paragraph index>-<sentence text>".
all_sents = []
for row in range(len(df.question)):
    tagged = []
    for doc_idx, paragraph in enumerate(df.paragraphs[row]):
        for sentence in nltk.sent_tokenize(paragraph):
            # len(tagged) is the number of sentences already emitted for this
            # question, i.e. the running sentence index.
            tagged.append(f"{len(tagged)}-{doc_idx}-{sentence}")
    all_sents.append(tagged)
df['Sentences'] = all_sents

In [5]:
# Seed handed to the attribution calls (seed=SEED) in the evaluation cell.
# This cell does not itself seed random / numpy / torch.
SEED = 42
# Initialize Accelerator with fp16 mixed precision
accelerator_main = Accelerator(mixed_precision="fp16")

# Load Model (progress messages are printed by the main process only)
if accelerator_main.is_main_process:
    print("Main Script: Loading model...")
# Alternative checkpoints are kept commented out; exactly one model_path is active.
# model_path = "mistralai/Mistral-7B-Instruct-v0.3"
model_path = "meta-llama/Llama-3.1-8B-Instruct"
# model_path = "Qwen/Qwen2.5-3B-Instruct"

# Weights are loaded in half precision.
model_cpu = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# If the tokenizer defines no pad token, reuse EOS and mirror the id into the
# model config (and the generation config when one exists).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model_cpu.config.pad_token_id = tokenizer.pad_token_id
    if hasattr(model_cpu, 'generation_config') and model_cpu.generation_config is not None:
        model_cpu.generation_config.pad_token_id = tokenizer.pad_token_id

if accelerator_main.is_main_process:
    print("Main Script: Preparing model with Accelerator...")
# prepared_model is what later cells pass to the attribution harness; the
# unwrapped handle is only used here to switch the model to eval mode.
prepared_model = accelerator_main.prepare(model_cpu)
unwrapped_prepared_model = accelerator_main.unwrap_model(prepared_model)
unwrapped_prepared_model.eval()
if accelerator_main.is_main_process:
    print("Main Script: Model prepared and set to eval.")

# Wait until every process has finished model setup before continuing.

accelerator_main.wait_for_everyone()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Main Script: Loading model...


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.58it/s]


Main Script: Preparing model with Accelerator...
Main Script: Model prepared and set to eval.


In [None]:
# Attribution experiment: for each question, build a ContextAttribution harness over
# the question's sentences, compute attribution scores with several methods, and
# evaluate them (top-k probability / divergence, LDS, surrogate R^2).
# Outputs: all_results (one dict per question), r2_fm, r2_cc.

# num_questions_to_run=len(df.question)
num_questions_to_run=10
k_values = [2]
all_metrics_data = []
all_results=[]
LDSs=[]
r2_fm=[]
r2_cc=[]

# Loop-invariant settings, hoisted out of the per-question loop.
utility_cache_base_dir = "../Experiment_data/musique/Sentence1"
m_samples_map = {"L": 512}
# m_samples_map = {"L": 128, "XL":256, "XXL":512}
T_iterations_map = {"L":40, "XL":50, "XXL":60}  # only used by the disabled BetaShap / TMC calls

if accelerator_main.is_main_process: # Only main process creates directories
    os.makedirs(utility_cache_base_dir, exist_ok=True)

for i in tqdm(range(num_questions_to_run), disable=not accelerator_main.is_main_process):
    query = df.question[i]
    if accelerator_main.is_main_process:
        print(f"\n--- Question {i+1}/{num_questions_to_run}: {query[:60]}... ---")

    # docs=df.paragraphs[i]
    # Each entry of df.Sentences is "<sentence idx>-<paragraph idx>-<text>".
    # Split on the first two hyphens only: a fixed slice (sent[4:]) is wrong as soon
    # as either index has two digits, and hyphens inside the text must survive.
    # NOTE(review): utility caches written while the fixed slice was in use were
    # presumably computed on the malformed items -- regenerate them.
    docs = [sent.split('-', 2)[2] for sent in df.Sentences[i]]

    utility_cache_filename = f"utilities_q_idx{i}.pkl" # More robust naming
    current_utility_path = os.path.join(utility_cache_base_dir, utility_cache_filename)

    # Initialize Harness
    harness = ContextAttribution(
        items=docs,
        query=query,
        prepared_model_for_harness=prepared_model,
        tokenizer_for_harness=tokenizer,
        accelerator_for_harness=accelerator_main,
        utility_cache_path=current_utility_path
    )

    print(f'Response: {harness.target_response}')
    # Compute metrics
    results_for_query = {}
    if accelerator_main.is_main_process:
        for size_key, num_s in m_samples_map.items():
            # For sizes other than "L", cap the budget at the number of non-empty
            # subsets when the item set is tiny.
            num_subsets = 2**len(docs)
            if num_subsets < num_s and size_key != "L":
                actual_samples = max(1, num_subsets - 1)
            else:
                actual_samples = num_s

            if actual_samples > 0:
                results_for_query[f"ContextCite{actual_samples}"], model_cc = harness.compute_contextcite(num_samples=actual_samples, seed=SEED)
                results_for_query[f"FM_Weights{actual_samples}"], F, modelfm = harness.compute_wss(num_samples=actual_samples, seed=SEED, sampling="kernelshap",sur_type="fm")
                # results_for_query[f"BetaShap{actual_samples}"] = harness.compute_beta_shap(num_iterations_max=T_iterations_map[size_key], beta_a=16, beta_b=1, max_unique_lookups=actual_samples, seed=SEED)
                # results_for_query[f"TMC{actual_samples}"] = harness.compute_tmc_shap(num_iterations_max=T_iterations_map[size_key], performance_tolerance=0.001, max_unique_lookups=actual_samples, seed=SEED)

        results_for_query["LOO"] = harness.compute_loo()
        results_for_query["ARC-JSD"] = harness.compute_arc_jsd()

        prob_topk = harness.evaluate_topk_performance(
            results_for_query,
            k_values,
            utility_type="probability"
        )

        div_topk = harness.evaluate_topk_performance(
            results_for_query,
            k_values,
            utility_type="divergence"
        )

        r2_fm.append(harness.r2(30, modelfm, method='fm'))
        r2_cc.append(harness.r2(30, model_cc, method='cc'))

        # One LDS value per attribution method, keyed by method name so the
        # aggregation cell can read entry['LDS'][method]. Computed once per method.
        # At this point results_for_query holds only the attribution scores.
        # NOTE(review): an earlier branch passed utl=True, model=modelfm for keys
        # containing "FM_Shap"; no such key is produced any more (the FM result is
        # stored as "FM_Weights<n>"), so every method uses the plain call, matching
        # what the previous code ultimately stored.
        LDS = {
            method_name: harness.lds(scores, 30)
            for method_name, scores in results_for_query.items()
        }

        results_for_query["topk_probability"] = prob_topk
        results_for_query["topk_divergence"] = div_topk
        results_for_query["LDS"] = LDS
        harness.save_utility_cache(current_utility_path)

        all_results.append(results_for_query)

  0%|          | 0/10 [00:00<?, ?it/s]


--- Question 1/10: When was the institute that owned The Collegian founded?... ---
Response: I couldn't find any information about the institute that owns The Collegian in the provided context.


100%|██████████| 25/25 [00:05<00:00,  4.65it/s]
 10%|█         | 1/10 [02:24<21:39, 144.42s/it]


--- Question 2/10: What year saw the creation of the region where the county of... ---
Response: 1994.


100%|██████████| 36/36 [00:12<00:00,  2.98it/s]
 20%|██        | 2/10 [05:46<23:47, 178.43s/it]


--- Question 3/10: When was the abolishment of the studio that distributed The ... ---
Response: 1999


In [None]:
# Aggregate per-question metrics into one mean per attribution method.
candidate_methods = ['ContextCite512', 'FM_Shap512', 'FM_Weights512', 'LOO', 'ARC-JSD']
TOP_K = 2  # must be one of the k_values used in the evaluation loop

# Keep only methods that were actually computed for every question; a name with
# no stored result (e.g. a renamed method) would otherwise raise KeyError below.
methods = [m for m in candidate_methods if all(m in entry for entry in all_results)]
skipped_methods = [m for m in candidate_methods if m not in methods]
if skipped_methods:
    print("Skipping methods with no stored results:", skipped_methods)


def _lds_as_dict(lds_entry):
    """Return the stored LDS as {method: value}; also accepts a list of one-key dicts."""
    if isinstance(lds_entry, dict):
        return lds_entry
    merged = {}
    for part in lds_entry:
        merged.update(part)
    return merged


# Initialize lists
topk_probs = {method: [] for method in methods}
topk_divs = {method: [] for method in methods}
LDSs = {method: [] for method in methods}

# Collect values
for entry in all_results:
    lds_for_entry = _lds_as_dict(entry['LDS'])
    for method in methods:
        topk_probs[method].append(entry['topk_probability'][method][TOP_K])
        topk_divs[method].append(entry['topk_divergence'][method][TOP_K])
        LDSs[method].append(lds_for_entry[method])

# Compute means (LDS is averaged over the collected per-question values in LDSs,
# not over the leftover per-question `LDS` variable from the evaluation loop).
mean_topk_probs = {method: np.mean(topk_probs[method]) for method in methods}
mean_topk_divs = {method: np.mean(topk_divs[method]) for method in methods}
mean_LDSs = {method: np.mean(LDSs[method]) for method in methods}

print("Mean topk_probability:", mean_topk_probs)
print("Mean topk_divergence:", mean_topk_divs)
print("Mean LDS:", mean_LDSs)

In [None]:
# Inspect the selected paragraph texts for the question at index 3.
df.paragraphs[3]

In [None]:
# Inspect the stored results dict for the fourth processed question
# (requires at least four entries in all_results).
all_results[3]

In [None]:
# Sum (not mean) of the per-question harness.r2(..., method='fm') values.
sum(r2_fm)

In [None]:
# Print, per question, the 'cc' R^2 value next to the 'fm' R^2 value.
for q_idx, cc_score in enumerate(r2_cc):
    print(cc_score, r2_fm[q_idx])