In [1]:
import sys
import os
import random
import gc
import time
import torch
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
from scipy.stats import spearmanr, pearsonr, kendalltau, rankdata
from sklearn.metrics import ndcg_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator
import nltk

current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)
from SHapRAG import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#df= pd.read_csv("../data/NQ.csv",index_col=False, nrows=5)
df= pd.read_csv("../scripts/nq_reforged_without_titles.csv",index_col=False, nrows=5)

In [3]:
len(df.question)
df.head()

Unnamed: 0,question,context,answer
0,total number of death row inmates in the us,['on death row in the United States on January...,2718
1,big little lies season 2 how many episodes,['series garnered several accolades. It receiv...,seven
2,who sang waiting for a girl like you,"['Waiting for a Girl Like You ""Waiting for a G...",Foreigner
3,where do you cross the arctic circle in norway,['Arctic Norway Arctic Norway () comprises the...,Saltfjellet
4,who is the main character in green eggs and ham,['Green Eggs and Ham Green Eggs and Ham is a c...,Sam-I-am


In [4]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ulb/code_wit/ekuzmenk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
all_sents = []
for i in range(len(df.question)):
    n = 0
    docs=ast.literal_eval(df.context[i])
    doc_sents = []
    for j in range(len(docs)):
        sents = nltk.sent_tokenize(docs[j])
        new_sents = []
        for s in range(len(sents)):
            new_sents.append(str(n + s) + '-' + str(j) + '-' + sents[s])
        n += len(sents)
        doc_sents.append(new_sents)
    flat_doc_sents = [
    x
    for xs in doc_sents
    for x in xs
]
    all_sents.append(flat_doc_sents)
df['Sentences'] = all_sents

In [14]:
df.Sentences[2]

['0-0-Waiting for a Girl Like You "Waiting for a Girl Like You" is a 1981 power ballad by the British-American rock band Foreigner.',
 '1-0-The distinctive synthesizer theme was performed by the then-little-known Thomas Dolby, and this song also marked a major departure from their earlier singles because their previous singles were mid to upper tempo rock songs while this song was a softer love song with the energy of a power ballad.',
 '2-0-It was the second single released from the album "4" (1981) and was co-written by Lou Gramm and Mick Jones.',
 "3-0-It has become one of the band's most",
 '4-1-held off the number 1 spot by Olivia Newton-John\'s single "Physical" for nine consecutive weeks, and then by Hall & Oates\' "I Can\'t Go for That (No Can Do)" for a tenth week on January 30, 1982.',
 '5-1-Because of its chart longevity, it ended up being the number 19 song on the Top 100 singles of 1982.',
 '6-1-The song was the band\'s biggest hit until "I Want to Know What Love Is" hit n

In [7]:
SEED = 42
# Initialize Accelerator
accelerator_main = Accelerator(mixed_precision="fp16")

# Load Model
if accelerator_main.is_main_process:
    print("Main Script: Loading model...")
#model_path = "mistralai/Mistral-7B-Instruct-v0.3"
#model_path = "meta-llama/Llama-3.1-8B-Instruct"
model_path = "Qwen/Qwen2.5-3B-Instruct"

model_cpu = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model_cpu.config.pad_token_id = tokenizer.pad_token_id
    if hasattr(model_cpu, 'generation_config') and model_cpu.generation_config is not None:
        model_cpu.generation_config.pad_token_id = tokenizer.pad_token_id

if accelerator_main.is_main_process:
    print("Main Script: Preparing model with Accelerator...")
prepared_model = accelerator_main.prepare(model_cpu)
unwrapped_prepared_model = accelerator_main.unwrap_model(prepared_model)
unwrapped_prepared_model.eval()
if accelerator_main.is_main_process:
    print("Main Script: Model prepared and set to eval.")

# Define utility cache

accelerator_main.wait_for_everyone()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Main Script: Loading model...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.82s/it]


Main Script: Preparing model with Accelerator...
Main Script: Model prepared and set to eval.


In [8]:
#num_questions_to_run=len(df.question)
num_questions_to_run=5
all_metrics_data = []
all_results=[]
Fs=[]
mse_fms=[]
mse_ccs = []
for i in tqdm(range(num_questions_to_run), desc="Processing Questions", disable=not accelerator_main.is_main_process):
    query = df.question[i]
    if accelerator_main.is_main_process:
        print(f"\n--- Question {i+1}/{num_questions_to_run}: {query[:60]}... ---")

    #docs=ast.literal_eval(df.context[i])
    docs = [sent[4:] for sent in df.Sentences[i]]
    utility_cache_base_dir = "../Experiment_data/NQ"
    utility_cache_filename = f"utilities_q_idx{i}_n{len(docs)}.pkl" # More robust naming
    current_utility_path = os.path.join(utility_cache_base_dir, utility_cache_filename)
    
    if accelerator_main.is_main_process: # Only main process creates directories
        os.makedirs(os.path.dirname(current_utility_path), exist_ok=True)
        print(f"  Instantiating ShapleyExperimentHarness for Q{i} (n={len(docs)} docs)...")
    
    # Initialize Harness
    harness = ContextAttribution(
        items=docs,
        query=query,
        prepared_model_for_harness=prepared_model,
        tokenizer_for_harness=tokenizer,
        accelerator_for_harness=accelerator_main,
        verbose=False
    )
    # Compute metrics
    print('Response: ', harness.target_response)
    results_for_query = {}
    # M.append(harness.compute_shapley_interaction_index_pairs_matrix())
    if accelerator_main.is_main_process:

        # results_for_query["ExactLinear"], mse_lin = harness.compute_exact_linear_shap()
        # results_for_query["ExactInter"], pair, mse_inter = harness.compute_exact_inter_shap()
        # pairs.append(pair)
        # mse_lins.append(mse_lin)
        # mse_inters.append(mse_inter)
        m_samples_map = {"L": 100} 
        T_iterations_map = { "L":40} 

        for size_key, num_s in m_samples_map.items():
            if 2**len(docs) < num_s and size_key != "L":
                actual_samples = max(1, 2**len(docs)-1 if 2**len(docs)>0 else 1)
            else:
                actual_samples = num_s

            if actual_samples > 0: 
                results_for_query[f"ContextCite{actual_samples}"], mse_cc = harness.compute_contextcite(num_samples=actual_samples, seed=SEED)
                results_for_query[f"WSS_FM{actual_samples}"], F, mse_fm = harness.compute_wss(num_samples=actual_samples, seed=SEED)
                Fs.append(F)
                mse_fms.append(mse_fm)
                mse_ccs.append(mse_cc)
                #results_for_query[f"BetaShap{actual_samples}"] = harness.compute_beta_shap(num_iterations_max=T_iterations_map[size_key], beta_a=4, beta_b=4, max_unique_lookups=actual_samples, seed=SEED)
                #results_for_query[f"TMC{actual_samples}"] = harness.compute_tmc_shap(num_iterations_max=T_iterations_map[size_key], performance_tolerance=0.001, max_unique_lookups=actual_samples, seed=SEED)

        results_for_query["LOO"] = harness.compute_loo()
        results_for_query["ARC-JSD"] = harness.compute_arc_jsd()

        # exact_scores = results_for_query.get("ExactInter")
        all_results.append(results_for_query)




--- Question 1/5: total number of death row inmates in the us... ---
  Instantiating ShapleyExperimentHarness for Q0 (n=59 docs)...
Response:  5,445



  0%|                                                                                                                   | 0/59 [00:00<?, ?it/s][A
  2%|█▊                                                                                                         | 1/59 [00:00<00:07,  7.70it/s][A
  3%|███▋                                                                                                       | 2/59 [00:00<00:07,  7.91it/s][A
  5%|█████▍                                                                                                     | 3/59 [00:00<00:07,  7.93it/s][A
  7%|███████▎                                                                                                   | 4/59 [00:00<00:06,  7.96it/s][A
  8%|█████████                                                                                                  | 5/59 [00:00<00:06,  7.99it/s][A
 10%|██████████▉                                                                                                | 6/5


--- Question 2/5: big little lies season 2 how many episodes... ---
  Instantiating ShapleyExperimentHarness for Q1 (n=58 docs)...
Response:  Seven



  0%|                                                                                                                   | 0/58 [00:00<?, ?it/s][A
  2%|█▊                                                                                                         | 1/58 [00:00<00:06,  8.22it/s][A
  3%|███▋                                                                                                       | 2/58 [00:00<00:06,  8.10it/s][A
  5%|█████▌                                                                                                     | 3/58 [00:00<00:06,  8.00it/s][A
  7%|███████▍                                                                                                   | 4/58 [00:00<00:06,  7.97it/s][A
  9%|█████████▏                                                                                                 | 5/58 [00:00<00:06,  7.97it/s][A
 10%|███████████                                                                                                | 6/5


--- Question 3/5: who sang waiting for a girl like you... ---
  Instantiating ShapleyExperimentHarness for Q2 (n=48 docs)...
Response:  Foreigner



  0%|                                                                                                                   | 0/48 [00:00<?, ?it/s][A
  2%|██▏                                                                                                        | 1/48 [00:00<00:05,  9.17it/s][A
  6%|██████▋                                                                                                    | 3/48 [00:00<00:04,  9.02it/s][A
  8%|████████▉                                                                                                  | 4/48 [00:00<00:05,  8.46it/s][A
 10%|███████████▏                                                                                               | 5/48 [00:00<00:05,  8.46it/s][A
 12%|█████████████▍                                                                                             | 6/48 [00:00<00:05,  8.27it/s][A
 15%|███████████████▌                                                                                           | 7/4


--- Question 4/5: where do you cross the arctic circle in norway... ---
  Instantiating ShapleyExperimentHarness for Q3 (n=55 docs)...
Response:  Saltfjellet separates Helgeland from the northern part of Nordland county in Norway.



  0%|                                                                                                                   | 0/55 [00:00<?, ?it/s][A
  2%|█▉                                                                                                         | 1/55 [00:00<00:07,  7.65it/s][A
  4%|███▉                                                                                                       | 2/55 [00:00<00:06,  7.76it/s][A
  5%|█████▊                                                                                                     | 3/55 [00:00<00:06,  7.72it/s][A
  7%|███████▊                                                                                                   | 4/55 [00:00<00:06,  7.73it/s][A
  9%|█████████▋                                                                                                 | 5/55 [00:00<00:06,  7.66it/s][A
 11%|███████████▋                                                                                               | 6/5


--- Question 5/5: who is the main character in green eggs and ham... ---
  Instantiating ShapleyExperimentHarness for Q4 (n=57 docs)...
Response:  Sam-I-Am



  0%|                                                                                                                   | 0/57 [00:00<?, ?it/s][A
  2%|█▉                                                                                                         | 1/57 [00:00<00:07,  8.00it/s][A
  4%|███▊                                                                                                       | 2/57 [00:00<00:06,  7.92it/s][A
  5%|█████▋                                                                                                     | 3/57 [00:00<00:06,  7.97it/s][A
  7%|███████▌                                                                                                   | 4/57 [00:00<00:06,  7.89it/s][A
  9%|█████████▍                                                                                                 | 5/57 [00:00<00:06,  7.87it/s][A
 11%|███████████▎                                                                                               | 6/5

In [10]:
print(f"ContextCite: {sum(mse_ccs)},\nFM: {sum(mse_fms)}")

ContextCite: 8.954175284012365,
FM: 0.6846025364072261


In [15]:
mse_fms

[0.0004207625899119422,
 0.5035804808509122,
 0.141468226282803,
 0.0019384622989517004,
 0.0371946043846473]

In [16]:
mse_ccs

[0.06041454111178961,
 3.6320756326295496,
 4.4785244270010285,
 0.022268405538348003,
 0.7608922777316499]

In [6]:
results_for_query

{'ContextCite32': array([-0.41971942,  0.12092963,  0.26682114,  2.36522086,  1.05925235,
        -0.51060558,  0.40442082,  0.        ,  0.24116468, -0.58990014]),
 'WSS_FM32': array([-0.40992928,  0.18123221,  0.53824333,  2.16647781,  1.34779097,
        -0.17111263, -0.43439963,  0.00655496, -0.25640965, -0.37507313]),
 'BetaShap32': array([-1.51930572,  0.70642529, -0.96471328,  3.96875529,  0.44023249,
        -0.233385  ,  0.15917291,  0.63305779,  0.21233613,  0.32725782]),
 'TMC32': array([-4.59314227e-01,  5.38640752e-01, -2.77367312e-01,  2.68052466e+00,
         2.09986234e-01, -4.23926890e-01,  2.07426333e-01,  3.88081467e-01,
         8.58408689e-02,  2.05065310e-03]),
 'LOO': array([-1.65271187, -0.01576805, -0.85803461,  0.63380051, -0.83787179,
        -0.30050182, -0.35937691, -0.35254407, -0.47583818, -0.85098219]),
 'ARC-JSD': [0.003660617076093331,
  0.003243437799932636,
  0.002352682964954056,
  0.05478103106759846,
  0.002019960490466488,
  0.0006410647259826874

In [14]:
import matplotlib.pyplot as plt
#import sys
#import matplotlib
#matplotlib.use('Qt5Agg')



for result in range(len(all_results)):
    method_scores = {}
    for method, scores in all_results[result].items():
        if scores is not None:
            method_scores[method] = np.round(scores, 4)

    for method, scores in method_scores.items():
        plt.figure(figsize=(10, 4))
        plt.bar(range(len(scores)), scores, color='skyblue')
        plt.title(f"Approximate Scores: {method}")
        plt.xlabel("Index")
        plt.ylabel("Score")
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.xticks(range(len(scores)))
        plt.tight_layout()
        plt.savefig(f'100_sents/{result}_{method}.png')

In [8]:
harness.target_response

'Sam-I-Am'