# Imports

In [14]:
import sys
import json
import os
import nltk
import torch
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
from scipy.stats import spearmanr, pearsonr, kendalltau, rankdata
from sklearn.metrics import ndcg_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator
from nltk.tokenize import sent_tokenize

# Append the parent of the current working directory to the module search
# path (presumably so the project-local package imported below resolves).
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
sys.path.append(parent_dir)
from SHapRAG import *

# Data Loading & Preparation

For TyDi QA, each question comes with the index of the candidate passage that contains the answer. An index of `-1` does NOT mean that the answer is in the last passage chunk; it means that no passage contains the answer to that question.

##  Long Version Dataset

In [100]:
# Load the long-version dump: a JSON Lines file with one JSON record per line.
# The encoding is pinned to UTF-8 because the records span several languages
# (they are filtered on their 'language' field in the next cell) and the
# platform default encoding is not guaranteed to decode them.
# Blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
with open('../data/tydi.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [101]:
# Keep only the records whose 'language' field is exactly 'english',
# preserving their original order.
english_data = [record for record in data if record['language'] == 'english']

In [None]:
# Peek at the fields of the first English record that are used below.
# Only the value of the last expression is rendered as the cell output.
english_data[0]['question_text']  # the question
english_data[0]['document_plaintext']  # the full document text, used as context
english_data[0]['document_title']  # the document title (this is what gets displayed)
# english_data[2]


'Zebra finch'

In [110]:
# Gather title, context and question of every English record column by
# column, then turn the columns into a DataFrame (column order: title,
# context, question).
filtered_data = {
    'title': [record['document_title'] for record in english_data],
    'context': [record['document_plaintext'] for record in english_data],
    'question': [record['question_text'] for record in english_data],
}

df = pd.DataFrame.from_dict(filtered_data)

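## Short Version Dataset

**Note:** this section reassigns `data`, `filtered_data` and `df`, replacing the long-version objects built above. Run only one of the two dataset sections before the "Sentence Level" step; the outputs shown further down come from the long version.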

In [48]:
# Load the short-version dataset from a single JSON document.
# A context manager guarantees the file handle is closed (the previous bare
# open() never closed it), and the encoding is pinned to UTF-8 so decoding
# does not depend on the platform default.
with open('../data/tydiqa.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [49]:
# Walk the nested structure of the first entry to show where each field lives.
# Only the value of the last expression is rendered as the cell output.
data['data'][0]['title']  # title of the entry
data['data'][0]['paragraphs'][0]['context']  # context of its first paragraph
data['data'][0]['paragraphs'][0]['qas'][0]['question']  # first question on that paragraph
data['data'][0]['paragraphs'][0]['qas'][0]['answers'][0]["text"]  # text of that question's first answer

'Wound care'

In [50]:
# Flatten the nested dataset into four parallel columns. For every entry only
# the first paragraph, its first question and that question's first answer
# are used.
filtered_data = {"title": [], "context": [], "question": [], "answer": []}
for entry in data['data']:
    filtered_data['title'].append(entry['title'])
    first_paragraph = entry['paragraphs'][0]
    filtered_data['context'].append(first_paragraph['context'])
    first_qa = first_paragraph['qas'][0]
    filtered_data['question'].append(first_qa['question'])
    filtered_data['answer'].append(first_qa['answers'][0]["text"])

df = pd.DataFrame.from_dict(filtered_data)

In [51]:
# Show the first rows of the frame as a quick sanity check of the columns.
df.head()

Unnamed: 0,title,context,question,answer
0,Wound healing,Wound care encourages and speeds wound healing...,What is a way to increase your wound healing s...,Wound care
1,Burntisland Shipbuilding Company,Brothers Amos and Wilfrid Ayre founded Burntis...,Who founded the Burntisland Shipbuilding Company?,Amos and Wilfrid Ayre
2,Cerebral cortex,The cerebral cortex is folded in a way that al...,What is the surface area of the human cortex?,1.3 square feet
3,Agatha Christie,Guinness World Records lists Christie as the b...,How many units has Agatha Christie sold?,2 billion copies
4,Nájera,The town was conquered by Ordoño II of Leon fo...,When was Nájera established?,923


## Sentence Level

In [112]:
# Replace each context string with the list of sentences produced by NLTK's
# sentence tokenizer. Not idempotent: the column holds lists afterwards, so
# this cell must be run exactly once per freshly built frame.
df['context'] = df['context'].apply(sent_tokenize)

In [113]:
# Number of elements in each context (sentences, once the context column
# has been tokenized).
df['context_size'] = df['context'].apply(len)

In [125]:
# Keep only rows whose context has at least 10 elements. The index is not
# reset here, so the remaining row labels may have gaps.
df = df.loc[df['context_size'] >= 10].copy()

In [126]:
# (rows, columns) remaining after the context-size filter.
df.shape

(979, 4)

# Running Experiment

## Setup

In [128]:
SEED = 42


def _log_on_main(message):
    """Print `message` only on the accelerator's main process."""
    if accelerator_main.is_main_process:
        print(message)


# Initialize Accelerator
accelerator_main = Accelerator(mixed_precision="fp16")

# Load model and tokenizer
_log_on_main("Main Script: Loading model...")
# Alternative checkpoints:
# model_path = "mistralai/Mistral-7B-Instruct-v0.3"
# model_path = "meta-llama/Llama-3.1-8B-Instruct"
model_path = "Qwen/Qwen2.5-3B-Instruct"

model_cpu = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# When the tokenizer has no pad token, reuse the EOS token and mirror its id
# into the model config and (if present) the generation config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model_cpu.config.pad_token_id = tokenizer.pad_token_id
    generation_config = getattr(model_cpu, 'generation_config', None)
    if generation_config is not None:
        generation_config.pad_token_id = tokenizer.pad_token_id

# Hand the model to the accelerator and switch the underlying module to eval mode.
_log_on_main("Main Script: Preparing model with Accelerator...")
prepared_model = accelerator_main.prepare(model_cpu)
unwrapped_prepared_model = accelerator_main.unwrap_model(prepared_model)
unwrapped_prepared_model.eval()
_log_on_main("Main Script: Model prepared and set to eval.")

# Synchronize all processes before the experiment starts.
accelerator_main.wait_for_everyone()

Main Script: Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Main Script: Preparing model with Accelerator...
Main Script: Model prepared and set to eval.


In [132]:
# Restrict the run to a small subset of questions. `.loc[0:2]` slices by index
# LABEL and includes both endpoints, so this keeps the rows labelled 0 through 2
# that are still present. Not idempotent only in the sense that `df` is
# overwritten: the full frame must be rebuilt to run on more questions.
df = df.loc[0:2].copy()

df

Unnamed: 0,title,context,question,context_size
0,Zebra finch,[\nThe zebra finch (Taeniopygia guttata)[2] is...,Do zebra finches have stripes?,205
1,Christian dietary laws,"[\nIn mainstream Nicene Christianity, there is...",Does Catholicism have any dietary restrictions?,18
2,Wound healing,[\nWound healing is a complex process in which...,What is a way to increase your wound healing s...,349


In [133]:
# Run every attribution method on each question and collect one result dict
# per question in `all_results` (filled on the main process only).
num_questions_to_run = len(df.question)
# num_questions_to_run=1
all_metrics_data = []
all_results = []
M = []
Fs = []
pairs = []
mse_inters = []
mse_lins = []
mse_fms = []
dataset_name = 'tidyQA'

# Loop-invariant configuration, hoisted out of the per-question loop.
utility_cache_base_dir = f"../Experiment_data/{dataset_name}"
m_samples_map = {"L": 32}     # size key -> sample / unique-lookup budget
T_iterations_map = {"L": 20}  # size key -> max iterations for BetaShap / TMC

for i in tqdm(range(num_questions_to_run), desc="Processing Questions", disable=not accelerator_main.is_main_process):
    # `i` is a position, not an index label: the frame was filtered and sliced
    # without resetting its index, so label-based access (`df.question[i]`)
    # raises KeyError as soon as the labels have a gap. Use `.iloc` instead.
    query = df.question.iloc[i]
    if accelerator_main.is_main_process:
        print(f"\n--- Question {i+1}/{num_questions_to_run}: {query[:60]}... ---")

    # The context is normally already a list of sentences; anything else is
    # parsed as a Python literal (presumably a stringified list -- confirm).
    context_value = df.context.iloc[i]
    if isinstance(context_value, list):
        docs = context_value
    else:
        docs = ast.literal_eval(context_value)

    utility_cache_filename = f"utilities_q_idx{i}_n{len(docs)}.pkl"  # More robust naming
    current_utility_path = os.path.join(utility_cache_base_dir, utility_cache_filename)
    # NOTE(review): current_utility_path is only used to create its directory;
    # it is never passed to the harness. Confirm whether caching is intended.

    if accelerator_main.is_main_process:  # Only main process creates directories
        os.makedirs(os.path.dirname(current_utility_path), exist_ok=True)
        print(f"  Instantiating ShapleyExperimentHarness for Q{i} (n={len(docs)} docs)...")

    # Initialize Harness (on every process)
    harness = ContextAttribution(
        items=docs,
        query=query,
        prepared_model_for_harness=prepared_model,
        tokenizer_for_harness=tokenizer,
        accelerator_for_harness=accelerator_main,
        verbose=False
    )

    # Compute metrics (main process only)
    results_for_query = {}
    # M.append(harness.compute_shapley_interaction_index_pairs_matrix())
    if accelerator_main.is_main_process:
        for size_key, num_s in m_samples_map.items():
            # For every size key except "L", cap the budget when there are
            # fewer than `num_s` possible subsets of the documents.
            if size_key != "L" and 2**len(docs) < num_s:
                actual_samples = max(1, 2**len(docs) - 1)
            else:
                actual_samples = num_s

            print(f"  Sampling budget for size '{size_key}': {actual_samples}")

            if actual_samples > 0:
                results_for_query[f"ContextCite{actual_samples}"] = harness.compute_contextcite(num_samples=actual_samples, seed=SEED)

                # results_for_query[f"WSS_FM{actual_samples}"], F, mse_fm = harness.compute_wss(num_samples=actual_samples, seed=SEED)
                # Fs.append(F)
                # mse_fms.append(mse_fm)
                results_for_query[f"BetaShap{actual_samples}"] = harness.compute_beta_shap(num_iterations_max=T_iterations_map[size_key], beta_a=4, beta_b=4, max_unique_lookups=actual_samples, seed=SEED)
                results_for_query[f"TMC{actual_samples}"] = harness.compute_tmc_shap(num_iterations_max=T_iterations_map[size_key], performance_tolerance=0.001, max_unique_lookups=actual_samples, seed=SEED)

        results_for_query["LOO"] = harness.compute_loo()
        results_for_query["ARC-JSD"] = harness.compute_arc_jsd()

        # exact_scores = results_for_query.get("ExactInter")
        all_results.append(results_for_query)

Processing Questions:   0%|          | 0/3 [00:00<?, ?it/s]


--- Question 1/3: Do zebra finches have stripes?... ---
  Instantiating ShapleyExperimentHarness for Q0 (n=205 docs)...




HERE 32
HERE


  0%|          | 0/205 [00:00<?, ?it/s]


--- Question 2/3: Does Catholicism have any dietary restrictions?... ---
  Instantiating ShapleyExperimentHarness for Q1 (n=18 docs)...




HERE 32
HERE


  0%|          | 0/18 [00:00<?, ?it/s]


--- Question 3/3: What is a way to increase your wound healing speed?... ---
  Instantiating ShapleyExperimentHarness for Q2 (n=349 docs)...




HERE 32
HERE


  0%|          | 0/349 [00:00<?, ?it/s]

In [139]:
# Persist the per-question result dicts. `all_results` is a Python list of
# dicts, so NumPy presumably stores it as a pickled object array -- which is
# why the matching load needs allow_pickle=True.
np.save('../Experiment_data/tidyQA/all_results.npy', all_results)

In [141]:
# Reload the saved results. allow_pickle=True unpickles arbitrary Python
# objects, so only use it on files produced by this notebook, never on
# untrusted .npy files.
results = np.load('../Experiment_data/tidyQA/all_results.npy', allow_pickle=True)