In [1]:
import sys
import os
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)

from SHapRAG import*
import pandas as pd
from scipy.stats import spearmanr, pearsonr
import time
# torch.cuda.set_device(1)  # Use GPU1, or 2, or 3


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Queries available to the experiments below; index 0 is the weather question
# that the demo cells run against.
queries = [
    "What is the weather like in the capital of France?",
    "Germany is good?",
    "Paris is the capital of Germany?",
]

# Small hand-written corpus whose items are attributed by the Shapley methods.
# Candidate documents currently left out of the corpus:
#   "The Eiffel Tower is located in Paris, France."
#   "France borders several countries including Germany."
documents = [
    "The weather in Paris is sunny today.",
    "Paris is the capital of France.",
    "The sun is shining in Paris today",
    "Berlin is the capital of Germany.",  # Irrelevant
    "The currency used in France is the Euro.",
    "Paris hosted the Summer Olympics in 1900 and 1924.",
    "Germany uses the Euro as well.",  # Redundant info
    "It is cloudy in Berlin today.",
]
# target_response = "Paris is sunny." # The ideal answer fragment

In [None]:
# 1. Build the harness. As its own log message says, construction pre-computes
#    the utility of every subset of `documents`, so this is the expensive step.
print("\nInstantiating ShapleyExperimentHarness (will pre-compute all utilities)...")
harness = ShapleyExperimentHarness(
    items=documents,
    query=queries[0],  # the weather question; reuse the shared list instead of a duplicated literal
    # llm_model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0", # Use a smaller model for faster demo if preferred
    llm_model_name="meta-llama/Meta-Llama-3.1-8B-Instruct", # Or your chosen model
    verbose=True
)
# 2. No target_response is passed above; the harness exposes the one it generated.
print(f"Harness target_response automatically generated: '{harness.target_response}'")
print(f"Number of items (n): {harness.n_items}")

# 3. Compute Attributions using different methods from the harness
results = {}
seed = 42 # For reproducibility of stochastic methods

print("\n--- Computing Attributions using Harness (from pre-computed utilities) ---")

# Adjust num_samples for WSS/ContextCite if n is very small
m_samples_for_approx = 64
results["ContextCite"] = harness.compute_contextcite_weights(num_samples=m_samples_for_approx, lasso_alpha=0.0, seed=seed) # LinReg
results["WSS"] = harness.compute_wss(num_samples=m_samples_for_approx, lasso_alpha=0.0, seed=seed) # LinReg

T_iterations = harness.n_items * 20 # Adjust iterations as needed
results["TMC"] = harness.compute_tmc_shap(num_iterations=T_iterations, performance_tolerance=0.001, seed=seed)
# NOTE(review): `beta_dist` is not defined in this notebook; presumably it is
# exported by `from SHapRAG import*` and is falsy when scipy's beta is unavailable.
if beta_dist:
    results["BetaShap (U)"] = harness.compute_beta_shap(num_iterations=T_iterations, beta_a=0.5, beta_b=0.5, seed=seed)

results["LOO"] = harness.compute_loo()
# With the 8 documents above, exact Shapley spans 2**8 = 256 subsets.
results["Exact"] = harness.compute_exact_shap()

# 4. Display Results
print("\n\n--- Attribution Scores (from Harness) ---")
# Create item labels
item_labels = [f'Doc {i}' for i in range(harness.n_items)]

# Keep only per-item score vectors; isinstance() already rejects None.
valid_results = {
    k: v for k, v in results.items()
    if isinstance(v, np.ndarray) and len(v) == harness.n_items
}

if valid_results:
    results_df = pd.DataFrame(valid_results, index=item_labels)
    print(results_df.round(4))

    if "Exact" in valid_results:
        print("\n--- Evaluation Metrics vs Exact Shapley ---")
        metrics_data = []
        exact_scores = valid_results["Exact"]
        for method, approx_scores in valid_results.items():
            if method == "Exact":
                continue

            # Per-item error of the approximation relative to exact Shapley.
            errors = approx_scores - exact_scores

            # Correlation is undefined when either vector is constant
            # (scipy returns NaN with a warning), so fall back to 1.0 when the
            # vectors agree numerically and 0.0 otherwise.
            if np.all(exact_scores == exact_scores[0]) or np.all(approx_scores == approx_scores[0]):
                pearson_c = spearman_c = 1.0 if np.allclose(exact_scores, approx_scores) else 0.0
            else:
                pearson_c, _ = pearsonr(exact_scores, approx_scores)
                spearman_c, _ = spearmanr(exact_scores, approx_scores)

            metrics_data.append({
                "Method": method,
                "MAE": np.mean(np.abs(errors)),
                "MSE": np.mean(errors ** 2),
                "Pearson": pearson_c,
                "Spearman": spearman_c
            })

        if metrics_data:
            metrics_df = pd.DataFrame(metrics_data).set_index("Method")
            print(metrics_df.round(4))
        else:
            print("No approximate methods to compare against Exact.")
else:
    print("No valid attribution results were computed by the harness.")


Instantiating ShapleyExperimentHarness (will pre-compute all utilities)...
Loading LLM 'meta-llama/Meta-Llama-3.1-8B-Instruct' on device 'cuda'...


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.76it/s]
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


LLM loaded successfully.
Generating target_response using all items...


Fetching 3 files: 100%|██████████| 3/3 [00:00<00:00, 12945.38it/s]


Generated target_response: 'The weather in the capital of France (Paris) is sunny today.'
Pre-computing utilities for all 256 subsets (n=8)...


Pre-computing Utilities: 100%|██████████| 256/256 [02:53<00:00,  1.47it/s]


Pre-computation complete. Made 256 LLM calls.
Harness target_response automatically generated: 'The weather in the capital of France (Paris) is sunny today.'
Number of items (n): 8

--- Computing Attributions using Harness (from pre-computed utilities) ---
Computing ContextCite Weights (m=64, using pre-computed utilities)...
Computing Weakly Supervised Shapley (m=64, using pre-computed utilities)...
Computing TMC-Shapley (T=160, using pre-computed utilities)...


                                                                    

Computing Beta-Shapley (T=160, α=0.5, β=0.5, using pre-computed utilities)...


                                                                                     

Computing LOO (n=8, using pre-computed utilities)...
Computing Exact Shapley (using pre-computed utilities)...


--- Attribution Scores (from Harness) ---
       ContextCite      WSS      TMC  BetaShap (U)     LOO    Exact
Doc 0      13.9144  14.9906  14.5110       14.8129  2.0640  14.9513
Doc 1       1.2630   2.1980   2.8406        5.3029  1.9820   2.5517
Doc 2      13.8312  15.3550  13.4659       15.8340  0.1303  13.2135
Doc 3      -1.2159  -0.0944   0.2742        2.1902 -0.1183  -0.1139
Doc 4      -0.7668  -0.9923  -0.2747        0.0036 -0.1946  -0.2381
Doc 5      -2.5924  -0.9990  -0.2695        0.4441 -0.0644  -0.1724
Doc 6       0.6996   0.9123   0.3246        1.1609 -0.2198   0.4397
Doc 7      -1.6877  -0.4979   0.0000        2.4306 -0.0896   0.2404

--- Evaluation Metrics vs Exact Shapley ---
                 MAE      MSE  Pearson  Spearman
Method                                          
ContextCite   1.1477   1.7816   0.9906    0.8333
WSS           0.6682   0.8417   0.9944   



In [None]:
def evaluate_shap(query, n_rag=5):
    """Score each approximate attribution method against exact Shapley values for one query.

    Builds a ShapleyExperimentHarness over the notebook-level `documents`, using a
    target response obtained from `compute_logprob(..., response=True)`, then runs
    every approximate method and compares its scores with `compute_exact_shap()`.

    Relies on notebook globals: `documents`, `model`, `tokenizer`,
    `compute_logprob`, `ShapleyExperimentHarness` and `beta_dist`.

    Args:
        query: The question passed to both the LLM call and the harness.
        n_rag: Only scales the iteration budget (10 * n_rag) of the TMC and
            Beta-Shapley runs; document retrieval is currently disabled, so it
            does not change which documents are used.

    Returns:
        A list with one dict per method, with keys "method", "mae", "pearson" and
        "spearman". The three metrics are NaN when the method returned None or a
        score vector whose length differs from the exact one.
    """
    # docs, scores=retrieve_documents(query, n_rag)
    target_response = compute_logprob(query, model, tokenizer, context=documents, max_new_tokens=50, response=True)
    harness = ShapleyExperimentHarness(
        items=documents,
        query=query,
        target_response=target_response,
        llm_caller=compute_logprob,
        verbose=False,
        utility_cache_file=None
    )
    exact_s = harness.compute_exact_shap()

    # Lambdas defer each computation until the evaluation loop below.
    methods_to_run = {
        "ContextCite": lambda: harness.compute_contextcite_weights(num_samples=64, lasso_alpha=0.0),
        "WSS": lambda: harness.compute_wss(num_samples=64, lasso_alpha=0.0),
        "TMC": lambda: harness.compute_tmc_shap(num_iterations=10*n_rag, performance_tolerance=0.001),
        "LOO": lambda: harness.compute_loo()
    }
    if beta_dist: # Only add if scipy was imported successfully
        methods_to_run["BetaShap (U)"] = lambda: harness.compute_beta_shap(num_iterations=10*n_rag, beta_a=0.5, beta_b=0.5)

    query_metrics = []
    for method_name, method_func in methods_to_run.items():
        approx_scores = method_func()

        mae_val, pearson_val, spearman_val = np.nan, np.nan, np.nan # Default to NaN

        if approx_scores is not None and len(approx_scores) == len(exact_s):
            mae_val = np.mean(np.abs(exact_s - approx_scores))
            # Correlation is undefined for a constant vector, so report 1.0 when
            # the two vectors agree numerically and 0.0 otherwise.
            if np.all(exact_s == exact_s[0]) or np.all(approx_scores == approx_scores[0]):
                is_close = np.allclose(exact_s, approx_scores)
                pearson_val = 1.0 if is_close else 0.0
                spearman_val = 1.0 if is_close else 0.0
            else:
                pearson_val, _ = pearsonr(exact_s, approx_scores)
                # Previously omitted, which left "spearman" as NaN for every method.
                spearman_val, _ = spearmanr(exact_s, approx_scores)

        query_metrics.append({
            "method": method_name,
            "mae": mae_val,
            "pearson": pearson_val,
            "spearman": spearman_val
        })
    return query_metrics

In [None]:
# Checkpoint used for every LLM call made through `model` / `tokenizer`.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Half precision on GPU, full precision on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=(torch.float32 if device != "cuda" else torch.float16),
)
model.eval()  # inference mode
# NOTE(review): `accelerator` is not created in this notebook; presumably it is
# provided by `from SHapRAG import*` - confirm.
model = accelerator.prepare(model)

# Last expression of the cell, so the per-method metrics for the first query are displayed.
evaluate_shap(queries[0])

In [None]:
# Resolve the inputs explicitly so the cell runs on a fresh kernel; `query` and
# `target_response` were previously read from leftover kernel state. The target
# response is obtained with the same call evaluate_shap() uses.
query = queries[0]
target_response = compute_logprob(query, model, tokenizer, context=documents, max_new_tokens=50, response=True)

print("--- Initializing Experiment Harness (Pre-computing all utilities) ---")
start_harness_time = time.time()
harness = ShapleyExperimentHarness(
    items=documents,
    query=query,
    target_response=target_response,
    llm_caller=compute_logprob  # the same caller evaluate_shap() passes to the harness
)
print(f"Harness initialized in {time.time() - start_harness_time:.2f}s. "
        f"{len(harness.all_true_utilities)} utilities computed.")

results_exp = {}
seed_exp = 42

print("\n--- Running Methods using Pre-computed Utilities ---")

# (result key, label used in the timing line, deferred computation), in run order.
timed_methods = [
    ("Exact", "Exact Shapley (from cache)",
     lambda: harness.compute_exact_shap()),
    ("ContextCite", "ContextCite (from cache)",
     lambda: harness.compute_contextcite_weights(num_samples=64, lasso_alpha=0.0, seed=seed_exp)),
    ("WSS", "WSS (from cache)",
     lambda: harness.compute_wss(num_samples=64, lasso_alpha=0.0, seed=seed_exp)),
    ("TMC", "TMC (from cache)",
     lambda: harness.compute_tmc_shap(num_iterations=harness.n_items * 15, performance_tolerance=0.001, seed=seed_exp)),
]
# NOTE(review): `beta_dist` is presumably exported by `from SHapRAG import*` - confirm.
if beta_dist:
    timed_methods.append(
        ("BetaShap (U)", "BetaShap (U, from cache)",
         lambda: harness.compute_beta_shap(num_iterations=harness.n_items * 15, beta_a=0.5, beta_b=0.5, seed=seed_exp))
    )
timed_methods.append(("LOO", "LOO (from cache)", lambda: harness.compute_loo()))

for result_key, timing_label, run_method in timed_methods:
    start_method_time = time.time()
    results_exp[result_key] = run_method()
    print(f"{timing_label} took {time.time() - start_method_time:.4f}s")

# Display Results
print("\n\n--- Experiment Harness: Comparison Table ---")
# Drop methods that produced no scores before building the table.
valid_results_exp = {k:v for k, v in results_exp.items() if v is not None}
if valid_results_exp:
    results_df_exp = pd.DataFrame(valid_results_exp, index=[f'Item {i}' for i in range(harness.n_items)])
    print(results_df_exp.round(4))
else:
    print("No valid results were computed by the harness.")

In [None]:
# Report Pearson/Spearman agreement between each approximate method in
# `valid_results_exp` and the exact Shapley scores.
if "Exact" in valid_results_exp:
    print("\n--- Experiment Harness: Metrics vs Exact Shapley ---")
    exact_scores = valid_results_exp["Exact"]
    for method, approx_scores in valid_results_exp.items():
        # The reference itself is not compared.
        if method == "Exact":
            continue

        # Nothing to correlate when scores are missing or misaligned.
        if approx_scores is None or len(approx_scores) != len(exact_scores):
            print(f"{method}: Could not compute metrics (scores missing or length mismatch).")
            continue

        if not (np.all(exact_scores == exact_scores[0]) or np.all(approx_scores == approx_scores[0])):
            try:
                pearson_corr_val, _ = pearsonr(exact_scores, approx_scores)
                spearman_corr_val, _ = spearmanr(exact_scores, approx_scores)
            except ValueError:  # e.g. if NaNs are present or other issues
                pearson_corr_val = spearman_corr_val = np.nan
        else:
            # A constant vector makes correlation ill-defined; score 1.0 when the
            # two vectors agree numerically and 0.0 otherwise.
            pearson_corr_val = spearman_corr_val = 1.0 if np.allclose(exact_scores, approx_scores) else 0.0

        print(f"{method}: Pearson={pearson_corr_val:.4f}, Spearman={spearman_corr_val:.4f}")

--- Experiment Harness: Metrics vs Exact Shapley ---
ContextCite: Pearson=0.9978, Spearman=1.0000
WSS: Pearson=0.9926, Spearman=1.0000
TMC: Pearson=0.9995, Spearman=1.0000
BetaShap (U): Pearson=0.9864, Spearman=0.9429
LOO: Pearson=0.9678, Spearman=0.9429