In [1]:
import pandas as pd
import numpy as np

In [9]:
# Load the retrieval results table from CSV into a DataFrame.
result = pd.read_csv(filepath_or_buffer="./retrieval_results_heuristic_k5.csv")

In [10]:
# Show the loaded frame as the cell output; the notebook display elides the
# middle rows of a long frame.
result

Unnamed: 0,question_id,question_text,document_idx,embedding_method
0,646,Is the smallest penguin species the Little Blu...,9531182119212042595,bert-base-uncased
1,646,Is the smallest penguin species the Little Blu...,952953967975979,multi-qa-mpnet-base-dot-v1
2,646,Is the smallest penguin species the Little Blu...,9502467247125962642,hkunlp-instructor-large
3,646,Is the smallest penguin species the Little Blu...,951952953979981,intfloat-e5-small-v2
4,873,What does a citizen use to propose changes to ...,256566576582141,bert-base-uncased
...,...,...,...,...
395,794,Are turtles pets?,26842688269226932698,intfloat-e5-small-v2
396,332,Is he buried in the Princeton Cemetery of the ...,52569970613962226,bert-base-uncased
397,332,Is he buried in the Princeton Cemetery of the ...,14752556719672999,multi-qa-mpnet-base-dot-v1
398,332,Is he buried in the Princeton Cemetery of the ...,11728956727182903,hkunlp-instructor-large


In [14]:
from itertools import combinations

# --- helper: turn "953,1182,1192,1204,2595" into a set of ints ---
def parse_doc_idx(s):
    """Parse a comma-separated list of document indices into a set of ints.

    Parameters
    ----------
    s : str, int, or missing value
        One cell of the ``document_idx`` column, e.g. ``"953,1182,2595"``.
        Non-string values are converted with ``str()`` before splitting.

    Returns
    -------
    set of int
        The distinct indices. Missing values (``None``, ``NaN``, ``pd.NA``)
        and blank strings give an empty set; empty tokens, such as the one
        left by a trailing comma, are ignored.

    Raises
    ------
    ValueError
        If a non-empty token is not an integer literal.
    """
    # An empty CSV cell is read back as NaN, and str(NaN) is "nan", which
    # int() rejects -- treat a missing cell as "no documents retrieved".
    if pd.api.types.is_scalar(s) and pd.isna(s):
        return set()
    tokens = (tok.strip() for tok in str(s).split(','))
    return {int(tok) for tok in tokens if tok}


# --- function to compute avg Jaccard distance inside one question group ---
def avg_jaccard_distance_per_question(group: pd.DataFrame) -> float:
    """Average pairwise Jaccard distance between the rows' document sets.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to one question. Only the ``document_idx`` column is
        read; each cell is parsed with ``parse_doc_idx``.

    Returns
    -------
    float
        Mean of ``1 - |A & B| / |A | B|`` over every unordered pair of rows,
        in ``[0, 1]``. Returns ``0.0`` when the group has fewer than two rows.
        A pair of two empty sets counts as distance ``0.0`` (identical sets).
    """
    sets = [parse_doc_idx(value) for value in group['document_idx']]

    # if fewer than 2 methods, variation is 0 by convention
    if len(sets) < 2:
        return 0.0

    dists = []
    for A, B in combinations(sets, 2):
        union = len(A | B)
        if union == 0:
            # Both sets are empty, hence identical: distance 0, not 1.
            dists.append(0.0)
        else:
            dists.append(1.0 - len(A & B) / union)  # Jaccard distance

    return sum(dists) / len(dists)


# --- 1) average variation PER QUESTION ---
# Select the data column explicitly after groupby: pandas has deprecated
# handing the grouping column ('question_id') to the applied function, and the
# function only reads 'document_idx' anyway. The result is still a Series
# indexed by question_id.
per_question_variation = (
    result.groupby('question_id')[['document_idx']]
      .apply(avg_jaccard_distance_per_question)
      .rename('avg_docidx_variation')
)

# --- 2) overall average variation across all questions (optional) ---
# Unweighted mean of the per-question averages (each question counts once).
overall_avg_variation = per_question_variation.mean()
print("Overall avg variation across questions:", overall_avg_variation)


Overall avg variation across questions: 0.8935052910052909


  .apply(avg_jaccard_distance_per_question)


In [15]:
# Load the RAG evaluation summary table from CSV.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest
# of the session; it is kept because the aggregation cell refers to it.
eval = pd.read_csv(filepath_or_buffer="result/rag_evaluation_summary.csv")

In [11]:
# Put the retrieval rows and the evaluation rows side by side (axis=1 pairs
# rows by index label), then report mean and std of each metric per embedding
# method.
# NOTE(review): assumes both files list the same runs in the same order -- confirm.
(
    pd.concat([result, eval], axis=1)
      .groupby('embedding_method')
      .agg({metric: ['mean', 'std']
            for metric in ('f1_score', 'cosine_similarity', 'bertscore_f1')})
      .reset_index()
)

Unnamed: 0_level_0,embedding_method,f1_score,f1_score,cosine_similarity,cosine_similarity,bertscore_f1,bertscore_f1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
0,bert-base-uncased,0.217856,0.380715,0.653507,0.28832,0.73982,0.120993
1,hkunlp-instructor-large,0.324301,0.439535,0.674187,0.293843,0.7715,0.133702
2,intfloat-e5-small-v2,0.277485,0.413854,0.659634,0.292439,0.749955,0.120672
3,multi-qa-mpnet-base-dot-v1,0.332001,0.437982,0.694386,0.287651,0.770692,0.12889
