In [1]:
import json
import glob
from tqdm import tqdm

from collections import defaultdict, Counter
from sklearn.metrics import cohen_kappa_score
import copy
import tiktoken

In [2]:
# Which judgment pool to analyse; the loading cell below handles "passage" and "table".
modality = "passage"

In [5]:
# Translates the relevance label text stored in the human assessment logs
# into a graded relevance score (0 = lowest, 3 = highest).
rel_map = {
    "has nothing to do with the query": 0,
    "related to the query but does not answer it": 1,
    "has some answer for the query, but the answer may be a bit unclear, or hidden amongst extraneous information": 2,
    "dedicated to the query and contains the exact answer": 3,
}

def load_human_qrels(path):
    """
    Load one human rater's assessment log and normalise it into qrel entries.

    The rater id is whatever follows the last occurrence of "log" in `path`,
    with ".json" removed; the returned model name is "rater_<id>".

    Args:
        path: Path to a JSON file containing a list of assessments. Each
            assessment must provide 'relevance' (a label text that is a key of
            `rel_map`), 'id' (convertible to int) and 'topic'.

    Returns:
        A tuple (model_name, qrels) where qrels is a plain dict mapping each
        topic to a list of {"model", "judgment", "docid"} dicts.

    Raises:
        KeyError: if a relevance label is not a key of `rel_map`.
    """
    model_id = path.split("log")[-1].replace(".json", "")
    model_name = f"rater_{model_id}"

    # Use a context manager so the file handle is closed right after parsing.
    with open(path) as f:
        assessments = json.load(f)

    normed_qrels = defaultdict(list)
    for assessment in assessments:
        entry = {
            "model": model_name,
            "judgment": rel_map[assessment['relevance']],
            "docid": int(assessment['id']),
        }
        normed_qrels[assessment['topic']].append(entry)
    # Return a plain dict so that looking up an unknown topic raises instead of
    # silently creating an empty list.
    return model_name, dict(normed_qrels)

def load_machine_qrels(path):
    """
    Load a machine-generated qrel file.

    The model name is whatever follows the last occurrence of "pool_" in
    `path`, with ".json" removed.

    Args:
        path: Path to a JSON qrel file.

    Returns:
        A tuple (model_name, qrels) where qrels is the parsed JSON content,
        returned unchanged.
    """
    model_name = path.split("pool_")[-1].replace(".json", "")
    # Use a context manager so the file handle is closed right after parsing.
    with open(path) as f:
        qrels = json.load(f)
    return model_name, qrels
    
    

def make_qrels_comparable(qrels_a, qrels_b):
    """
    Restrict two qrel dictionaries to the (topic, docid) pairs they share.

    Args:
        qrels_a: Dict mapping topic -> list of entries, each with a 'docid' key.
        qrels_b: Dict with the same layout as `qrels_a`.

    Returns:
        A tuple (new_qrels_a, new_qrels_b). Both contain only the topics that
        occur in both inputs (in the order of `qrels_a`), and for each topic
        only the entries whose docid was judged in both inputs. Entry order
        within a topic is preserved. A shared topic without any shared docid
        maps to an empty list.
    """
    new_qrels_a = {}
    new_qrels_b = {}
    for topic, entries_a in qrels_a.items():
        # Topics judged on one side only cannot be compared; skipping them
        # avoids a KeyError on the lookup into qrels_b.
        if topic not in qrels_b:
            continue
        entries_b = qrels_b[topic]
        # A set makes the membership tests below O(1) per entry.
        shared_docids = {ele['docid'] for ele in entries_a} & {ele['docid'] for ele in entries_b}
        new_qrels_a[topic] = [ele for ele in entries_a if ele['docid'] in shared_docids]
        new_qrels_b[topic] = [ele for ele in entries_b if ele['docid'] in shared_docids]
    return new_qrels_a, new_qrels_b

def check_res_status(qrel_data, verbose=False, expected_total=10000):
    """
    Print how many assessments have a 'result_status' of 1 versus anything else.

    Args:
        qrel_data: Dict mapping query id -> list of assessment dicts, each with
            a 'result_status' key.
        verbose: If True, print every assessment whose status is not 1.
        expected_total: Number of assessments a complete file is expected to
            hold; used as the denominator of the printed coverage. Defaults to
            10000, the value that was previously hard-coded. Must be non-zero.

    Returns:
        None. The counts and the coverage are printed.
    """
    sanity_cnt = 0
    bad_cnt = 0
    for query_id, query_data in tqdm(qrel_data.items()):
        for assessment in query_data:
            if assessment['result_status'] == 1:
                sanity_cnt += 1
            else:
                if verbose:
                    print(assessment)
                bad_cnt += 1
    print(f"Sanity count: {sanity_cnt}, Bad count: {bad_cnt}")
    # Coverage counts every assessment present, whatever its status.
    print(f"Coverage: {(sanity_cnt + bad_cnt) / expected_total} ")

def eval_status(qrel_paths):
    """
    Run `check_res_status` on every qrel file in `qrel_paths`.

    For each file the path, the status report and a separator line are
    printed. A file that cannot be read, parsed or checked is reported with an
    error message and skipped, so one bad file does not stop the sweep.

    Args:
        qrel_paths: Iterable of paths to JSON qrel files.
    """
    for qrel_path in qrel_paths:
        try:
            # Context manager closes the handle even when parsing fails.
            with open(qrel_path) as f:
                qrel_data = json.load(f)
            print(qrel_path)
            check_res_status(qrel_data)
            print("-"*100)
        except Exception as e:
            # Deliberately broad: this is a best-effort sweep over many files.
            print(f"Error loading {qrel_path}: {e}")

def flatten_ratings(ratings_dict):
    """
    Turn a per-topic ratings dictionary into a flat lookup table.

    Input layout:

        {
          'topic1': [
             {'model': 'rater_X', 'judgment': int, 'docid': int},
             ...
          ],
          'topic2': [...],
          ...
        }

    Output layout: {(topic, docid): judgment}.

    The topic is part of the key, so a docid that shows up under several
    topics yields one item per topic. If a docid is listed more than once
    within the same topic, the last listed judgment is kept.
    """
    return {
        (topic, rating['docid']): rating['judgment']
        for topic, ratings in ratings_dict.items()
        for rating in ratings
    }

def compute_cohens_kappa(dict1, dict2):
    """
    Compute Cohen's Kappa between two nested rating dictionaries.

    Both inputs are flattened with `flatten_ratings` and aligned on the
    composite key (topic, docid); only pairs present in both are compared.

    Args:
        dict1: Ratings of the first rater (topic -> list of rating entries).
        dict2: Ratings of the second rater, same layout.

    Returns:
        The value returned by `cohen_kappa_score` for the shared pairs, or the
        integer 0 when the two raters share no (topic, docid) pair. Note that
        0 therefore doubles as a "nothing to compare" marker.
    """
    # Flatten the dictionaries using the composite key (topic, docid).
    ratings1 = flatten_ratings(dict1)
    ratings2 = flatten_ratings(dict2)

    # Only (topic, docid) pairs judged by both raters can be compared.
    common_keys = ratings1.keys() & ratings2.keys()
    if not common_keys:
        # No overlap: return 0 rather than raising so callers can skip the pair.
        return 0

    # Parallel lists of judgments; both iterate the same set, so they line up.
    judgments1 = [ratings1[key] for key in common_keys]
    judgments2 = [ratings2[key] for key in common_keys]

    return cohen_kappa_score(judgments1, judgments2)

In [6]:
# Locate the human and machine judgment files for the selected modality.
# Paths are sorted so the load order does not depend on the file system.
if modality == "passage":
    human_qrels_paths = sorted(glob.glob("/workspace/src/data/human_qrels/*_chunk*"))
    machine_qrels_paths = sorted(glob.glob("/workspace/src/data/qrels_passage_pool*"))
elif modality == "table":
    human_qrels_paths = sorted(glob.glob("/workspace/src/data/human_qrels/*_table*"))
    machine_qrels_paths = sorted(glob.glob("/workspace/src/data/qrels_table_pool*"))
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"Unsupported modality: {modality!r} (expected 'passage' or 'table')")


# rater name -> normalised human qrels
human_qrels = {}
for path in human_qrels_paths:
    name, qrels = load_human_qrels(path)
    human_qrels[name] = qrels

# model name -> machine qrels as stored on disk
machine_qrels = {}
for path in machine_qrels_paths:
    name, qrels = load_machine_qrels(path)
    machine_qrels[name] = qrels


In [7]:
# Show which machine judges were loaded.
machine_qrels.keys()

dict_keys(['meta-llama_Llama-3.2-3B-Instruct', 'Qwen_Qwen2.5-14B-Instruct', 'mistralai_Mistral-7B-Instruct-v0.3', 'gpt-4o-2024-11-20', 'majority_vote', 'gpt-4o-mini-2024-07-18', 'o3-mini-2025-01-31', 'microsoft_phi-4', 'tiiuae_Falcon3-7B-Instruct', 'google_gemma-2-9b-it', 'mistralai_Mistral-Small-Instruct-2409'])

In [26]:
def estimate_token_cost(judgments, model_name, input_cost=0.15, output_cost=0.6, reasoning_const=100):
    """
    Estimate and print the API cost of producing a set of judgments.

    All prompts and all predictions are tokenised with tiktoken's
    "o200k_base" encoding and priced with the given per-million-token rates.

    Args:
        judgments: Dict mapping topic -> list of items, each with the string
            fields 'prompt' and 'prediction'.
        model_name: Name used in the printed report. If it contains "o3",
            `reasoning_const` extra tokens per judgment are added to the
            input token count.
        input_cost: Price per one million input tokens.
        output_cost: Price per one million output tokens.
        reasoning_const: Assumed number of extra tokens per judgment for "o3"
            models.

    Returns:
        None. The input token count, the input/output/total cost and the cost
        per judgment are printed.
    """
    encoding = tiktoken.get_encoding("o200k_base")

    prompts = []
    predictions = []
    for item_list in judgments.values():
        for item in item_list:
            prompts.append(item['prompt'])
            predictions.append(item['prediction'])
    judgments_count = len(prompts)

    # Every entry is followed by a newline; joining once avoids repeated
    # string concatenation inside the loop.
    input_text = "".join(prompt + "\n" for prompt in prompts)
    output_text = "".join(prediction + "\n" for prediction in predictions)

    input_token_count = len(encoding.encode(input_text))
    output_token_count = len(encoding.encode(output_text))
    print(input_token_count)

    billed_input_tokens = input_token_count
    if "o3" in model_name:
        # NOTE(review): the allowance is charged at the input rate, as the
        # original code intended; confirm this matches the provider's billing.
        billed_input_tokens += reasoning_const * judgments_count

    # Keep the rates (parameters) separate from the computed totals; reusing
    # one name for both made the "o3" branch multiply by a dollar amount.
    total_input_cost = billed_input_tokens * input_cost / 1000**2
    total_output_cost = output_token_count * output_cost / 1000**2
    total_cost = total_input_cost + total_output_cost

    print(f"Input cost: {total_input_cost}, Output cost: {total_output_cost}")
    print(f"Total cost for {model_name}: {total_cost}$")
    if judgments_count:
        print(f"Cost per judgment: {round(total_cost / judgments_count, 6)}$")
    else:
        # Nothing was judged, so a per-judgment figure is undefined.
        print("Cost per judgment: n/a (no judgments)")

In [27]:
import pandas as pd

def print_inter_rater_analysis_df(modality, human_qrels, machine_qrels, filter_machine=None):
    """
    Build a table of pairwise Cohen's Kappa values between two groups of raters.

    Args:
        modality: Label used only in the printed title.
        human_qrels: Dict mapping rater name -> ratings (topic -> entries).
        machine_qrels: Dict mapping system name -> ratings, same layout. The
            same dict may be passed for both groups to compare a group with
            itself; mirrored pairs are then reported only once.
        filter_machine: Optional collection of `machine_qrels` names to leave
            out. Defaults to excluding nothing.

    Returns:
        A DataFrame with the columns 'Human Rater', 'Machine System' and
        "Cohen's Kappa", the latter formatted as a string with four decimals.
        Pairs whose kappa is exactly 0 (this includes pairs without overlap)
        or exactly 1 are omitted. The frame has these columns even when no
        pair qualifies.
    """
    if filter_machine is None:
        filter_machine = []

    columns = ['Human Rater', 'Machine System', "Cohen's Kappa"]
    rows = []
    # (human, machine) pairs already reported, used to skip the mirrored pair.
    reported_pairs = set()

    # Loop over each human rater and machine system in sorted order.
    for human in sorted(human_qrels.keys()):
        for machine in sorted(machine_qrels.keys()):
            # Decide whether to skip before paying for the kappa computation.
            if machine in filter_machine or (machine, human) in reported_pairs:
                continue

            kappa = compute_cohens_kappa(human_qrels[human], machine_qrels[machine])
            if kappa != 0 and kappa != 1:
                rows.append({
                    'Human Rater': human,
                    'Machine System': machine,
                    "Cohen's Kappa": kappa
                })
                reported_pairs.add((human, machine))

    # Explicit columns keep the kappa column present even when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    # Format the "Cohen's Kappa" column to display 4 decimal places.
    df["Cohen's Kappa"] = df["Cohen's Kappa"].map(lambda x: f'{x:.4f}')

    # Print the title; the DataFrame itself is returned for display.
    print(f"Inter-Rater Analysis of {modality} Qrels:\n")
    return df

In [28]:
# Let pandas display up to 500 rows so the full pairwise table is visible.
pd.set_option('display.max_rows', 500)
# Machine judges to exclude from the inter-rater analysis.
filter_machine = ["meta-llama_Llama-3.2-3B-Instruct"]


In [None]:
# Show which human raters were loaded.
human_qrels.keys()

In [None]:
# Agreement of every human rater with every machine judge. The exclusion list
# comes from `filter_machine` so it is defined in one place only.
df = print_inter_rater_analysis_df(modality, human_qrels, machine_qrels, filter_machine=filter_machine)
# The helper returns kappa as formatted strings; convert back to numbers for averaging.
df["Cohen's Kappa"] = pd.to_numeric(df["Cohen's Kappa"], errors="coerce")

# Group by "Machine System" and compute the mean Cohen's Kappa (ignoring NaN values)
avg_kappa = df.groupby("Machine System")["Cohen's Kappa"].mean().sort_values(ascending=False)
avg_kappa

In [None]:
# Human-vs-human agreement: the human ratings are passed for both groups.
print_inter_rater_analysis_df(modality, human_qrels, human_qrels)

In [15]:
#generate majority vote qrels
# Qrel files for the selected modality whose path mentions "gpt" or "o3",
# in sorted order.
gpt_qrels_passage_paths = sorted(
    candidate
    for candidate in glob.glob(f"/workspace/src/data/qrels_{modality}*")
    if "gpt" in candidate or "o3" in candidate
)

In [16]:
# Load each selected file and keep only the qrels (the model name is dropped).
gpt_qrels_passage= [load_machine_qrels(path)[1] for path in gpt_qrels_passage_paths]

In [None]:
# Show which files take part in the vote.
gpt_qrels_passage_paths

In [18]:
def majority_vote(lst):
    """
    Return the most frequent number in `lst`.

    When several numbers share the highest frequency, the one nearest to the
    arithmetic mean of the whole list wins; if that is still ambiguous, the
    tied number that appears first in `lst` is returned.
    """
    tally = Counter(lst)
    top_count = max(tally.values())
    # Counter keeps first-seen order, so `leaders` is ordered by first appearance.
    leaders = [value for value in tally if tally[value] == top_count]

    if len(leaders) > 1:
        # Tie: prefer the leader closest to the mean of all votes.
        mean = sum(lst) / len(lst)
        return min(leaders, key=lambda value: abs(value - mean))
    return leaders[0]

In [19]:
def generate_majority_vote_qrels(gpt_qrels_table):
    """
    Merge several qrel runs into one run holding the majority judgment.

    Args:
        gpt_qrels_table: List of qrel dicts (topic -> list of entries with a
            'judgment' field). The first run defines the topics and the number
            of entries per topic. Entries are matched across runs purely by
            their position in the topic's list; nothing here checks that the
            runs list the same documents in the same order.

    Returns:
        A deep copy of the first run in which every entry has 'judgment' set
        to the result of `majority_vote` over all runs, 'model' set to
        "majority_vote", 'prediction' set to "-" and 'result_status' set to 1.
        The input runs are not modified.
    """
    reference_run = gpt_qrels_table[0]

    # Phase 1: per topic and list position, gather one vote from every run.
    majority_results = {}
    for topic, reference_entries in reference_run.items():
        winners = []
        for position in range(len(reference_entries)):
            ballots = [run[topic][position]['judgment'] for run in gpt_qrels_table]
            winners.append(majority_vote(ballots))
        majority_results[topic] = winners

    # Phase 2: write the winners into a copy so the first run stays untouched.
    dummy_qrel = copy.deepcopy(reference_run)
    for topic, winners in majority_results.items():
        for position, winner in enumerate(winners):
            entry = dummy_qrel[topic][position]
            entry['judgment'] = winner
            entry['model'] = "majority_vote"
            entry['prediction'] = "-"
            entry['result_status'] = 1

    return dummy_qrel



In [20]:
# Combine the loaded runs into a single majority-vote run.
majority_vote_qrel = generate_majority_vote_qrels(gpt_qrels_passage)

In [21]:
#safe majority vote qrels to json
with open(f"/workspace/src/data/qrels_{modality}_pool_majority_vote.json", "w") as f:
    json.dump(majority_vote_qrel, f)