In [1]:
import json
from collections import Counter

def analyze_results(csv: str) -> None:
    """Print retrieval and entailment metrics for one results file.

    Parameters
    ----------
    csv : str
        Path to the results file. Despite the parameter name, the file is
        parsed as JSON (``json.load``) and must contain a list of per-claim
        records. Each record is read through the keys ``retrieved_passages``
        (a list of dicts with ``passage`` and ``entailment`` keys),
        ``target_passage``, ``gold_entailment`` and ``target_entailment``.

    Returns
    -------
    None
        The report is written to standard output only.

    Notes
    -----
    * The claim-level prediction is "entailment" when at least one retrieved
      passage is labelled "entailment", otherwise "contradiction".
    * For the accuracy figures the gold label is collapsed to the same two
      classes: anything other than "entailment" counts as "contradiction".
      The "Count of Each Gold Entailment Type" section reports the labels
      exactly as stored in the file.
    * Every percentage uses the total number of records as its denominator,
      including "True Accuracy", which only counts records whose target
      passage was retrieved.
    * An empty results list prints a short notice and returns instead of
      raising ``ZeroDivisionError``.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(csv, 'r', encoding='utf-8') as f:
        results = json.load(f)

    total_claims = len(results)
    if total_claims == 0:
        # Every metric below divides by total_claims.
        print(f"No results found in {csv}; nothing to analyze.")
        return

    # Initialize counters
    correct_predictions = 0
    true_correct = 0
    target_in_top_10 = 0
    correct_gold_entailment = 0
    gold_entailment_differs = 0
    entailment_counter = Counter()
    gold_entailment_counter = Counter()
    retrieved_entailment_counter = Counter()

    # Single pass over the records: update the label tallies and the metrics.
    for result in results:
        retrieved_passages = result['retrieved_passages']
        target_passage = result['target_passage']
        raw_gold_entailment = result['gold_entailment']
        target_entailment = result['target_entailment']

        # Label tallies use the values exactly as stored in the file.
        entailment_counter[target_entailment] += 1
        gold_entailment_counter[raw_gold_entailment] += 1
        retrieved_entailment_counter.update(p['entailment'] for p in retrieved_passages)

        # One "entailment" passage is enough to predict entailment for the claim.
        predicted_entailment = (
            "entailment"
            if any(p['entailment'] == "entailment" for p in retrieved_passages)
            else "contradiction"
        )
        # Collapse the gold label to two classes so it is comparable with the prediction.
        gold_entailment = "entailment" if raw_gold_entailment == "entailment" else "contradiction"

        # Calculate accuracy for claim entailment
        if predicted_entailment == target_entailment:
            correct_predictions += 1

        # Check if target passage is among the retrieved passages
        if any(p['passage'] == target_passage for p in retrieved_passages):
            target_in_top_10 += 1
            # "True" correct: target passage retrieved AND gold, prediction and target all agree.
            if (gold_entailment == predicted_entailment) and (predicted_entailment == target_entailment):
                true_correct += 1

        # Calculate accuracy for gold entailment
        if gold_entailment == target_entailment:
            correct_gold_entailment += 1

        # Check if gold entailment differs from the claim results
        if gold_entailment != predicted_entailment:
            gold_entailment_differs += 1

    # Calculate the overall rates (fractions of all records)
    accuracy = correct_predictions / total_claims
    true_accuracy = true_correct / total_claims
    percentage_target_in_top_10 = target_in_top_10 / total_claims
    accuracy_gold_entailment = correct_gold_entailment / total_claims
    percentage_gold_entailment_differs = gold_entailment_differs / total_claims

    # Print the results
    print(f"Overall Accuracy: {accuracy * 100:.2f}%")
    print(f"True Accuracy (correct because gold passage is in retrieved and used to get the correct entailment): {true_accuracy * 100:.2f}%")
    print(f"Percentage of Time Target Passage is in Top 10: {percentage_target_in_top_10 * 100:.2f}%")
    print(f"Accuracy for Gold Entailment: {accuracy_gold_entailment * 100:.2f}%")
    print(f"Percentage of Time Gold Entailment Differs from Claim Results: {percentage_gold_entailment_differs * 100:.2f}% \n")

    print("Count of Each Target Entailment:")
    for entailment, count in entailment_counter.items():
        print(f"{entailment}: {count}")
    print("\n")

    # Print the count of each gold entailment type
    print("Count of Each Gold Entailment Type:")
    for entailment, count in gold_entailment_counter.items():
        print(f"{entailment}: {count}")

    # Print the per-record average count of each retrieved passage entailment type
    print("\nCount of Each Retrieved Passage Entailment Type (average per top 10 claims):")
    for entailment, count in retrieved_entailment_counter.items():
        print(f"{entailment}: {count/total_claims}")

In [30]:
# Print the metric report for results_dpr_bert.json
# (file name suggests DPR retrieval scored with a BERT entailment model — confirm).
analyze_results("results_dpr_bert.json")

Overall Accuracy: 23.06%
True Accuracy (correct because gold passage is in retrieved and used to get the correct entailment): 2.49%
Percentage of Time Target Passage is in Top 10: 17.46%
Accuracy for Gold Entailment: 22.85%
Percentage of Time Gold Entailment Differs from Claim Results: 1.60% 

Count of Each Target Entailment:
entailment: 8941
contradiction: 2593


Count of Each Gold Entailment Type:
neutral: 9180
contradiction: 2303
entailment: 51

Count of Each Retrieved Passage Entailment Type (average per top 10 claims):
neutral: 7.797208253858159
contradiction: 2.187619212762268
entailment: 0.015172533379573435


In [31]:
# Print the metric report for results_dpr_bart.json
# (file name suggests DPR retrieval scored with a BART entailment model — confirm).
analyze_results("results_dpr_bart.json")

Overall Accuracy: 43.32%
True Accuracy (correct because gold passage is in retrieved and used to get the correct entailment): 13.24%
Percentage of Time Target Passage is in Top 10: 17.46%
Accuracy for Gold Entailment: 72.78%
Percentage of Time Gold Entailment Differs from Claim Results: 46.51% 

Count of Each Target Entailment:
entailment: 8941
contradiction: 2593


Count of Each Gold Entailment Type:
entailment: 7358
neutral: 2486
contradiction: 1690

Count of Each Retrieved Passage Entailment Type (average per top 10 claims):
contradiction: 3.2231662909658403
neutral: 6.249523148950928
entailment: 0.5273105600832322


In [32]:
# Print the metric report for results_contriever_bert.json
# (file name suggests Contriever retrieval scored with a BERT entailment model — confirm).
analyze_results("results_contriever_bert.json")

Overall Accuracy: 23.24%
True Accuracy (correct because gold passage is in retrieved and used to get the correct entailment): 6.57%
Percentage of Time Target Passage is in Top 10: 41.61%
Accuracy for Gold Entailment: 22.85%
Percentage of Time Gold Entailment Differs from Claim Results: 2.14% 

Count of Each Target Entailment:
entailment: 8941
contradiction: 2593


Count of Each Gold Entailment Type:
neutral: 9180
contradiction: 2303
entailment: 51

Count of Each Retrieved Passage Entailment Type (average per top 10 claims):
neutral: 7.519420842725854
contradiction: 2.4594243107334837
entailment: 0.021154846540662388


In [33]:
# Print the metric report for results_contriever_bart.json
# (file name suggests Contriever retrieval scored with a BART entailment model — confirm).
analyze_results("results_contriever_bart.json")

Overall Accuracy: 62.68%
True Accuracy (correct because gold passage is in retrieved and used to get the correct entailment): 30.87%
Percentage of Time Target Passage is in Top 10: 41.61%
Accuracy for Gold Entailment: 72.78%
Percentage of Time Gold Entailment Differs from Claim Results: 31.53% 

Count of Each Target Entailment:
entailment: 8941
contradiction: 2593


Count of Each Gold Entailment Type:
entailment: 7358
neutral: 2486
contradiction: 1690

Count of Each Retrieved Passage Entailment Type (average per top 10 claims):
entailment: 1.224206693254725
neutral: 4.926651638633605
contradiction: 3.84914166811167


In [34]:
# Print the metric report for results_bm25_bert.json
# (file name suggests BM25 retrieval scored with a BERT entailment model — confirm).
analyze_results("results_bm25_bert.json")

Overall Accuracy: 23.63%
True Accuracy (correct because gold passage is in retrieved and used to get the correct entailment): 6.88%
Percentage of Time Target Passage is in Top 10: 45.10%
Accuracy for Gold Entailment: 22.85%
Percentage of Time Gold Entailment Differs from Claim Results: 2.42% 

Count of Each Target Entailment:
entailment: 8941
contradiction: 2593


Count of Each Gold Entailment Type:
neutral: 9180
contradiction: 2303
entailment: 51

Count of Each Retrieved Passage Entailment Type (average per top 10 claims):
neutral: 7.859285590428299
contradiction: 2.115918155019941
entailment: 0.024796254551760013


In [35]:
# Print the metric report for results_bm25_bart.json
# (file name suggests BM25 retrieval scored with a BART entailment model — confirm).
analyze_results("results_bm25_bart.json")

Overall Accuracy: 66.25%
True Accuracy (correct because gold passage is in retrieved and used to get the correct entailment): 34.09%
Percentage of Time Target Passage is in Top 10: 45.10%
Accuracy for Gold Entailment: 72.78%
Percentage of Time Gold Entailment Differs from Claim Results: 29.77% 

Count of Each Target Entailment:
entailment: 8941
contradiction: 2593


Count of Each Gold Entailment Type:
entailment: 7358
neutral: 2486
contradiction: 1690

Count of Each Retrieved Passage Entailment Type (average per top 10 claims):
entailment: 1.289491936882261
neutral: 5.025229755505462
contradiction: 3.685278307612277


In [4]:
# Print the metric report for results_contriever_fine_tuned_bart.json
# (file name suggests a fine-tuned Contriever retriever scored with a BART entailment model — confirm).
analyze_results("results_contriever_fine_tuned_bart.json")

Overall Accuracy: 73.94%
True Accuracy (correct because gold passage is in retrieved and used to get the correct entailment): 64.15%
Percentage of Time Target Passage is in Top 10: 94.24%
Accuracy for Gold Entailment: 72.78%
Percentage of Time Gold Entailment Differs from Claim Results: 13.24% 

Count of Each Target Entailment:
entailment: 8941
contradiction: 2593


Count of Each Gold Entailment Type:
entailment: 7358
neutral: 2486
contradiction: 1690

Count of Each Retrieved Passage Entailment Type (average per top 10 claims):
entailment: 1.146263221779088
contradiction: 2.882434541355991
neutral: 5.971302236864921


In [1]:
import pandas as pd

# Summary table written to retrieval_models_comparison.csv.
# The figures are hard-coded; they match the BART-run reports printed by the
# analyze_results cells above, with the per-passage averages rounded to four
# decimals. One tuple per retrieval model, laid out in `column_names` order.
column_names = [
    "Model",
    "Overall Accuracy (%)",
    "True Accuracy (%)",
    "Percentage of Time Target Passage is in Top 10 (%)",
    "Accuracy for Gold Entailment (%)",
    "Percentage of Time Gold Entailment Differs from Claim Results (%)",
    "Count of Each Target Entailment (Entailment)",
    "Count of Each Target Entailment (Contradiction)",
    "Count of Each Gold Entailment Type (Entailment)",
    "Count of Each Gold Entailment Type (Neutral)",
    "Count of Each Gold Entailment Type (Contradiction)",
    "Count of Each Retrieved Passage Entailment Type (Average per top 10 claims) (Entailment)",
    "Count of Each Retrieved Passage Entailment Type (Average per top 10 claims) (Neutral)",
    "Count of Each Retrieved Passage Entailment Type (Average per top 10 claims) (Contradiction)",
]

model_rows = [
    ("DPR", 43.32, 13.24, 17.46, 72.78, 46.51,
     8941, 2593, 7358, 2486, 1690, 0.5273, 6.2495, 3.2232),
    ("Contriever", 62.68, 30.87, 41.61, 72.78, 31.53,
     8941, 2593, 7358, 2486, 1690, 1.2242, 4.9267, 3.8491),
    ("BM25", 66.25, 34.09, 45.10, 72.78, 29.77,
     8941, 2593, 7358, 2486, 1690, 1.2895, 5.0252, 3.6853),
    ("Contriever Fine-Tuned", 73.94, 64.15, 94.24, 72.78, 13.24,
     8941, 2593, 7358, 2486, 1690, 1.1463, 5.9713, 2.8824),
]

# Transpose the per-model rows into the column-oriented mapping pandas expects.
data = {
    column: list(values)
    for column, values in zip(column_names, zip(*model_rows))
}

# Build the table and persist it without the positional index column.
df = pd.DataFrame(data)
df.to_csv('retrieval_models_comparison.csv', index=False)
