In [None]:
import jsonlines

def jsonl_reader(input_file):
    """Read a JSON Lines file fully into memory.

    Args:
        input_file: Path handed to ``jsonlines.open``.

    Returns:
        A list holding every object yielded by the reader, in file order.
    """
    with jsonlines.open(input_file) as reader:
        return list(reader)

In [None]:
# write a function to count number of sentences and words in a paragraph

def count_sentences_words(paragraph):
    """Count the sentences and words of a paragraph.

    A sentence is any non-blank fragment obtained by splitting the text on '.';
    a word is any whitespace-separated token inside such a fragment.

    Args:
        paragraph: The text to analyse (a string).

    Returns:
        A dict ``{'num_sentences': int, 'num_words': int}``.
    """
    # A paragraph ending in '.' (or an empty paragraph) yields a trailing empty
    # fragment from split('.'); dropping blank fragments keeps it from being
    # counted as an extra sentence.
    sentences = [fragment for fragment in paragraph.split('.') if fragment.strip()]
    num_words = sum(len(sentence.split()) for sentence in sentences)
    return {'num_sentences': len(sentences), 'num_words': num_words}

# `raw_data/hotpotqa/wikpedia-paragraphs`

In [None]:
import bz2

# Decompress one shard of the raw paragraph dump as text, then look at the
# type of the result and a 1000-character window of it.
with bz2.open("raw_data/hotpotqa/wikpedia-paragraphs/AA/wiki_00.bz2", mode="rt") as bzinput:
    data = bzinput.read()

print(type(data))
print(data[1000:2000])



# `processed_data/hotpotqa/train.jsonl`

In [None]:
# Load every record of the processed training file into memory.
# `data` is rebound here: it no longer refers to the raw wiki text read above.
data = jsonl_reader('processed_data/hotpotqa/train.jsonl')

In [None]:
# Inspect the container type and the field names of the first record.
print(type(data))
print(data[0].keys())

In [None]:
# Show the 'level' and 'type' fields of the first record.
print(data[0]['level'])
print(data[0]['type'])

In [None]:
# Display the 'answers_objects' field of the first record.
data[0]['answers_objects']

In [None]:
# Report how many contexts the first record carries, then display the
# contexts at indices 2, 3 and 4.
print(f"Number of contexts: {len(data[0]['contexts'])}")
data[0]['contexts'][2:5]

In [None]:
# Sentence/word counts for the paragraph text of the first record's context at index 2.
print(count_sentences_words(data[0]['contexts'][2]['paragraph_text']))

In [None]:
import matplotlib.pyplot as plt

# Word-count statistics over every context paragraph of every sample in `data`.
num_words_list = []  # one entry per context paragraph, used for the histogram
max_num_words = 0
max_idx = 0  # index into `data` of the sample holding the longest paragraph
min_num_words = 1000000
min_idx = 0  # index into `data` of the sample holding the shortest paragraph
num_samples = len(data)

for sample_idx, sample in enumerate(data):
    for context in sample['contexts']:
        num_words = count_sentences_words(context['paragraph_text'])['num_words']
        num_words_list.append(num_words)
        # Record where each extreme occurs; previously max_idx / min_idx were
        # printed but never updated, so they always showed 0.
        # Strict comparisons keep the first sample that reaches an extreme.
        if num_words > max_num_words:
            max_num_words = num_words
            max_idx = sample_idx
        if num_words < min_num_words:
            min_num_words = num_words
            min_idx = sample_idx

print(f"Number of samples: {num_samples}")
print(f"Max number of words: {max_num_words}")
print(f"max_idx: {max_idx}")
print(f"Min number of words: {min_num_words}")
print(f"min_idx: {min_idx}")

In [None]:
# Histogram (100 bins) of the values in num_words_list.
plt.hist(num_words_list, bins=100)
plt.show()


# `annotated_only_train.jsonl`

In [None]:
# Load the annotated-only training file.
# `data` is rebound here: the cells below inspect these records, not train.jsonl.
data = jsonl_reader('processed_data/hotpotqa/annotated_only_train.jsonl')

In [None]:
# Display the container type returned by jsonl_reader.
type(data)


In [None]:
# Display the field names of the first record.
data[0].keys()

In [None]:
# Display the 'question_text' field of the first record.
data[0]['question_text']

In [None]:
# Display the 'type' field of the first record.
data[0]['type']

In [None]:
# Display the 'answers_objects' field of the first record.
data[0]['answers_objects']

In [None]:
# Report how many contexts the first record carries, then display the first three.
print(f"Number of contexts: {len(data[0]['contexts'])}")
data[0]['contexts'][:3]

In [None]:
# Display the 'reasoning_steps' field of the first record.
data[0]['reasoning_steps']

# Experiment Results

In [None]:
# for each gold paragraph, check whether it appears in the list of predicted paragraphs
# if yes, record the index of the gold paragraph in the list of predicted paragraphs
# if no, record the index as -1
# then, plot the histogram of the indices

from matplotlib import pyplot as plt
import json
import csv
import os

def get_idx_count(predicted_file_name, gold_answers_file_name):
    """Tally the position at which each gold paragraph appears among the predictions.

    Both files are read with ``json.load``. For every key of the prediction
    file, each gold paragraph stored under the same key in the gold file is
    looked up in that key's predicted paragraphs.

    Args:
        predicted_file_name: Path to the JSON file with the predicted paragraphs.
        gold_answers_file_name: Path to the JSON file with the gold paragraphs.

    Returns:
        A dict mapping position -> number of gold paragraphs found at that
        position, ordered by ascending position. Position -1 collects the gold
        paragraphs that were not predicted at all.
    """
    with open(predicted_file_name) as predicted_fp:
        predictions_by_key = json.load(predicted_fp)

    with open(gold_answers_file_name) as gold_fp:
        gold_by_key = json.load(gold_fp)

    tally = {}
    for key, retrieved in predictions_by_key.items():
        for gold_paragraph in gold_by_key[key]:
            try:
                position = retrieved.index(gold_paragraph)
            except ValueError:
                # Not among the predictions for this key.
                position = -1
            tally[position] = tally.get(position, 0) + 1

    return {position: tally[position] for position in sorted(tally)}

def get_file_name_given_sim_threshold(threshold=90):
    """Return the prediction and ground-truth JSON paths for one similarity threshold.

    Args:
        threshold: Similarity threshold; selects the ``sim_<threshold>`` directory.

    Returns:
        A tuple ``(predicted_file_name, gold_file_name)`` of relative paths.

    Raises:
        AssertionError: If either of the two files does not exist.
    """
    run_dir = f'predictions_saved/sim_{threshold}/ircot_flan_t5_large_hotpotqa____ircot____hotpotqa_to_hotpotqa__best'
    predicted_file_name = f'{run_dir}/prediction__hotpotqa_to_hotpotqa__test_subsampled.json'
    gold_file_name = f'{run_dir}/ground_truth__hotpotqa_to_hotpotqa__test_subsampled.json'

    # Both paths are returned, so both are validated; previously a missing
    # ground-truth file went unnoticed here.
    for file_name in (predicted_file_name, gold_file_name):
        assert os.path.isfile(file_name), f"File not found: {file_name}"

    return predicted_file_name, gold_file_name

In [None]:
""" get the counts of all indices for different similarity thresholds. """

sim_thresholds = [90, 95]
idx_counts = []

for sim_threshold in sim_thresholds:
    # Pass the threshold by keyword: a later cell redefines this function with
    # a different positional order (count first).
    predicted_file_name, gold_file_name = get_file_name_given_sim_threshold(threshold=sim_threshold)
    idx_count = get_idx_count(predicted_file_name, gold_file_name)
    idx_counts.append(idx_count)

# Different thresholds can hit different sets of positions. Use the union of all
# observed positions as the shared axis and give every threshold an explicit 0
# for positions it never hit, so the CSV and plot cells see aligned data.
indices = sorted(set().union(*idx_counts))
idx_counts = [{idx: counts.get(idx, 0) for idx in indices} for counts in idx_counts]

print(idx_counts)
print(indices)

In [None]:
# write sim_thresholds and idx_counts into csv
# where each row is a sim_threshold
# and columns are indices


output_path = 'sim_thresholds_idx_counts.csv'
# newline='' is what the csv module expects for files it writes to; without it
# extra blank rows can appear on platforms that translate line endings.
with open(output_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['sim_threshold'] + indices)
    for sim_threshold, idx_count in zip(sim_thresholds, idx_counts):
        # A position that was never hit for this threshold is written as 0
        # instead of raising KeyError.
        writer.writerow([sim_threshold] + [idx_count.get(idx, 0) for idx in indices])

# Report success only after the file has been closed (and therefore flushed).
print(f'Done writing to {os.path.abspath(output_path)}')

In [None]:
# Split each threshold's position -> count mapping into an (x, y) pair of lists.
x_y_pairs = []

for idx_count in idx_counts:
    x, y = list(idx_count), list(idx_count.values())
    x_y_pairs.append((x, y))

In [None]:
width = 0.3
num_series = len(x_y_pairs)

# Tick labels are the positions of the first threshold.
# (This used to read `x_y[0]`, but `x_y` was only ever a list-comprehension
# variable, so the cell raised NameError on a fresh kernel.)
all_x = x_y_pairs[0][0]
# Centre each tick under its group of side-by-side bars.
all_x_mid = [x + (num_series - 1) * width / 2 for x in all_x]

for series_idx, (series_x, series_y) in enumerate(x_y_pairs):
    # Shift each series right by one bar width so the bars sit next to each other.
    bar_x = [x + series_idx * width for x in series_x]
    plt.bar(bar_x, series_y, width=width)

    # Uncomment to print each bar's value above it.
    # for a, b in zip(bar_x, series_y):
    #     plt.text(a, b, str(b))


# description: plot the histogram of the indices of gold paragraphs in the list of predicted paragraphs
plt.title('#samples with gold paragraph at different positions')
plt.xlabel('Position of gold paragraph in the list of predicted paragraphs')
plt.ylabel('Number of samples')

plt.xticks(all_x_mid, all_x)

plt.legend(sim_thresholds)

# Similarity Threshold Exp on Retrieval (No QA!!!)

## exp-1. Read all metrics JSON files and convert them to CSV


In [16]:
import json
import csv
import os
from collections import defaultdict
# Root directory of the experiment checkout. Set the IRCOT_EXP2_PATH environment
# variable to run the notebook elsewhere; the default is the original location.
exp2_path = os.environ.get('IRCOT_EXP2_PATH', '/home/guest/r11944026/all_ircot/ircot_exp2/ircot')
# Similarity thresholds iterated over by the experiment cells below.
sim_thresholds = [99, 90, 70, 50, 30]
print(sim_thresholds)

[99, 90, 70, 50, 30]


In [None]:

def get_metric_file_name_given_sim_threshold(threshold=90, count=15):
    """Build the path of the evaluation-metrics JSON for one run and check it exists.

    Args:
        threshold: Similarity threshold embedded in the run directory name.
        count: BM25 retrieval count embedded in the run directory name.

    Returns:
        The metrics file path, rooted at the module-level ``exp2_path``.

    Raises:
        AssertionError: If no file exists at that path.
    """
    relative_path = (
        'predictions_archived/oner_qa/'
        f'oner_qa_flan_t5_base_hotpotqa_similarity_{threshold}'
        f'____prompt_set_1___bm25_retrieval_count__{count}___distractor_count__2/'
        'evaluation_metrics__hotpotqa_to_hotpotqa__dev_subsampled.json'
    )
    metric_file_name = os.path.join(exp2_path, relative_path)
    assert os.path.isfile(metric_file_name), f"File not found: {metric_file_name}"
    return metric_file_name

def get_metric_dict(metric_file_name):
    """Load a metrics JSON file and return its decoded content."""
    with open(metric_file_name) as metric_file:
        return json.load(metric_file)

def get_threshold_metric_dict(thresholds, count):
    """Load the metrics of every threshold for a given retrieval count.

    Args:
        thresholds: Iterable of similarity thresholds.
        count: Retrieval count forwarded to the file-name helper.

    Returns:
        A dict mapping each threshold to its loaded metrics, in iteration order.
    """
    return {
        threshold: get_metric_dict(get_metric_file_name_given_sim_threshold(threshold, count))
        for threshold in thresholds
    }

def convert_threshold_metric_dict_to_csv(threshold_metric_dict, count=15):
    """Write the metrics of every threshold to ``results/metric_count_<count>.csv``.

    The file is created under the module-level ``exp2_path``. It has one header
    row (``sim_threshold`` followed by the sorted metric names) and one row per
    threshold.

    Args:
        threshold_metric_dict: Dict of the form ``{threshold: metric_dict}``.
            Every metric_dict must contain the metric names of the first entry.
        count: Retrieval count; only used to name the output file.
    """
    # Take the column names from the mapping that was passed in. Indexing with
    # the global sim_thresholds[0] (as before) broke for any other threshold set.
    first_metric_dict = next(iter(threshold_metric_dict.values()), {})
    keys = sorted(first_metric_dict.keys())
    print("keys:", keys)
    output_path = os.path.join(exp2_path, f'results/metric_count_{count}.csv')
    # newline='' is what the csv module expects for files it writes to.
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['sim_threshold'] + keys)
        for threshold, metric_dict in threshold_metric_dict.items():
            writer.writerow([threshold] + [metric_dict[key] for key in keys])

    print(f'Done writing to {os.path.abspath(output_path)}')
    

In [3]:
# 5, 7, 9, 11, 13, 15
# Export one metrics CSV per odd retrieval count from 5 to 15.
for count in range(5, 16, 2):
    threshold_metric_dict = get_threshold_metric_dict(thresholds=sim_thresholds, count=count)
    convert_threshold_metric_dict_to_csv(threshold_metric_dict, count=count)
    print(f"Done for count = {count}")


keys: ['count', 'em', 'f1', 'git_hash', 'precision', 'recall', 'sp_em', 'sp_f1', 'sp_precision', 'sp_recall']
Done writing to /home/guest/r11944026/all_ircot/ircot_exp2/ircot/results/metric_count_5.csv
Done for count = 5
keys: ['count', 'em', 'f1', 'git_hash', 'precision', 'recall', 'sp_em', 'sp_f1', 'sp_precision', 'sp_recall']
Done writing to /home/guest/r11944026/all_ircot/ircot_exp2/ircot/results/metric_count_7.csv
Done for count = 7
keys: ['count', 'em', 'f1', 'git_hash', 'precision', 'recall', 'sp_em', 'sp_f1', 'sp_precision', 'sp_recall']
Done writing to /home/guest/r11944026/all_ircot/ircot_exp2/ircot/results/metric_count_9.csv
Done for count = 9
keys: ['count', 'em', 'f1', 'git_hash', 'precision', 'recall', 'sp_em', 'sp_f1', 'sp_precision', 'sp_recall']
Done writing to /home/guest/r11944026/all_ircot/ircot_exp2/ircot/results/metric_count_11.csv
Done for count = 11
keys: ['count', 'em', 'f1', 'git_hash', 'precision', 'recall', 'sp_em', 'sp_f1', 'sp_precision', 'sp_recall']
Done

## exp-2. Hit Position / MRR Analysis (kind of like top-K hits?)
* Analyze at which index the hit occurs
* Calculate MRR (mean reciprocal rank)

In [54]:
# for each gold paragraph, check whether it appears in the list of predicted paragraphs
# if yes, record the index of the gold paragraph in the list of predicted paragraphs
# if no, record the index as -1

def get_hit_position(predicted_file_name, gold_answers_file_name):
    """
    Count, for every gold paragraph, the position at which it was predicted.

    predicted_file_name: path to the JSON file containing predicted paragraphs per key
    gold_answers_file_name: path to the JSON file containing gold paragraphs per key
    returns: a dict mapping from hit position
        (the index of the gold paragraph in that key's predicted paragraphs)
        to the number of gold paragraphs found at that position,
        ordered by ascending position,
        where -1 means the gold paragraph is not among the predicted paragraphs
    """
    with open(predicted_file_name) as predicted_file:
        predictions = json.load(predicted_file)

    with open(gold_answers_file_name) as gold_file:
        references = json.load(gold_file)

    counts = defaultdict(int)
    for key, retrieved in predictions.items():
        for gold_paragraph in references[key]:
            position = retrieved.index(gold_paragraph) if gold_paragraph in retrieved else -1
            counts[position] += 1

    return {position: counts[position] for position in sorted(counts)}

def get_file_name_given_sim_threshold(count=30, threshold=90):
    """Return the prediction and ground-truth JSON paths of one retrieval run.

    NOTE: this redefines an earlier function of the same name whose only
    parameter is ``threshold``; a positional call written for that version is
    interpreted as ``count`` here.

    Args:
        count: Retrieval count; selects the ``count<count>`` directory.
        threshold: Similarity threshold embedded in the run directory name.

    Returns:
        A tuple ``(predicted_file_name, gold_file_name)`` rooted at the
        module-level ``exp2_path``.

    Raises:
        AssertionError: If either of the two files does not exist.
    """
    run_dir = f"predictions_archived/oner/count{count}/oner_hotpotqa_similarity_{threshold}____oner____hotpotqa_to_hotpotqa__best"
    # Alternative run directories (kept from the original commented-out code):
    #   predictions_archived/oner_qa/oner_qa_flan_t5_base_hotpotqa_similarity_{threshold}____prompt_set_1___bm25_retrieval_count__{count}___distractor_count__2
    #   predictions_archived/oner_qa/oner_qa_flan_t5_base_hotpotqa_similarity_{threshold}____oner_qa____hotpotqa_to_hotpotqa__best
    gold_file_name = os.path.join(exp2_path, f"{run_dir}/ground_truth__hotpotqa_to_hotpotqa__test_subsampled.json")
    predicted_file_name = os.path.join(exp2_path, f"{run_dir}/prediction__hotpotqa_to_hotpotqa__test_subsampled.json")

    # Both paths are returned, so both are validated; previously a missing
    # ground-truth file went unnoticed here.
    for file_name in (predicted_file_name, gold_file_name):
        assert os.path.isfile(file_name), f"File not found: {file_name}"

    return predicted_file_name, gold_file_name


In [55]:
import numpy as np

def mean_reciprocal_rank(hit_position):
    """
    Average the reciprocal rank 1 / (position + 1) over all counted entries.

    hit_position: a dict mapping from hit position
        (the index of the gold paragraph in the list of predicted paragraphs)
        to the number of entries counted at that position,
        where -1 means the gold paragraph is not in the list of predicted paragraphs
    returns: the mean reciprocal rank as a float; entries at position -1 count
        towards the total but contribute a reciprocal rank of 0.
        Returns 0.0 when the counts sum to zero.
    """
    num_samples = sum(hit_position.values())
    if num_samples == 0:
        # Nothing was counted (e.g. a mapping holding only zero-filled
        # positions): return 0.0 instead of dividing by zero.
        return 0.0

    mrr_score = 0.0
    for position, num_samples_at_position in hit_position.items():
        if position == -1:
            continue  # misses add nothing to the sum
        mrr_score += num_samples_at_position / (position + 1) / num_samples
    return mrr_score

In [56]:
""" get the counts of all indices for different similarity thresholds. """

# Positions written to the hit-position CSV: -1 (miss) and ranks 0..14.
# NOTE(review): for count > 15 any hit at position >= 15 still contributes to
# the MRR but gets no CSV column - confirm this truncation is intended.
indices = list(range(-1, 15))

for count in [15, 30, 50, 100]:
    hit_positions = []
    mrr_scores = []

    for sim_threshold in sim_thresholds:
        predicted_file_name, gold_file_name = get_file_name_given_sim_threshold(count, sim_threshold)
        hit_position = get_hit_position(predicted_file_name, gold_file_name)

        # for those indices that are not in the dict, set their counts to 0,
        # then re-sort so the zero-filled positions are not left at the end
        for idx in indices:
            hit_position.setdefault(idx, 0)
        hit_position = dict(sorted(hit_position.items()))

        hit_positions.append(hit_position)
        mrr_scores.append(mean_reciprocal_rank(hit_position))

    print(f"hit_position at count = {count}: {hit_positions}")

    # write sim_thresholds and hit positions into csv
    # (newline='' is what the csv module expects for files it writes to)
    output_path = os.path.join(exp2_path, f'results/count{count}_sim_thresholds_hit_positions.csv')
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['sim_threshold'] + indices)
        for sim_threshold, hit_position in zip(sim_thresholds, hit_positions):
            writer.writerow([sim_threshold] + [hit_position[idx] for idx in indices])
    print(f'Done writing to {os.path.abspath(output_path)}')

    # write mrr into csv
    output_path = os.path.join(exp2_path, f'results/count{count}_sim_thresholds_mrr.csv')
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['sim_threshold', 'mrr'])
        for sim_threshold, mrr_score in zip(sim_thresholds, mrr_scores):
            writer.writerow([sim_threshold, mrr_score])
    print(f'Done writing to {os.path.abspath(output_path)}')

    print("")

hit_position at count = 15: [{-1: 385, 0: 343, 1: 119, 2: 40, 3: 16, 4: 14, 5: 20, 6: 11, 7: 8, 8: 5, 9: 1, 10: 13, 11: 4, 12: 10, 13: 7, 14: 4}, {-1: 385, 0: 343, 1: 119, 2: 41, 3: 15, 4: 14, 5: 20, 6: 12, 7: 7, 8: 6, 9: 1, 10: 12, 11: 4, 12: 11, 13: 6, 14: 4}, {-1: 385, 0: 343, 1: 121, 2: 40, 3: 14, 4: 16, 5: 21, 6: 11, 7: 8, 8: 4, 9: 1, 10: 11, 11: 7, 12: 10, 13: 5, 14: 3}, {-1: 394, 0: 343, 1: 120, 2: 43, 3: 18, 4: 18, 5: 18, 6: 9, 7: 7, 8: 5, 9: 4, 10: 8, 11: 6, 12: 1, 13: 4, 14: 2}, {-1: 539, 0: 343, 1: 88, 2: 22, 3: 6, 5: 2, 4: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0}]
indices [-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Done writing to /home/guest/r11944026/all_ircot/ircot_exp2/ircot/results/count15_sim_thresholds_hit_positions.csv
Done writing to /home/guest/r11944026/all_ircot/ircot_exp2/ircot/results/count15_sim_thresholds_mrr.csv
hit_position at count = 30: [{-1: 385, 0: 343, 1: 119, 2: 40, 3: 16, 4: 14, 5: 20, 6: 11, 7: 8, 8: 5, 9: 1, 10: 13,

## exp-3. MAP
Calculate mean average precision (MAP).

In [57]:
def mean_average_precision(predicted_file_name, gold_answers_file_name):
    """
    Compute average precision per key and summarise it over all keys.

    For each key of the prediction file, the predicted paragraphs are walked in
    order; every first occurrence of a gold paragraph at rank r adds
    (number of gold paragraphs found so far) / r, and the sum is divided by the
    number of gold paragraphs for that key. Keys with no gold or no predicted
    paragraphs score 0.

    predicted_file_name: path to the JSON file containing predicted paragraphs per key
    gold_answers_file_name: path to the JSON file containing gold paragraphs per key
    returns: a dict with the 'mean', 'std', 'max' and 'min' of the per-key
        average precision (all 0.0 when the prediction file has no keys)
    """
    with open(predicted_file_name) as f:
        predicted_answers = json.load(f)

    with open(gold_answers_file_name) as f:
        gold_answers = json.load(f)

    average_precisions = []
    for key, predicted_paragraphs in predicted_answers.items():
        gold_paragraphs = gold_answers[key]
        if len(gold_paragraphs) == 0 or len(predicted_paragraphs) == 0:
            average_precisions.append(0)
            continue

        num_correct = 0
        precision_sum = 0.0
        # A list rather than a set: decoded JSON values may be unhashable.
        already_hit = []
        for rank, paragraph in enumerate(predicted_paragraphs, start=1):
            # Credit each gold paragraph once; a repeated prediction would
            # otherwise be counted again and could push the score above 1.
            if paragraph in gold_paragraphs and paragraph not in already_hit:
                already_hit.append(paragraph)
                num_correct += 1
                precision_sum += num_correct / rank
        average_precisions.append(precision_sum / len(gold_paragraphs))

    if not average_precisions:
        # np.max / np.min raise ValueError on an empty sequence.
        return {'mean': 0.0, 'std': 0.0, 'max': 0.0, 'min': 0.0}

    return {
        'mean': np.mean(average_precisions),
        'std': np.std(average_precisions),
        'max': np.max(average_precisions),
        'min': np.min(average_precisions),
    }

In [58]:
for count in [15, 30, 50, 100]:
    # Evaluate every threshold once and reuse the result for both the printed
    # summary and the CSV (previously each file pair was loaded and scored twice).
    map_results = []
    for sim_threshold in sim_thresholds:
        predicted_file_name, gold_file_name = get_file_name_given_sim_threshold(count, sim_threshold)
        map_result = mean_average_precision(predicted_file_name, gold_file_name)
        map_results.append(map_result)
        print(f"count={count}, sim_threshold={sim_threshold}, MAP={map_result['mean']}")

    # write to file (newline='' is what the csv module expects for files it writes to)
    output_path = os.path.join(exp2_path, f'results/count{count}_sim_thresholds_map.csv')
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['sim_threshold', 'mean', 'std', 'max', 'min'])
        for sim_threshold, map_result in zip(sim_thresholds, map_results):
            writer.writerow([sim_threshold, map_result['mean'], map_result['std'], map_result['max'], map_result['min']])
    print(f'Done writing to {os.path.abspath(output_path)}')

count=15, sim_threshold=99, MAP=0.4894315268065268
count=15, sim_threshold=90, MAP=0.48956390831390834
count=15, sim_threshold=70, MAP=0.4903341713841714
count=15, sim_threshold=50, MAP=0.4908047119547119
count=15, sim_threshold=30, MAP=0.4294166666666666
Done writing to /home/guest/r11944026/all_ircot/ircot_exp2/ircot/results/count15_sim_thresholds_map.csv
count=30, sim_threshold=99, MAP=0.4894315268065268
count=30, sim_threshold=90, MAP=0.489830574980575
count=30, sim_threshold=70, MAP=0.49108801753801756
count=30, sim_threshold=50, MAP=0.4930643134643135
count=30, sim_threshold=30, MAP=0.43250833333333333
Done writing to /home/guest/r11944026/all_ircot/ircot_exp2/ircot/results/count30_sim_thresholds_map.csv
count=50, sim_threshold=99, MAP=0.4894315268065268
count=50, sim_threshold=90, MAP=0.489830574980575
count=50, sim_threshold=70, MAP=0.4912213508713509
count=50, sim_threshold=50, MAP=0.4937865356865357
count=50, sim_threshold=30, MAP=0.43498055555555554
Done writing to /home/gue