In [1]:
import os
import sqlite3
import json
import unicodedata

# Dataset selector: "wice" is paired with the 2023 dump, any other value with 2017.
foldername = "wice"
if foldername == "wice":
    year = "2023"
else:
    year = "2017"


# Input and output roots, both resolved two levels above the working directory.
BASE_PATH = os.path.join(os.path.abspath(os.curdir), "..", "..", "data")
OUT_PATH = os.path.join(os.path.abspath(os.curdir), "..", "..", "out")
DB_PATH = os.path.join(
    BASE_PATH,
    "db_files",
    f"enwiki-{year}-original-full.db",
)

# NOTE(review): the connection is left open; nothing in the visible cells closes it.
conn = sqlite3.connect(DB_PATH)
wiki_db = conn.cursor()
wiki_db.execute("SELECT id FROM documents ORDER BY id COLLATE NOCASE ASC")
results = wiki_db.fetchall()

# Map each `documents.id` value (looked up as a page title by the cells below)
# to its 0-based position in the case-insensitively sorted result.
title_id_dict = {row[0]: position for position, row in enumerate(results)}

### LABELED

In [2]:
# Running total of supporting-fact titles that dict_ids could not find in title_id_dict.
count = 0
# Labeled dev-release file of the selected dataset.
DEV_FILE = os.path.join(
    BASE_PATH,
    foldername,
    f"{foldername}_dev_release_v1.1.json",
)

def dict_ids(json_file):
    """Build the labeled ``{uid: {doc_index: 1}}`` mapping from supporting facts.

    Args:
        json_file: iterable of items, each with a ``'uid'`` key and a
            ``'supporting_facts'`` list whose entries carry the page title
            at position 0.

    Returns:
        dict mapping every ``uid`` to a dict whose keys are the
        ``title_id_dict`` values of the NFD-normalized titles that were found;
        each value is ``1``. Items without any match map to an empty dict.

    Side effects:
        Adds one to the module-level ``count`` for every supporting fact whose
        title is missing from ``title_id_dict``. The counter is not reset
        here, so it accumulates across calls.
    """
    global count
    labels_by_uid = {}
    for record in json_file:
        doc_flags = {}
        labels_by_uid[record['uid']] = doc_flags
        for fact in record['supporting_facts']:
            # Titles are compared in NFD form against the title_id_dict keys.
            normalized = unicodedata.normalize("NFD", fact[0])
            if normalized not in title_id_dict:
                count += 1
                continue
            doc_flags[title_id_dict[normalized]] = 1

    return labels_by_uid

# JSON text is UTF-8; state the encoding explicitly so decoding does not depend
# on the platform's default locale encoding.
with open(DEV_FILE, "r", encoding="utf-8") as dev_file:
    claim_json = json.load(dev_file)

original_labeled = dict_ids(claim_json)
# Number of supporting-fact titles that were not found in title_id_dict.
print(count)

# Write below OUT_PATH, like the retrieval cells do, instead of a second
# hard-coded '../../out' path, and make sure the target directory exists.
labeled_out_dir = os.path.join(OUT_PATH, "doc_retrieval")
os.makedirs(labeled_out_dir, exist_ok=True)
labeled_out_path = os.path.join(labeled_out_dir, f"{foldername}_dev_dict.json")
with open(labeled_out_path, "w", encoding="utf-8") as out_file:
    json.dump(original_labeled, out_file, indent=4)

20


### RESULTS

In [3]:
# Titles from the most recent dict_ids_results call that were missing from title_id_dict.
count = set()
def dict_ids_results(json_file, id_key='uid'):
    """Build the retrieved ``{id: {doc_index: 1}}`` mapping from retrieval output.

    Args:
        json_file: iterable of items, each with an ``id_key`` entry and a
            ``'context'`` list whose entries carry the page title at position 0.
        id_key: name of the key holding the item's identifier.

    Returns:
        dict mapping every identifier to a dict whose keys are the
        ``title_id_dict`` values of the NFD-normalized context titles that
        were found; each value is ``1``.

    Side effects:
        Rebinds the module-level ``count`` to a fresh set containing the
        normalized titles of this call that are missing from ``title_id_dict``.
    """
    global count
    count = set()
    uid_to_title_dict = {}
    for item in json_file:
        doc_flags = {}
        uid_to_title_dict[item[id_key]] = doc_flags
        # Only the `json_file` argument is read here; the function does not
        # depend on any other notebook variable holding retrieval data.
        for fact in item['context']:
            title = unicodedata.normalize("NFD", fact[0])
            if title in title_id_dict:
                doc_flags[title_id_dict[title]] = 1
            else:
                count.add(title)
    return uid_to_title_dict

#### BM25

In [4]:
# Index settings whose BM25 document-retrieval output is converted.
setting_files = ["original-full", "cite-full", "claim-full", "fusion-full"]

for setting in setting_files:
    DOC_PATH = os.path.join(BASE_PATH, "hover_files", foldername, "bm25", setting, "doc_retrieval", "hover_dev_doc_retrieval.json")
    # Explicit UTF-8: JSON decoding must not depend on the platform default encoding.
    with open(DOC_PATH, "r", encoding="utf-8") as in_file:
        claim_json = json.load(in_file)

    results_labeled = dict_ids_results(claim_json)
    setting_name = setting.split('-')[0]
    # `count` holds the retrieved titles that were not found in title_id_dict.
    print(setting_name, len(count))

    # Create the target directory so a fresh checkout does not fail on the write.
    bm25_out_dir = os.path.join(OUT_PATH, "doc_retrieval", "bm25", foldername)
    os.makedirs(bm25_out_dir, exist_ok=True)
    # NOTE(review): this file name uses the full `setting`, while the FAISS and
    # JPQ cells use `setting_name`; kept as-is so existing consumers still find it.
    bm25_out_path = os.path.join(bm25_out_dir, f"{foldername}_dev_dict_bm25_{setting}.json")
    with open(bm25_out_path, "w", encoding="utf-8") as out_file:
        json.dump(results_labeled, out_file, indent=4)

original 0
cite 0
claim 0
fusion 0


#### FAISS

python3 -m src.retrieval.faiss.run_faiss_search --dataset_name=hover --setting=enwiki-2017-cite-full --hover_stage=sent_retrieval --precompute_embed --use_gpu  
python3 -m src.retrieval.faiss.run_faiss_search --dataset_name=hover --setting=enwiki-2017-claim-full --hover_stage=sent_retrieval --precompute_embed --use_gpu  
python3 -m src.retrieval.faiss.run_faiss_search --dataset_name=hover --setting=enwiki-2017-fusion-full --hover_stage=sent_retrieval --precompute_embed --use_gpu  

python3 -m src.retrieval.faiss.run_faiss_search --dataset_name=wice --setting=enwiki-2023-cite-full --hover_stage=sent_retrieval --precompute_embed --use_gpu  
python3 -m src.retrieval.faiss.run_faiss_search --dataset_name=wice --setting=enwiki-2023-claim-full --hover_stage=sent_retrieval --precompute_embed --use_gpu  
python3 -m src.retrieval.faiss.run_faiss_search --dataset_name=wice --setting=enwiki-2023-fusion-full --hover_stage=sent_retrieval --precompute_embed --use_gpu  

In [5]:
# Index settings whose FAISS sentence-retrieval output is converted.
setting_files = ["original-full-select", "cite-full", "claim-full", "fusion-full"]
for setting in setting_files:
    DOC_PATH = os.path.join(BASE_PATH, "hover_files", foldername, "faiss", setting, "sent_retrieval", "hover_dev_sent_retrieval.json")
    # Explicit UTF-8: JSON decoding must not depend on the platform default encoding.
    with open(DOC_PATH, "r", encoding="utf-8") as in_file:
        claim_json = json.load(in_file)
    # These files identify each item by 'id' rather than the default 'uid'.
    results_labeled = dict_ids_results(claim_json, id_key='id')
    setting_name = setting.split('-')[0]
    # `count` holds the retrieved titles that were not found in title_id_dict.
    print(setting_name, len(count))
    # Create the target directory so a fresh checkout does not fail on the write.
    faiss_out_dir = os.path.join(OUT_PATH, "doc_retrieval", "faiss", foldername)
    os.makedirs(faiss_out_dir, exist_ok=True)
    faiss_out_path = os.path.join(faiss_out_dir, f"{foldername}_dev_dict_faiss_{setting_name}.json")
    with open(faiss_out_path, "w", encoding="utf-8") as out_file:
        json.dump(results_labeled, out_file, indent=4)

original 0
cite 0
claim 0
fusion 0


#### JPQ

python -m src.retrieval.JPQ.run_inference --dataset_name=hover --setting=enwiki-2017-cite-full --subvectors_num 96 --sent_select --use_gpu  
python -m src.retrieval.JPQ.run_inference --dataset_name=hover --setting=enwiki-2017-claim-full --subvectors_num 96 --sent_select --use_gpu  
python -m src.retrieval.JPQ.run_inference --dataset_name=hover --setting=enwiki-2017-fusion-full --subvectors_num 96 --sent_select --use_gpu  

python -m src.retrieval.JPQ.run_inference --dataset_name=wice --setting=enwiki-2023-cite-full --subvectors_num 96 --sent_select --use_gpu  
python -m src.retrieval.JPQ.run_inference --dataset_name=wice --setting=enwiki-2023-claim-full --subvectors_num 96 --sent_select --use_gpu  
python -m src.retrieval.JPQ.run_inference --dataset_name=wice --setting=enwiki-2023-fusion-full --subvectors_num 96 --sent_select --use_gpu  

In [6]:
# Index settings whose JPQ ("-compress") sentence-retrieval output is converted.
setting_files = ["original-full-compress-select", "cite-full-compress", "claim-full-compress", "fusion-full-compress"]
for setting in setting_files:
    # NOTE(review): the JPQ results are read from the "faiss" folder (the
    # "*-compress" settings) — confirm this is where run_inference writes them.
    DOC_PATH = os.path.join(BASE_PATH, "hover_files", foldername, "faiss", setting, "sent_retrieval", "hover_dev_sent_retrieval.json")
    # Explicit UTF-8: JSON decoding must not depend on the platform default encoding.
    with open(DOC_PATH, "r", encoding="utf-8") as in_file:
        claim_json = json.load(in_file)
    # These files identify each item by 'id' rather than the default 'uid'.
    results_labeled = dict_ids_results(claim_json, id_key='id')
    setting_name = setting.split('-')[0]
    # `count` holds the retrieved titles that were not found in title_id_dict.
    print(setting_name, len(count))
    # Create the target directory so a fresh checkout does not fail on the write.
    jpq_out_dir = os.path.join(OUT_PATH, "doc_retrieval", "jpq", foldername)
    os.makedirs(jpq_out_dir, exist_ok=True)
    jpq_out_path = os.path.join(jpq_out_dir, f"{foldername}_dev_dict_jpq_{setting_name}.json")
    with open(jpq_out_path, "w", encoding="utf-8") as out_file:
        json.dump(results_labeled, out_file, indent=4)

original 0
cite 0
claim 0
fusion 0
