In [None]:
# Extract the citation context of every claim file and save them to contexts_multivers.json
def get_context(file_path):
    """Collect the citation context of every claim file and save them as JSON.

    ``file_path`` is a directory containing one sub-folder per paper. Every
    file in a sub-folder whose name contains "json" is loaded and must provide
    ``citing_paragraph`` (a string) and ``citation_context`` (a list of spans
    with ``start``/``end`` offsets). The context of a file is the paragraph
    text between the smallest ``start`` and the largest ``end``, stripped and
    with newlines replaced by spaces.

    The result is written to ``contexts_multivers.json`` in the parent
    directory of ``file_path`` as ``{folder: {file: context}}``. Files whose
    context is empty are left out and ``folder + file`` is printed instead.

    Args:
        file_path: Directory holding the per-paper sub-folders.
    """
    # dirname() also copes with a path that contains no "/"; slicing at
    # rfind("/") == -1 would silently drop the last character of the path.
    output_path = os.path.dirname(file_path)
    contexts = {}
    for folder in sorted(os.listdir(file_path)):
        folder_path = os.path.join(file_path, folder)
        # Skip Finder metadata and any stray regular file: only directories
        # can be listed below.
        if ".DS_Store" in folder or not os.path.isdir(folder_path):
            continue
        contexts[folder] = {}

        for file in sorted(os.listdir(folder_path)):  # folder = pmcid
            if "json" not in file:
                continue
            with open(os.path.join(folder_path, file), encoding="utf-8") as f:
                data = json.load(f)

            spans = data['citation_context']
            if spans:
                # The context runs from the earliest span start to the latest span end.
                start = min(span['start'] for span in spans)
                end = max(span['end'] for span in spans)
                claim = data['citing_paragraph'][start:end].strip().replace('\n', ' ')
            else:
                claim = ''

            if claim:
                contexts[folder][file] = claim
            else:
                # Report files without a usable context instead of storing "".
                print(f'{folder}{file}')

    with open(os.path.join(output_path, 'contexts_multivers.json'), 'w') as f:
        json.dump(contexts, f, indent=4)

# Build contexts_multivers.json for each split, in this order.
# NOTE(review): test_file_path, dev_file_path and train_file_path are not
# defined in the cells visible here, and os/json are only imported in a later
# cell -- confirm they are available before this cell runs on a fresh kernel.
get_context(test_file_path)
get_context(dev_file_path)
get_context(train_file_path)

In [None]:
# Flatten each JSON-lines file into a single list of sentences, saved under the Sentences folder
def generate_sentences_forbm25(path):
    """Flatten every JSON-lines file in ``path`` into one list of sentences.

    Each non-blank line of an input file is parsed as a JSON object and the
    items of its ``sentences`` entry are appended, in order, to one list per
    file. That list is written as JSON to ``<parent of path>/Sentences/<file>``
    using the same file name as the input.

    Args:
        path: Directory containing the JSON-lines input files.
    """
    # dirname() also copes with a path that contains no "/"; slicing at
    # rfind("/") == -1 would silently drop the last character of the path.
    result_folder = os.path.dirname(path)
    print(result_folder)

    # Create the output folder up front so the first write cannot fail with
    # FileNotFoundError.
    sentences_dir = os.path.join(result_folder, 'Sentences')
    os.makedirs(sentences_dir, exist_ok=True)

    for file in os.listdir(path):
        source = os.path.join(path, file)
        # Skip Finder metadata and sub-directories, which cannot be parsed.
        if ".DS_Store" in file or not os.path.isfile(source):
            continue
        print(file)
        sents = []
        with open(source, encoding="utf-8") as f:
            # Stream the file line by line instead of loading it whole.
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. a trailing newline
                sents.extend(json.loads(line)['sentences'])
        with open(os.path.join(sentences_dir, file), 'w') as f:
            json.dump(sents, f, indent=4)

# Write the flattened sentence lists for each split, in this order.
# NOTE(review): test_path, train_path and dev_path are not defined in the
# cells visible here -- confirm where they are set before running this cell.
generate_sentences_forbm25(test_path)
generate_sentences_forbm25(train_path)
generate_sentences_forbm25(dev_path)

In [None]:
import json
import locale
import os
import subprocess
import sys
import time

from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5

from pyserini.search.lucene import LuceneSearcher
import pyserini.index.lucene

# MonoT5 reranker loaded from the 'castorini/monot5-base-med-msmarco'
# checkpoint; created once at module level and used by get_relevant_sentences.
reranker =  MonoT5('castorini/monot5-base-med-msmarco')
# Passed by main() as num_bm25: how many BM25 hits are retrieved per claim.
BM25_k = 60
# Passed by main() as num: at most this many reranked sentences are kept per claim.
TOP_n = 50

In [None]:
# Module-level accumulators filled by get_relevant_sentences. They are not
# reset between calls, so re-running that function keeps appending to them.
# results: {pmcid: {claim file: [(sentence, score), ...]}}
results = {}
# outliers: (pmcid, claim file) pairs that ended up with fewer than `num` sentences
outliers = []
# scores: every reranker score that was kept, across all claims
scores = []
# Split name used to build the folder names "{DATATYPE}_input_Data" and "Results_{DATATYPE}".
DATATYPE = "Train"
# Folder main() reads the per-paper sentence lists from.
SENTENCE_FOLDER = f"{DATATYPE}_input_Data/Sentences"

In [None]:
def get_relevant_sentences(cs, num_bm25=60, num=50):
    """Retrieve and rerank candidate sentences for every claim.

    For each paper id in ``cs`` the Lucene index
    ``{DATATYPE}_input_Data/indexes/<pmcid>`` is searched with each claim, the
    hits are reranked with the module-level ``reranker`` and the first ``num``
    reranked sentences are stored as ``(text, score)`` pairs in the
    module-level ``results`` dict.

    Side effects:
        * ``results``, ``outliers`` and ``scores`` (module level) are extended.
        * ``Results_{DATATYPE}/results_goldsent_ccsent.json`` is rewritten
          after every paper, so partial results survive an interruption.
        * ``Results_{DATATYPE}/outliers.txt`` and ``scores.txt`` are written
          once at the end.

    Args:
        cs: Mapping ``{pmcid: {claim file: claim text}}``.
        num_bm25: Number of hits requested from the searcher per claim.
        num: Maximum number of reranked sentences kept per claim.
    """
    global results

    # Create the output folder up front: otherwise the first write fails with
    # FileNotFoundError only after a whole paper has been searched and reranked.
    output_dir = f"Results_{DATATYPE}"
    os.makedirs(output_dir, exist_ok=True)

    for pmcid in cs:
        searcher = LuceneSearcher(f'{DATATYPE}_input_Data/indexes/{pmcid}')
        results[pmcid] = {}

        for idx, (file, claim) in enumerate(cs[pmcid].items(), start=1):
            hits = searcher.search(claim, k=num_bm25)

            if len(hits) == 0:
                print(f'Claim {idx}, {claim}')

            # Each hit stores the raw indexed JSON document; its "contents"
            # field holds the sentence text.
            sentences = [json.loads(hit.raw)['contents'] for hit in hits]
            texts = [Text(sentence, {'sent_idx': i}, 0) for i, sentence in enumerate(sentences)]
            # presumably returned best-first by the reranker -- confirm against pygaggle
            reranked = reranker.rerank(Query(claim), texts)

            # Remember claims for which fewer than `num` sentences are available.
            if len(reranked) < num:
                outliers.append((pmcid, file))

            current_sentences = [(item.text, item.score) for item in reranked[:max(num, 0)]]
            scores.extend(score for _, score in current_sentences)
            results[pmcid][file] = current_sentences

        # Checkpoint everything collected so far after each paper.
        with open(f"{output_dir}/results_goldsent_ccsent.json", 'w') as f:
            json.dump(results, f, indent=4)

    # NOTE(review): both files contain the str() of the list wrapped in a JSON
    # string, not a JSON array; kept as is so existing readers do not break.
    with open(f"{output_dir}/outliers.txt", 'w') as f1:
        json.dump(str(outliers), f1, indent=4)

    with open(f"{output_dir}/scores.txt", 'w') as f1:
        json.dump(str(sorted(scores)), f1, indent=4)

    return



def _write_lucene_documents():
    """Write one ``documents.json`` per sentence file and return the file names processed."""
    documents_root = f"{DATATYPE}_input_Data/LuceneDocuments"
    files = []
    for file in sorted(os.listdir(SENTENCE_FOLDER)):
        if "json" not in file:
            continue
        with open(f"{SENTENCE_FOLDER}/{file}") as f:
            sentences = json.load(f)

        # One document per sentence; the id encodes the sentence position.
        documents = [{"id": f"sent_{i}", "contents": s} for i, s in enumerate(sentences)]

        # NOTE(review): file[:-5] assumes the name ends in ".json" -- confirm.
        document_dir = f"{documents_root}/{file[:-5]}"
        # exist_ok makes a re-run overwrite the documents instead of raising
        # FileExistsError on the already existing folder.
        os.makedirs(document_dir, exist_ok=True)
        with open(f"{document_dir}/documents.json", "w") as f:
            json.dump(documents, f, indent=4)
        files.append(file)
    return files


def _build_indexes(files):
    """Run the Pyserini Lucene indexer in a subprocess, once per sentence file."""
    for file in files:
        command = [
            # Use the interpreter running this code so the subprocess sees the
            # same installed packages; a bare "python" may resolve elsewhere.
            sys.executable, "-m", "pyserini.index.lucene",
            "--collection", "JsonCollection",
            "--input", f"{DATATYPE}_input_Data/LuceneDocuments/{file[:-5]}",
            "--index", f"{DATATYPE}_input_Data/indexes/{file[:-5]}",
            "--generator", "DefaultLuceneDocumentGenerator",
            "--threads", "1",
            "--storePositions", "--storeDocvectors", "--storeRaw"
        ]
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0:
            # Show what the indexer reported and keep going with the next
            # file, but do not claim that this one completed.
            print("Error occurred:")
            print(result.stderr.decode(errors="replace"))
            print(result.stdout.decode(errors="replace"))
            print(f"Indexing of {file} failed")
            continue
        print(f"Completed indexing of {file}")


def main():
    """Run the whole pipeline for the ``DATATYPE`` split.

    Steps:
        1. Turn every sentence list in ``SENTENCE_FOLDER`` into a
           ``documents.json`` under ``{DATATYPE}_input_Data/LuceneDocuments``.
        2. Build one Lucene index per paper under ``{DATATYPE}_input_Data/indexes``.
        3. Load ``{DATATYPE}_contexts_goldsent_ccsent.json`` and call
           ``get_relevant_sentences`` with ``BM25_k`` and ``TOP_n``.

    Timings of the indexing and of the retrieval/reranking step are printed.
    """
    files = _write_lucene_documents()

    # Force the reported preferred encoding to UTF-8 for the rest of the
    # process. NOTE(review): looks like a workaround for environments with a
    # non-UTF-8 locale -- confirm it is still needed.
    locale.getpreferredencoding = lambda: "UTF-8"

    os.makedirs(f'{DATATYPE}_input_Data/indexes', exist_ok=True)

    os.system('clear') #clean the output of terminal
    print("Starting Indexing of the files")

    start_indexing = time.time()
    _build_indexes(files)
    end_indexing = time.time()
    print(f"Indexing took {end_indexing-start_indexing:.2f} secs")

    # The reranker is created at module level, not here.
    os.system('clear')

    with open(f'{DATATYPE}_input_Data/{DATATYPE}_contexts_goldsent_ccsent.json') as f:
        cs = json.load(f)

    start_bm25 = time.time()

    print("Starting Bm25 and T5 Reranker")
    get_relevant_sentences(cs, num_bm25=BM25_k, num=TOP_n)

    end_bm25 = time.time()
    # The indexing time is repeated because the screen was cleared after it
    # was first printed.
    print(f"Indexing took {end_indexing-start_indexing:.2f} secs")
    print(f"Bm_25 and T5 Reranker took {end_bm25-start_bm25:.2f} secs")


# Run the pipeline only when this code is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    main()