In [None]:
import pandas as pd
import numpy as np

### Data Preparation

In [None]:
'''
Every year, the National Library of Medicine (NLM) releases MEDLINE, a snapshot of all the PubMed papers
currently available in its library (www.nlm.nih.gov/databases/download/pubmed_medline.html).

The latest snapshot for 2022 includes 33.4M paper abstracts and metadata. Not all of them are suitable, so
we removed all the papers with empty abstracts, with unfinished abstracts, and with non-English abstracts.

This yields 20.6M paper instances. It can be downloaded from: https://zenodo.org/records/7849020

There is a separate file for abstracts and another one for metadata (authors, journal, publication year...).

'''


#Paper metadata table, read from the metadata CSV.
metadata = pd.read_csv("./data/pubmed_landscape_data.csv")

#Abstracts table; the AbstractText column is additionally kept as a plain Python list.
abstracts = pd.read_csv("./data/pubmed_landscape_abstracts.csv")
abstracts_text = abstracts["AbstractText"].tolist()

In [None]:
'''
Code for loading the three datasets used in our experiments.
'''
import json

#HealthFC: questions come from the `en_claim` column, gold answers from `label`.
healthfc_df = pd.read_csv("healthFC_annotated.csv")
hfc_questions = healthfc_df["en_claim"].tolist()
hfc_labels = healthfc_df["label"].tolist()


#TREC-Health: questions come from the `description` column, gold answers from `label`.
trec_df = pd.read_csv("trec_health.csv")
trec_questions = trec_df["description"].tolist()
trec_labels = trec_df["label"].tolist()


#BioASQ-7b
#JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
#relying on the platform default.
with open("BioASQ-train-yesno-7b.json", "r", encoding="utf-8") as f:
    bioasq_content = json.load(f)

#Keep the first occurrence of every distinct question together with its answers.
#Only the first QA pair of each paragraph is read.
bioasq_questions = list()
bioasq_answers = list()
seen_questions = set()  #O(1) membership test instead of scanning the growing list
for paragraph in bioasq_content["data"][0]["paragraphs"]:
    first_qa = paragraph["qas"][0]
    question = first_qa["question"]
    answer = first_qa["answers"]
    if question in seen_questions:
        continue
    seen_questions.add(question)
    bioasq_questions.append(question)
    bioasq_answers.append(answer)


In [None]:
'''
Create a sparse index for BM25 search. We use the library retriv.
'''

from retriv import SparseRetriever

#Configure the BM25 sparse retriever. Nothing is indexed yet at this point;
#the keyword arguments below are passed to retriv unchanged.
sr = SparseRetriever(
  index_name="pubmed-index",
  model="bm25",
  min_df=10,  # presumably the minimum document frequency of an indexed term -- TODO confirm in the retriv docs
  tokenizer="whitespace",
  stemmer="english",
  stopwords="english",
  do_lowercasing=True,
  do_ampersand_normalization=True,
  do_special_chars_normalization=True,
  do_acronyms_normalization=True,
  do_punctuation_removal=True,
)

#CSV file whose rows are indexed (one document per row).
corpus_path = "/mnt/mydrive/PubMed/pubmed_landscape_abstracts.csv"


#Construct the inverted index and measure how long it takes.
import time
start = time.time()

#The callback maps every CSV row to the {"id", "text"} record that gets indexed:
#the PMID column becomes the document id, the AbstractText column the indexed text.
sr = sr.index_file(
  path=corpus_path,  # File kind is automatically inferred
  show_progress=True,         # Default value
  callback=lambda doc: {      # Callback defaults to None.
    "id": doc["PMID"],
    "text": doc["AbstractText"],          
    }
  )

#Elapsed wall-clock time in seconds (time.time() differences).
duration = time.time() - start
print(duration) #Duration: 5772.615013837814


#Serialize the inverted index and save it as a pickled file.
import pickle

#The context manager flushes and closes the file even if pickling fails; the
#original left the handle open, so the pickle could be incomplete on disk.
#NOTE(review): a later cell loads '/mnt/mydrive/PubMed/pickled_sr', which is a
#different path from the one written here -- confirm which location is intended.
with open('/mnt/mydrive/pickled_sr', 'wb') as file:
    pickle.dump(sr, file)


In [None]:

import pickle
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

# Open the file with the pickled inverted index (sparse retriever).
# The context manager closes the handle even when unpickling raises.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load a pickle that was produced by a trusted run of this notebook.
with open('/mnt/mydrive/PubMed/pickled_sr', 'rb') as file:
    inverted_index = pickle.load(file)

#Load the dataset HealthFC, as an example.
df = pd.read_csv("healthFC_annotated.csv")
claims = df["en_claim"].tolist()
labels = df["label"].tolist()


#Create a list of queries from the claims: one {"id", "text"} record per claim,
#where the id is the claim's position (as a string) and the text is lowercased.
query_list = [
    {"id": str(position), "text": claim_text.lower()}
    for position, claim_text in enumerate(claims)
]

#Retrieve the top 100 results for each query (claim)
results = inverted_index.msearch(queries=query_list, cutoff=100)

#Get all the document IDs from results: one list of integer ids per claim,
#in the order the retriever returned them.
all_ids = [
    [int(doc_id) for doc_id in document_results]
    for document_results in results.values()
]
    

#Split every retrieved abstract into sentences and store the results in ONE flat
#list: each entry is the sentence list of a single (claim, document) pair, in
#retrieval order. Later cells address it as `claim_position*100 + rank`, which
#only lines up if every claim returned exactly 100 documents.
#NOTE(review): the index above is built with the PMID as the document id, yet
#`doc_id` is used here as a row position into `abstracts_text`. Confirm the ids
#are row positions; otherwise a PMID -> abstract lookup is needed.
claim_sentences = list()
for ids in all_ids:
    for doc_id in ids:
        claim_sentences.append(sent_tokenize(abstracts_text[doc_id]))
    

#torch.topk is used below, but torch is only imported in a later cell; import it
#here so this cell runs on a fresh kernel (Restart & Run All).
import torch

#Load sentence transformer for selecting evidence sentences.
model = SentenceTransformer('copenlu/spiced')
print("loaded sentence model!")


#Find top 3 most similar sentences to query from each document, to condense it to most important parts.
#`top_sentences` gets one array of sentence positions per (claim, document) pair.
top_sentences = list()
for idx in range(len(claims)):
    claim = claims[idx]
    print(idx)

    #The claim embedding is the same for all of this claim's documents, so it is
    #computed once per claim rather than once per document.
    claim_embedding = model.encode(claim, convert_to_tensor=True)

    for j in range(100):
        #Flat position of the j-th retrieved document of this claim.
        start_index = idx*100 + j

        sents = claim_sentences[start_index]
        sents_embeddings = model.encode(sents, convert_to_tensor=True)
        cos_scores = util.cos_sim(claim_embedding, sents_embeddings)[0]

        #Abstracts with fewer than three sentences contribute all of them.
        new_k = min(3, len(sents))
        top_results = torch.topk(cos_scores, k=new_k)

        #Element 1 of the topk result holds the positions of the selected sentences.
        np_results = top_results[1].detach().cpu().numpy()
        top_sentences.append(np_results)

#For every (claim, document) pair, pick the selected sentences and put them back
#into their original order inside the abstract (the positions are sorted ascending).
selected_sentences = [
    np.array(claim_sentences[position])[np.sort(picked)]
    for position, picked in enumerate(top_sentences)
]
  

# Create a joint list of concatenated claims and evidence, in form of "claim [SEP] evidence1 evidence2 evidence3"
# Line breaks inside a sentence are replaced by spaces (as the top-j pipeline
# further down already does). The examples are later written one per line, so an
# embedded newline would split an example in two.
joint_list = list()
for idx in range(len(claims)):
    claim = claims[idx]
    for j in range(100):
        #Flat position of the j-th retrieved document of this claim.
        start_index = idx*100 + j
        evidence = "".join(
            s.replace("\n", " ") + " " for s in selected_sentences[start_index]
        )
        joint_list.append(claim + " [SEP] " + evidence)
        

#Write one "claim [SEP] evidence" example per line.
with open("jointselect_healthfc_100.txt", "w") as f:
    f.writelines(line + "\n" for line in joint_list)

#Read the file back; only stripped lines that contain the separator are kept.
with open("jointselect_healthfc_100.txt", "r") as f:
    lines = f.readlines()
flatlines = [text for text in (raw.strip() for raw in lines) if "[SEP]" in text]
   
    

In [None]:
import torch 
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

#A PyTorch class for our dataset, containing text encodings and final labels (answers).
class QADataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: mapping from field name to an indexable collection with one
            entry per example (tensors or nested lists both work).
        labels: indexable collection of labels; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with every encoding field of example `idx` plus 'labels'."""
        # torch.as_tensor reuses an existing tensor and converts lists/arrays.
        # torch.tensor(<tensor>) copy-constructs and emits a UserWarning for
        # every single item when the encodings are already tensors.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)
    
    
#Final QA answer prediction loop. 
def get_result(input_list, model, tokenizer, batch_size=16, device=None):
    """Run the classifier over `input_list` and return per-class probabilities.

    Args:
        input_list: list of input strings, one per example.
        model: sequence-classification model; called with `input_ids` and
            `attention_mask`, and element 0 of its output is used as the logits.
        tokenizer: tokenizer callable matching `model`.
        batch_size: number of examples per forward pass (default 16, as before).
        device: device to run on; defaults to "cuda" when available, else "cpu".

    Returns:
        numpy array of shape (len(input_list), 3); each row is the softmax over
        the three output classes, in the model's own label order.
    """
    if device is None:
        # The original hard-coded "cuda", which fails on machines without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # `truncation=` replaces the deprecated `truncation_strategy=` keyword.
    # Every input is a single sequence, so 'only_first' truncates that sequence
    # to the tokenizer's maximum length.
    input_encoded = tokenizer(input_list, return_tensors='pt',
                              truncation='only_first', add_special_tokens=True, padding=True)

    # Labels are not needed for inference; zeros only satisfy the dataset interface.
    input_dataset = QADataset(input_encoded, np.zeros(len(input_list)))

    test_loader = DataLoader(input_dataset, batch_size=batch_size,
                             drop_last=False, shuffle=False, num_workers=4)

    model.eval()
    model = model.to(device)
    result = np.zeros((len(test_loader.dataset), 3))
    index = 0  # row of `result` where the next batch starts
    with torch.no_grad():
        for batch_num, instances in enumerate(test_loader):
            print(batch_num)
            input_ids = instances["input_ids"].to(device)
            attention_mask = instances["attention_mask"].to(device)
            logits = model(input_ids=input_ids,
                           attention_mask=attention_mask)[0]
            probs = logits.softmax(dim=1)
            result[index : index + probs.shape[0]] = probs.cpu().numpy()
            index += probs.shape[0]

    return result


#Load the NLI model and its tokenizer from the same checkpoint name.
#model_max_length=1024 is handed to the tokenizer.
#NOTE(review): confirm the checkpoint supports inputs of that length.
MODEL_NAME = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=1024)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

#Run the QA final prediction loop.
#`result` holds one row of three class probabilities per entry of `flatlines`.
result = get_result(flatlines, model, tokenizer)


#Load the dataset.
df = pd.read_csv("healthFC_annotated.csv")
claims = df["en_claim"].tolist()
labels = df["label"].tolist()

#Store the aggregated final predictions for top 100 evidence documents, for each question.
#Rows idx*100 .. idx*100+99 of `result` belong to the idx-th claim.
num_abs = 100
with open("jsl_healthfc_aggregated_results.txt", "w") as f:

    for idx, claim in enumerate(claims):
        start_index = idx * num_abs
        end_index = start_index + num_abs
        predictions = result[start_index:end_index]

        f.write(claim + "\n")
        f.write("True label: " + str(labels[idx]))
        f.write("\n")

        #Winning class per document: 0 is counted as SUP, 1 as NEI, 2 as REF.
        winners = [int(np.argmax(pred)) for pred in predictions]
        verdicts = {
            "SUP": winners.count(0),
            "NEI": winners.count(1),
            "REF": winners.count(2),
        }

        f.write("Verdicts: " + str(verdicts) + "\n")
        f.write("Mean: " + str(np.mean(predictions, axis=0)) + "\n\n")

#Store the probabilities of the three final predictions for top 100 evidence documents, for each question.
#File layout: two lines per example -- the input text, then its probability list.
with open("jsl_healthfc_label_probabilities.txt", "w") as f:
    for idx in range(len(result)):
        fl = flatlines[idx]
        r = result[idx]
        #`.tolist()` yields plain Python floats. With NumPy >= 2, `str(list(r))`
        #prints "np.float64(...)" items, which ast.literal_eval cannot parse when
        #the file is read back.
        f.write(fl + "\n" + str(r.tolist()))
        f.write("\n")

#### Experiments with TOP K, publication year, number of citations

In [None]:
'''
Load all the publication years of all top-100 documents for each question.
'''

import ast 

#Collect all IDs of top 100 documents for each question in this dataset.
with open("results_healthfc_100.txt", "r") as f:
    lines = f.readlines()

#The file is read as records of four lines; the third line of a record
#(position % 4 == 2) is a Python list literal with the document ids.
#Each list is APPENDED so `dataset_ids` stays a list of per-question lists.
#The original used `extend` here, which flattened it to ints and made the
#flattening step below fail with "TypeError: 'int' object is not iterable".
dataset_ids = list()
for position, l in enumerate(lines):
    if position % 4 == 2:
        dataset_ids.append(ast.literal_eval(l.strip()))

#Flatten to one list of ids across all questions.
all_ids = list()
for ids in dataset_ids:
    all_ids.extend(ids)


#Load the file with all documents' metadata.
metadata = pd.read_csv("/mnt/mydrive/PubMed/pubmed_landscape_data.csv")

#Build the PMID -> Year lookup once. The original filtered the whole metadata
#table once per id. Only rows for the requested ids are kept, and for a
#duplicated PMID the first row wins, which matches the original `y[0]`.
#NOTE(review): ids are matched against the PMID column here, whereas other cells
#use ids from the same results file as row positions -- confirm which is right.
wanted = metadata[metadata.PMID.isin(all_ids)].drop_duplicates(subset="PMID")
year_by_pmid = dict(zip(wanted.PMID.tolist(), wanted.Year.tolist()))

#Create a list of all years; ids without a metadata row get year 0.
YEARS = [
    int(year_by_pmid[doc_id]) if doc_id in year_by_pmid else 0
    for doc_id in all_ids
]

YEARS[:100]

In [None]:
'''
Load the number of citations for each of the top-10 documents for each question.

Since this parameter is not available in the original metadata, we use the Semantic Scholar API to query the number of citations in their knowledge base.
'''

import requests
import ast

#Collect the claims, retrieval scores and document ids of all top 100 documents
#for each question in this dataset.
all_claims = list()
all_ids = list()
all_scores = list()
with open("results_healthfc_100.txt", "r") as f:
    lines = [line.rstrip() for line in f]

#The file is read as records of four lines: claim, score list, id list, and a
#fourth line that is ignored.
for idx, line in enumerate(lines):
    field = idx % 4
    if field == 0:
        all_claims.append(line)
    elif field == 1:
        #Appended per question, like the ids. The original re-assigned
        #`all_scores`, so only the last question's scores survived.
        all_scores.append(ast.literal_eval(line))
    elif field == 2:
        all_ids.append(ast.literal_eval(line))
        
metadata = pd.read_csv("/mnt/mydrive/PubMed/pubmed_landscape_data.csv")
pmids = metadata.PMID.tolist()

#NUM_CITATIONS receives exactly ONE entry per processed document (the first ten
#ids of every question), in order, so later cells can index it by position.
NUM_CITATIONS = list()
total = 0  #number of successful API responses
for ids in all_ids:

    for doc_id in ids[:10]:
        #`doc_id` is used as a row position into the metadata table here.
        pmid = pmids[doc_id]

        # Get the number of citations of the paper from Semantic Scholar based on its PMID.
        url = "https://api.semanticscholar.org/graph/v1/paper/PMID:" + str(pmid)
        resp = requests.get(url=url)
        data = resp.json()
        if "error" in data:
            #Paper lookup failed: record zero citations.
            NUM_CITATIONS.append(0)
            continue

        ss_pid = data["paperId"]
        total += 1

        # Each page only displays up to 100 citations.
        # If there are multiple pages, then keep going until reaching the last. Add 100 citations per page.
        total_citations = 0
        idx = 0
        count = 100
        while count == 100:
            offset = idx*100
            url = "https://api.semanticscholar.org/graph/v1/paper/" + \
                    str(ss_pid) + "/citations?fields=title,authors&offset=" + str(offset)

            resp = requests.get(url=url)
            data = resp.json()
            if "error" in data:
                #Stop paging and keep what was counted so far. The single append
                #after the loop records it. The original appended here as well,
                #which added two entries for one paper and shifted every later
                #position in NUM_CITATIONS.
                break

            count = len(data["data"])
            total_citations += count
            total += 1
            idx += 1

        NUM_CITATIONS.append(total_citations)

    #Progress report: print the number of collected counts, not the whole list.
    if len(NUM_CITATIONS)%1000==0:
        print(len(NUM_CITATIONS), "citation counts collected")

NUM_CITATIONS[:100]

In [None]:
'''
Experiments with top k, with publication year, and with citation count.
'''
import ast


#First load the prediction probabilities.
with open("jsl_bioasq_ss_finetunede6_linesprobs.txt", "r") as f:
    lines = f.readlines()

#Every second line (odd position) holds a bracketed list of probabilities.
#Runs of spaces are collapsed and commas inserted so that the text becomes a
#Python list literal that ast.literal_eval can read.
prediction_probabilities = list()
for position, raw in enumerate(lines):
    if position % 2 != 1:
        continue
    text = raw.strip()
    for run_of_spaces in ("    ", "   ", "  "):
        text = text.replace(run_of_spaces, " ")
    text = text.replace(" ", ", ").replace(",,", ", ")
    prediction_probabilities.append(ast.literal_eval(text))
    

TOP_K = 10             #Set the maximum number of evidence documents to retrieve.
MIN_YEAR = 2015        #Set the lowest cut-off publication year for retrieved articles. 
MIN_CITATIONS = 50     #Set the lowest cut-off number of citations for retrieved articles.

DOCS_PER_QUESTION = 100        #predictions (and YEARS entries) per question
CITED_DOCS_PER_QUESTION = 10   #NUM_CITATIONS was filled from the first 10 ids of each question


#For every question, count how many of its accepted documents vote for each of
#the three classes. `prediction_counts` gets one length-3 array per question.
prediction_counts = list()
for idx in range(len(prediction_probabilities)):
    all_probs = prediction_probabilities[idx]
    question_no, rank = divmod(idx, DOCS_PER_QUESTION)

    #First document of a question: store the finished counts, start new ones.
    if rank == 0:
        if idx != 0:
            prediction_counts.append(all_predictions)
        all_predictions = np.array([0, 0, 0])

    # Retrieve only the top k documents. If larger index than top_k, skip the rest of the documents.
    if rank >= TOP_K:
        continue

    #Retrieve only the documents released on or after the minimum year.
    if YEARS[idx] < MIN_YEAR:
        continue

    # Retrieve only the documents with at least the minimum number of citations.
    # NUM_CITATIONS is laid out as 10 consecutive entries per question, so the
    # entry for this document is question_no*10 + rank. The original computed
    # idx//100 + idx%10, which reads the counts of other questions' documents.
    # Ranks beyond the first 10 have no citation data and are treated as 0.
    if rank < CITED_DOCS_PER_QUESTION:
        paper_citations = NUM_CITATIONS[question_no * CITED_DOCS_PER_QUESTION + rank]
    else:
        paper_citations = 0
    if paper_citations < MIN_CITATIONS:
        continue

    verdict = np.array(all_probs).argmax()
    all_predictions[verdict] += 1

#Store the counts of the last question (nothing to store for empty input).
if len(prediction_probabilities) > 0:
    prediction_counts.append(all_predictions)

majority_votes = list()
BINARY = True
TERNARY = False

for counts in prediction_counts:

    if BINARY:
        #Two labels: compare only the class-0 and class-2 counts (entailment vs.
        #contradiction). The vote is 1 when class 0 strictly wins, otherwise 0,
        #so ties go to 0.
        majority_votes.append(1 if counts[0] > counts[2] else 0)

    elif TERNARY:
        #Three labels: take the class with the most votes among the three
        #predictions, or class 1 when no document voted at all.
        if np.sum(counts) == 0:
            majority_votes.append(1)
        else:
            majority_votes.append(np.array(counts).argmax())

majority_votes = np.array(majority_votes)



In [None]:
'''
Print the results of the experiments.
'''

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def print_scores(actual_values, predicted_values):
    """Print macro-averaged precision, recall and F1, followed by accuracy.

    Args:
        actual_values: gold labels, one per example.
        predicted_values: predicted labels, aligned with `actual_values`.
    """
    macro_metrics = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )

    # Compute every score before printing anything.
    report = [
        (caption, metric(actual_values, predicted_values, average = "macro"))
        for caption, metric in macro_metrics
    ]
    report.append(("Accuracy:", accuracy_score(actual_values, predicted_values)))

    for caption, value in report:
        print(caption, value)
    
#Score the majority votes against the gold labels.
#NOTE(review): `labels` was last assigned from healthFC_annotated.csv, while the
#probabilities above were read from jsl_bioasq_ss_finetunede6_linesprobs.txt --
#confirm that both refer to the same dataset before trusting the numbers.
predicted = majority_votes
true_labels = labels
print_scores(true_labels, predicted)

#### Experiments with TOP J

In [None]:
'''
The remaining cells focus on the experiments for running the TOP J tests.
'''

import ast
from nltk.tokenize import sent_tokenize, word_tokenize
import torch
from sentence_transformers import SentenceTransformer, util

# Load all claims and document IDs from the results file.
# The file is read as records of four lines: claim, score list, id list, and a
# fourth line that is ignored.
all_claims = list()
all_ids = list()
with open("openhealth_results.txt", "r") as f:
    lines = [line.rstrip() for line in f]

for idx, line in enumerate(lines):
    field = idx % 4
    if field == 0:
        all_claims.append(line)
    elif field == 1:
        # Parsed but not collected; only the latest value stays in `scores`.
        scores = ast.literal_eval(line)
    elif field == 2:
        all_ids.append(ast.literal_eval(line))


#Load all PubMed abstracts.
abstracts = pd.read_csv("/mnt/mydrive/PubMed/pubmed_landscape_abstracts.csv")
abstracts_text = abstracts.AbstractText.tolist()


#Load all sentences of top 20 abstracts for each claim into a big list of lists.
#`claim_sentences[i]` is the lowercased sentence pool of the i-th claim.
claim_sentences = list()
for ids in all_ids:
    all_sentences = list()

    for doc_id in ids[:20]:
        #`doc_id` is used as a row position into the abstracts list.
        abstract = abstracts_text[doc_id]
        #Lowercase only the new sentences. The original re-lowercased the whole
        #accumulated list after every document; the result is the same.
        all_sentences.extend(s.lower() for s in sent_tokenize(abstract))
    claim_sentences.append(all_sentences)
    

#Load the sentence transformer for selecting evidence sentences.
model = SentenceTransformer('copenlu/spiced')


#Find top 20 sentences for each claim. 
#Later, a subset of top-j sentences (1, 3, 5, 10...) will be chosen out of these 20 in experiments.
#k is fixed at 20, so torch.topk fails for a claim whose pool has fewer than 20 sentences.
top_sentences = list()
for idx, claim in enumerate(all_claims):
    print(idx)
    sents = claim_sentences[idx]

    sents_embeddings = model.encode(sents, convert_to_tensor=True)
    claim_embedding = model.encode(claim, convert_to_tensor=True)
    similarity = util.cos_sim(claim_embedding, sents_embeddings)[0]

    #Positions of the 20 highest-scoring sentences, as a numpy array.
    best_positions = torch.topk(similarity, k=20).indices
    top_sentences.append(best_positions.detach().cpu().numpy())


#Pick the 20 selected sentences of every claim, KEEPING the order returned by
#torch.topk (most similar first).
#The original sorted the positions with np.sort, which put the sentences back
#into document order. The later TOP_J step keeps only the first j of the 20
#entries, so after sorting it was no longer selecting the j most similar ones.
selected_sentences = list()
for idx in range(len(all_claims)):
    top = top_sentences[idx]
    sents = np.array(claim_sentences[idx])[top]

    selected_sentences.append(sents)
  
 # Create a joint list of concatenated claims and evidence, in form of "claim [SEP] evidence1 ||| evidence2 ... ||| evidence20"
#One line per claim: the claim, the separator, then every selected sentence
#followed by " ||| ". Line breaks inside a sentence become spaces.
joint_list = [
    claim_text + " [SEP] " + "".join(
        s.replace("\n", " ") + " ||| " for s in selected_sentences[idx]
    )
    for idx, claim_text in enumerate(all_claims)
]

    
#Save this in a file before the final step (one example per line).
with open("healthfc_joint20.txt", "w") as f:
    f.writelines(example + "\n" for example in joint_list)



In [None]:
### TOP J experiments


from torch.utils.data import DataLoader
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader

# Load the stored files from the previous step
# Only stripped lines that contain the separator are kept.
with open("healthfc_joint20.txt", "r") as f:
    lines = f.readlines()
flatlines = [text for text in map(str.strip, lines) if "[SEP]" in text]

#Load the lines and split the 20 sentences up: one "claim [SEP] sentence" string
#per evidence sentence, in file order.
jointlines = list()
for fl in flatlines:
    #Split on the first separator only, so a "[SEP]" inside the evidence does
    #not break the unpacking.
    claim, evs = fl.split(" [SEP] ", 1)

    #Every sentence was written with a trailing " ||| ". After the line has been
    #stripped, the last one still ends in " |||", so that is removed here.
    #The original left it attached to the final evidence sentence.
    evs = evs.strip()
    if evs.endswith("|||"):
        evs = evs[:-len("|||")].rstrip()

    for ev in evs.split(" ||| "):
        jointlines.append(claim + " [SEP] " + ev)

#Torch dataset used in the prediction pipeline.
class QADataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: mapping from field name to an indexable collection with one
            entry per example (tensors or nested lists both work).
        labels: indexable collection of labels; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with every encoding field of example `idx` plus 'labels'."""
        # torch.as_tensor reuses an existing tensor and converts lists/arrays.
        # torch.tensor(<tensor>) copy-constructs and emits a UserWarning for
        # every single item when the encodings are already tensors.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)
    
    
#Final QA answer prediction loop. 
def get_result(input_list, model, tokenizer, batch_size=16, device=None):
    """Run the classifier over `input_list` and return per-class probabilities.

    Args:
        input_list: list of input strings, one per example.
        model: sequence-classification model; called with `input_ids` and
            `attention_mask`, and element 0 of its output is used as the logits.
        tokenizer: tokenizer callable matching `model`.
        batch_size: number of examples per forward pass (default 16, as before).
        device: device to run on; defaults to "cuda" when available, else "cpu".

    Returns:
        numpy array of shape (len(input_list), 3); each row is the softmax over
        the three output classes, in the model's own label order.
    """
    if device is None:
        # The original hard-coded "cuda", which fails on machines without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # `truncation=` replaces the deprecated `truncation_strategy=` keyword.
    # Every input is a single sequence, so 'only_first' truncates that sequence
    # to the tokenizer's maximum length.
    input_encoded = tokenizer(input_list, return_tensors='pt',
                              truncation='only_first', add_special_tokens=True, padding=True)

    # Labels are not needed for inference; zeros only satisfy the dataset interface.
    input_dataset = QADataset(input_encoded, np.zeros(len(input_list)))

    test_loader = DataLoader(input_dataset, batch_size=batch_size,
                             drop_last=False, shuffle=False, num_workers=4)

    model.eval()
    model = model.to(device)
    result = np.zeros((len(test_loader.dataset), 3))
    index = 0  # row of `result` where the next batch starts
    with torch.no_grad():
        for batch_num, instances in enumerate(test_loader):
            print(batch_num)
            input_ids = instances["input_ids"].to(device)
            attention_mask = instances["attention_mask"].to(device)
            logits = model(input_ids=input_ids,
                           attention_mask=attention_mask)[0]
            probs = logits.softmax(dim=1)
            result[index : index + probs.shape[0]] = probs.cpu().numpy()
            index += probs.shape[0]

    return result


#Load the NLI model.
MODEL_NAME = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=1024)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

#Run the QA final prediction loop on the per-sentence inputs.
#`jointlines` holds one "claim [SEP] sentence" string per evidence sentence.
#The code below slices `result` in groups of 20 rows per claim and pairs row i
#with jointlines[i], so inference must run on `jointlines`. Running it on
#`flatlines` (one row per claim, as the original did) gives far too few rows.
result = get_result(jointlines, model, tokenizer)

#Load the dataset with questions (claims) and labels.
df = pd.read_csv("healthFC_annotated.csv")
claims = df["en_claim"].tolist()
labels = df["label"].tolist()


#Positions of the claims whose label is not 1 (computed, but not used below).
yesno_indices = np.where(np.array(labels) != 1)[0]
yesno_claims = claims
yesno_labels = labels

#Generate a file with results (overall verdicts for each pair of "question + evidence_sentence")
#Rows idx*20 .. idx*20+19 of `result` belong to the idx-th claim.
num_abs = 20
with open("topj_20sentences_healthfc_results.txt", "w") as f:

    for idx, claim in enumerate(yesno_claims):
        start_index = idx * num_abs
        end_index = start_index + num_abs
        predictions = result[start_index:end_index]

        f.write(claim + "\n")
        f.write("True label: " + str(yesno_labels[idx]))
        f.write("\n")

        #Winning class per sentence: 0 is counted as SUP, 1 as NEI, 2 as REF.
        winners = [int(np.argmax(pred)) for pred in predictions]
        verdicts = {
            "SUP": winners.count(0),
            "NEI": winners.count(1),
            "REF": winners.count(2),
        }

        f.write("Verdicts: " + str(verdicts) + "\n")
        f.write("Mean: " + str(np.mean(predictions, axis=0)) + "\n\n")
        

#Generate the label prediction probabilities for each of the top 20 sentences selected (so for "question + evidence_sentence" pairs).
#In the experiments, top j is set to 1, 3, 5, 10... and then only top-j sentences are selected for the majority vote.
#File layout: two lines per pair -- the input text, then its probability list.
with open("topj_20sentences_healthfc_probabilities.txt", "w") as f:
    for idx in range(len(result)):
        fl = jointlines[idx]
        r = result[idx]
        #`.tolist()` yields plain Python floats. With NumPy >= 2, `str(list(r))`
        #prints "np.float64(...)" items, which ast.literal_eval cannot parse when
        #the file is read back.
        f.write(fl + "\n" + str(r.tolist()))
        f.write("\n")

In [None]:
'''
Experiments similar to TOP K, just change some parameters in the main loop.
'''

TOP_J = 5 

#NOTE(review): `prediction_probabilities` is not loaded in this cell; it is
#whatever an earlier cell left in the kernel. Confirm it was re-loaded from the
#top-j probabilities file before running this.
prediction_counts = list()
for idx, all_probs in enumerate(prediction_probabilities):
    #Only 20 sentences per question, so positions repeat every 20 entries.
    rank = idx % 20

    #First sentence of a question: store the finished counts, start new ones.
    if rank == 0:
        if idx != 0:
            prediction_counts.append(all_predictions)
        all_predictions = np.array([0, 0, 0])

    # Do the count with only the TOP-J sentences.
    if rank < TOP_J:
        all_predictions[np.array(all_probs).argmax()] += 1

#Store the counts of the last question.
prediction_counts.append(all_predictions)

majority_votes = list()
BINARY = True
TERNARY = False

for counts in prediction_counts:

    if BINARY:
        #Two labels: compare only the class-0 and class-2 counts (entailment vs.
        #contradiction). The vote is 1 when class 0 strictly wins, otherwise 0,
        #so ties go to 0.
        majority_votes.append(1 if counts[0] > counts[2] else 0)

    elif TERNARY:
        #Three labels: take the class with the most votes among the three
        #predictions, or class 1 when no sentence voted at all.
        if np.sum(counts) == 0:
            majority_votes.append(1)
        else:
            majority_votes.append(np.array(counts).argmax())

majority_votes = np.array(majority_votes)

#Score the majority votes against the gold labels.
predicted = majority_votes
true_labels = labels
print_scores(true_labels, predicted)
