In [1]:
import pandas as pd
import numpy as np
#import torch
import os
from tqdm.auto import tqdm
import json
import gzip

#Pyserini search
from pyserini.search import pysearch

#XML parsing
import xml.etree.ElementTree as ET

#TREC_EVAL
from trectools import misc, TrecRun, TrecQrel, procedures

In [35]:
# Data locations.  The defaults are the original machine-specific folders; set the
# PYSERINI_INDEX_DIR / TREC_COVID_ROOT environment variables to run the notebook elsewhere
# without editing the code.
Pyserini_files = os.environ.get(
    'PYSERINI_INDEX_DIR',
    r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\Pyserini_Lucene_CORD_index')
COVID_INDEX_full_text = os.path.join(Pyserini_files, 'lucene-index-covid-full-text-2020-04-10')
COVID_INDEX_paragraph = os.path.join(Pyserini_files, 'lucene-index-covid-paragraph-2020-04-10')
TREC_COVID_root = os.environ.get(
    'TREC_COVID_ROOT',
    r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID')

#Load CSV
R1_topics = pd.read_csv(os.path.join(Pyserini_files, 'Round1_Topics.csv'))

# Fail here, with a clear message, rather than deep inside a later cell: the query-building
# cells index these four columns by name.
_missing_topic_columns = {'Topic', 'Query', 'Question', 'Narrative'} - set(R1_topics.columns)
if _missing_topic_columns:
    raise KeyError('Round1_Topics.csv is missing columns: ' + ', '.join(sorted(_missing_topic_columns)))

In [36]:
# Open a searcher over the full-text Lucene index.
full_searcher = pysearch.SimpleSearcher(COVID_INDEX_full_text)

# Ranking configuration.
# NOTE(review): both of the next two calls configure the searcher's similarity function.
# Presumably the later LM-Dirichlet call replaces the BM25 settings, leaving k1/b without
# effect -- confirm against the Pyserini SimpleSearcher documentation before changing either,
# because removing or reordering these lines may alter the retrieved ranking.
full_searcher.set_bm25_similarity(k1=1.5, b=0.4)
full_searcher.set_lm_dirichlet_similarity(mu = 2000)
# Turn on the RM3 reranker with 20 feedback terms, 10 feedback documents and a 0.7 weight
# on the original query.
full_searcher.set_rm3_reranker(fb_terms=20, fb_docs=10, original_query_weight=0.7)

In [4]:
#Query design
from pyserini.analysis.pyanalysis import get_lucene_analyzer, Analyzer
import nltk
from nltk.corpus import stopwords

#Stopwords for tokenization - manual review
# NOTE(review): the capitalised entries ('Studies', 'Stud', ...) and the multi-word entry
# 'severe acute respiratory syndrome' are compared against individual analyzer tokens below;
# if the analyzer emits lower-cased single terms (presumably it does) they never match -- confirm.
stopwords_manual = [
    'seek', 'seeking', 'look', 'looking', 'studies', 'study', 'information',
    'about', 'range', 'studies', 'its', 'coronaviru',
    # 'will' was previously written as ' will' (leading space), which can never equal a token.
    'other', '2', '19', 'well', 'will', 'from', 'have', 'more', 'covid', 'any', 'what',
    'should', 'may', 'due', 'help', 'non', 's', 'those', 'people', 'ways', 'all', 'gain',
    'possible', 'toward', 'specifically', 'learned', 'number', 'proportion', 'including',
    'etc', 'still', 'while', 'human', 'specific', 'result', 'results', 'assess', 'need',
    'between', 'take', 'taking', 'patient', 'type', 'cause', 'frequency', 'less', 'face',
    'likely', 'infect', 'upon', 'develop', 'represent', 'promising', 'step', 'related',
    'papers', 'describe', 'also', 'relevant', 'who', 'show', 'science', 'basic', 'complete',
    'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has', 'approach',
    'Studies', 'Stud', 'Inst', 'Divi', 'Thomae',
    'Brigham', 'Young', 'Univ', 'studies', 'volition', 'severe acute respiratory syndrome',
    'affect', 'affected',
    'focus', 'discuss', 'speculative', 'must', 'include', 'draw', 'everyday', 'person',
    'another', 'everyday', 'kind',
]

# NOTE: this rebinds the name `stopwords` from the NLTK corpus reader to a plain list of
# English stopwords.  The rebinding is kept so that any later use of `stopwords` still sees
# the list; the import above restores the corpus reader whenever this cell is re-run.
stopwords = list(set(stopwords.words('english')))
stopwords_manual = list(np.append(stopwords_manual, stopwords))


def _tokenize_topic_field(texts, analyzer, stopword_set):
    """Analyze each text and return one list of unique, non-stopword tokens per text.

    Tokens keep the order of their first appearance, so the generated query strings are the
    same on every run.  (De-duplicating through ``set`` yields an order that changes between
    kernel sessions because of string-hash randomisation.)
    """
    token_lists = []
    for text in texts:
        kept = [tok for tok in analyzer.analyze(text) if tok not in stopword_set]
        token_lists.append(list(dict.fromkeys(kept)))
    return token_lists


# Build the analyzer and the stopword lookup once instead of once per topic.
_krovetz_analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
# np.append produced numpy string scalars; normalise to plain str for the set lookup.
_stopword_lookup = {str(word) for word in stopwords_manual}

#Extract important narrative text
token_narrative_list = _tokenize_topic_field(R1_topics['Narrative'], _krovetz_analyzer, _stopword_lookup)

#Extract important question text
token_question_list = _tokenize_topic_field(R1_topics['Question'], _krovetz_analyzer, _stopword_lookup)

#Anserini searcher can take both query and keywords
keywords_list2 = '2019-nCoV, SARS-CoV-2, COVID-19'
keywords_list = 'COVID-19'

In [5]:
#Manual keywords
# Hand-picked extra query terms, keyed by 1-based position.  The search cell looks them up
# by the topic row's position, so entry N is appended to the query built for the N-th row
# of the topics table.  An empty string means "no extra terms".
_manual_by_position = {
    1: 'originated',
    2: 'temperature, humidity',
    3: 'cross-reactive, crossprotective, cross immunity, specific antibody response, neutralize, adaptive immunity',
    4: 'harms, mortality, dead, risk factors',
    5: 'non-human',
    6: 'identification, detection, PCR, point of care',
    7: 'serology, antigen, antibodies',
    8: 'ascertainment, underestimate',
    9: 'cov, canadian',
    10: 'home, restriction, social-distancing',
    11: 'resources, stratification',
    12: 'closure, restricted, movement, gathering',
    13: 'source, route, transmitted',
    14: 'super-spreader',
    15: 'aerosol, contact, droplet',
    16: 'live, days, contaminated',
    17: 'randomized, randomised, controlled',
    18: 'personal, protective, equipment, PPE, face',
    19: 'hygiene, alcohol-based',
    20: 'ARBs, blocker',
    21: 'death',
    22: 'coronary',
    23: 'blood, pressure',
    24: 'mellitus',
    25: '',
    26: 'onset, new, presentation',
    27: 'SARS-CoV-2, 2019-nCoV',
    28: 'chloroquine ',
    29: 'binding',
    30: '',
}

# Flatten to the positional list the rest of the notebook indexes into.
manual = [_manual_by_position[position] for position in sorted(_manual_by_position)]

In [37]:
#Extract search results from the searcher
def get_search_res_list(index_searcher, n_papers, paragraph=False):
    """Run one expanded query per topic and flatten all hits into parallel lists.

    Reads the notebook-level ``R1_topics``, ``token_narrative_list``, ``token_question_list``,
    ``manual`` and ``keywords_list``; the three token/keyword lists are indexed by the topic
    row's position.

    Parameters
    ----------
    index_searcher
        Object exposing ``search(q=..., k=...)`` whose hits provide ``docid``, ``score`` and
        ``lucene_document.get(field)``.
    n_papers : int
        Number of hits requested per topic (multiplied by 15 when ``paragraph`` is set).
    paragraph : bool, default False
        When true, more hits are requested and each docid is cut at its first ``'.'``.

    Returns
    -------
    tuple of six lists
        ``(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list)``, one
        entry per hit.  Ranks are 1-based and stored as strings.
    """
    docid_list, rank_list, score_list = [], [], []
    topic_id_list, title_list, doi_list = [], [], []

    #Search more papers if searching the paragraph index
    hits_per_topic = n_papers * 15 if paragraph else n_papers

    #Build one query per topic row and search the index with it
    topic_rows = zip(R1_topics['Topic'], R1_topics['Query'])
    for pos, (topic_num, query) in enumerate(topic_rows):
        token_topic = ', '.join(token_narrative_list[pos])
        token_question = ','.join(token_question_list[pos])
        manual_query = manual[pos]
        input_query = query + '. ' + token_question + '. ' +  token_topic + ', ' + manual_query + ' . ' + keywords_list

        hits = index_searcher.search(q = input_query, k=hits_per_topic)
        print(topic_num)
        # Walk the hits that actually came back: the searcher may return fewer than requested,
        # and indexing a fixed range(n_papers) would then raise IndexError.
        for rank, hit in enumerate(tqdm(hits, position = 0, leave = True), start=1):
            topic_id_list.append(topic_num)
            if paragraph:
                docid_list.append(str(hit.docid).split('.')[0])
            else:
                docid_list.append(hit.docid)
            rank_list.append(str(rank))
            score_list.append(hit.score)
            title_list.append(hit.lucene_document.get("title"))
            # NOTE(review): when the document has no "doi" value this yields the literal
            # 'https://doi.org/None'; kept unchanged because the saved file is read downstream.
            doi_list.append('https://doi.org/' + str(hit.lucene_document.get("doi")))

    return topic_id_list, docid_list, rank_list, score_list, title_list, doi_list

In [38]:
#Search extra - will drop excess documents later since the index has duplicates
n_papers = 3000

# Query the full-text index for every topic; the result is six parallel per-hit lists
# (topic, docid, rank, score, title, doi).
_full_search_lists = get_search_res_list(full_searcher, n_papers, paragraph=False)
(full_topic, full_docid, full_rank,
 full_score, full_title, full_doi) = _full_search_lists
     

1


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [44]:
#Make the dataframe TREC_EVAL compliant
#Make dataframe from lists generated from search
def TREC_df(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, run_param):
    """Assemble per-hit search lists into a de-duplicated, re-ranked run DataFrame.

    Parameters
    ----------
    topic_id_list, docid_list, rank_list, score_list, title_list, doi_list : list
        Parallel lists with one entry per retrieved hit.  ``rank_list`` only seeds the
        ``rank`` column; ranks are recomputed after duplicates are removed.
    run_param : str
        Run tag, repeated on every row in the ``qid`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic, q0, docid, rank, score, title, doi, qid`` in input row order, with
        at most one row per ``(topic, docid)`` pair and integer ranks ``1..n`` per topic.
    """
    n_rows = len(topic_id_list)
    column_order = ['topic', 'q0', 'docid', 'rank', 'score', 'title', 'doi', 'qid']

    df = pd.DataFrame({
        'topic': topic_id_list,
        'q0': ['q0'] * n_rows,
        'docid': docid_list,
        'rank': rank_list,
        'score': score_list,
        'title': title_list,
        'doi': doi_list,
        'qid': [run_param] * n_rows,  #Run-tag for TREC run requirements
    })[column_order]

    #Remove duplicates: keep the first occurrence of each document within a topic
    df = df.drop_duplicates(subset=['topic', 'docid'], keep='first').reset_index(drop=True)

    #Re-rank by descending score within each topic.
    # method='first' breaks ties by row order, giving unique consecutive integer ranks.  The
    # default ('average') assigns fractional ranks to ties, which the integer cast would
    # truncate into duplicated and skipped ranks.
    df['rank'] = (
        df.groupby('topic')['score']
        .rank(method='first', ascending=False)
        .astype(int)
    )

    #TODO: Temporary condition (currently disabled): cut down to rank <= 2000 per topic for BERT.
    # The title and doi columns are kept in the returned frame; an earlier, disabled step
    # reduced it to topic, q0, docid, rank, score, qid for submission.
    return df

In [45]:
# Build the run table for the full-text search; 'R1_BERT' is the run tag written on every row.
full_df = TREC_df(
    topic_id_list=full_topic,
    docid_list=full_docid,
    rank_list=full_rank,
    score_list=full_score,
    title_list=full_title,
    doi_list=full_doi,
    run_param='R1_BERT',
)

In [46]:
#Save output
# Derive the folder from TREC_COVID_root (set in the configuration cell) instead of repeating
# the absolute path, and create it together with any missing parents.  exist_ok avoids the
# separate existence check and the failure when the folder already exists.
results_folder = os.path.join(TREC_COVID_root, 'Results_Before_BERT')
os.makedirs(results_folder, exist_ok=True)

# The file is written WITH a header row and WITH the title/doi columns.
# NOTE(review): that is not a plain six-column TREC run file, yet trectools_eval reads every
# *.txt file in this folder -- confirm the evaluator handles this layout, or write the
# header-less six-column variant separately before evaluating.
full_df.to_csv(os.path.join(results_folder, 'R1_forBert.txt'), sep=' ', index=False, header=True)

In [22]:
def trectools_eval(res_fol, qrels_file):
    """Score every ``*.txt`` run file in ``res_fol`` against ``qrels_file`` and print a table.

    The printed table has one row per run with the columns Run, P@5, P@10, MAP and Bpref,
    sorted by ascending Bpref.  Nothing is returned.

    Parameters
    ----------
    res_fol : str
        Folder searched for run files matching ``*.txt``.
    qrels_file : str
        Path of the relevance-judgment file handed to ``TrecQrel``.
    """
    judgments = TrecQrel(qrels_file)

    # Load all run files from the folder and evaluate them against the judgments.
    run_objs = procedures.list_of_runs_from_path(res_fol, "*.txt")
    evaluated = procedures.evaluate_runs(run_objs, judgments, per_query=True)

    # One extraction per metric; element [1] of each returned entry is used as the value.
    per_metric = {
        trec_name: procedures.extract_metric_from_results(evaluated, trec_name)
        for trec_name in ("P_5", "P_10", "bpref", 'map')
    }

    n_runs = len(run_objs)

    def metric_column(trec_name):
        # Values for one metric, in the same order as run_objs.
        return [per_metric[trec_name][idx][1] for idx in range(n_runs)]

    # Run label = file name up to its first dot.
    summary = pd.DataFrame({
        'Run': [os.path.basename(str(run)).split('.')[0] for run in run_objs],
        'P@5': metric_column("P_5"),
        'P@10': metric_column("P_10"),
        'MAP': metric_column('map'),
        'Bpref': metric_column("bpref"),
    })

    print(summary.sort_values('Bpref'))

In [42]:
# Evaluate every run file in the results folder against the Round 1 relevance judgments.
# Both paths are derived from TREC_COVID_root (set in the configuration cell) so the
# notebook has a single place to change when the data lives elsewhere.
trectools_eval(res_fol=os.path.join(TREC_COVID_root, 'Results_Before_BERT'),
               qrels_file=os.path.join(TREC_COVID_root, 'Round_1_Results', 'qrels-1.txt'))

Found 4 runs in path C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Results_Before_BERT
            Run       P@5      P@10       MAP     Bpref
0   abstract_R1  0.493333  0.463333  0.220424  0.425081
2  paragraph_R1  0.566667  0.466667  0.265508  0.444095
1       full_R1  0.700000  0.626667  0.277332  0.448437
3    R1_forBert  0.746667  0.683333  0.327621  0.530201
