In [2]:
import pandas as pd
import numpy as np
import torch
import os
from tqdm.auto import tqdm
import json
from datetime import datetime

#Pyserini search
from pyserini.search import SimpleSearcher, SimpleFusionSearcher
from jnius import autoclass

#XML parsing
import xml.etree.ElementTree as ET

#TREC_EVAL
from trectools import misc, TrecRun, TrecQrel, procedures, fusion


In [3]:
#Code can start here
Pyserini_files = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\Pyserini_Lucene_CORD_index'
TREC_COVID_root = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID'

#Round 2 indexes
R3_abstract = os.path.join(Pyserini_files, 'lucene-index-cord19-abstract-2020-05-19')
R3_fulltext = os.path.join(Pyserini_files, 'lucene-index-cord19-full-text-2020-05-19')
R3_paragraphs = os.path.join(Pyserini_files, 'lucene-index-cord19-paragraph-2020-05-19')
with open(os.path.join(Pyserini_files, 'docids-rnd3.txt')) as f:
    R3_valid = f.read().splitlines()
    
#Load R1 qrels and extract docids searched already for each topic
qrels_file1 = os.path.join(TREC_COVID_root, 'Round_1_Results', 'qrels-1.txt')
qrels_file2 = os.path.join(TREC_COVID_root, 'Round_2_Results', 'qrels-rnd2.txt')
qrels_table1 = pd.read_csv(qrels_file1, sep = ' ', header = None)
qrels_table2 = pd.read_csv(qrels_file2, sep = ' ', header = None)
qrels_table = qrels_table1.append(qrels_table2, ignore_index=True)
qrels_table.columns = ['Topic', 'Q0' , 'Unnamed', 'docid', 'relevance']
qrels_table.drop('Unnamed', axis=1, inplace=True)

In [4]:
#XML files for TREC-COVID rounds
Topics = os.path.join(Pyserini_files, 'topics-rnd3.xml')
Tree = ET.parse(Topics)

Root = Tree.getroot()

topicid = []
query = []
question = []
narrative = []

for child in Root.iter():
    tag =child.tag
    text = child.text
    attrib = child.attrib
    if (tag == 'topic'):
        topicid.append(attrib['number'])
    if (tag == 'query'):
        query.append(text)
    if (tag == 'question'):
        question.append(text)
    if (tag == 'narrative'):
        narrative.append(text)

#Join to CSV
Round_dict  = {'Topic':topicid, 'Query':query, 'Question':question , 'Narrative':narrative}
Round_df = pd.DataFrame(Round_dict)
Round_df = Round_df[['Topic', 'Query', 'Question', 'Narrative']]
Round_df.to_csv(os.path.join(Pyserini_files, 'Round3_Topics.csv'), sep = ' ', header = True)
Round_df

Unnamed: 0,Topic,Query,Question,Narrative
0,1,coronavirus origin,what is the origin of COVID-19,seeking range of information about the SARS-Co...
1,2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,seeking range of information about the SARS-Co...
2,3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,seeking studies of immunity developed due to i...
3,4,how do people die from the coronavirus,what causes death from Covid-19?,Studies looking at mechanisms of death from Co...
4,5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,Papers that describe the results of testing d...
5,6,coronavirus test rapid testing,what types of rapid testing for Covid-19 have ...,Looking for studies identifying ways to diagno...
6,7,serological tests for coronavirus,are there serological tests that detect antibo...,Looking for assays that measure immune respons...
7,8,coronavirus under reporting,how has lack of testing availability led to un...,Looking for studies answering questions of imp...
8,9,coronavirus in Canada,how has COVID-19 affected Canada,"seeking data related to infections (confirm, s..."
9,10,coronavirus social distancing impact,has social distancing had an impact on slowing...,seeking specific information on studies that h...


In [5]:
#Query design
from pyserini.analysis import Analyzer, get_lucene_analyzer
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

#Stopwords for tokenization - manual review
stopwords_manual = ['seek', 'seeking', 'look', 'looking', 'studies', 'study', 'information', 
             'about', 'range', 'studies', 'its', 'coronaviru',
            'other', '2', '19', 'well', ' will', 'from', 'have', 'more', 'covid', 'any', 'what', 
            'should', 'may', 'due', 'help', 'non', 's', 'those', 'people', 'ways', 'all', 'gain',
            'possible', 'toward', 'specifically', 'learned', 'number', 'proportion', 'including',
            'etc', 'still', 'while', 'human', 'specific', 'result', 'results', 'assess', 'need',
            'between', 'take', 'taking', 'patient', 'type', 'cause' ,'frequency', 'less', 'face',
            'likely', 'infect', 'upon', 'develop', 'represent', 'promising', 'step', 'related',
            'papers', 'describe', 'also', 'relevant', 'who', 'show', 'science', 'basic', 'complete',
            'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has', 'approach', 'Studies', 'Stud', 'Inst', 'Divi' ,'Thomae',
            'Brigham', 'Young', 'Univ', 'studies', 'volition', 'severe acute respiratory syndrome', 'affect', 'affected',
            'focus', 'discuss', 'speculative', 'must', 'include', 'draw', 'everyday', 'person', 'another', 'everyday', 'kind',
            'comparison', 'direct', 'previous', 'articles', 'among', 'ncov', 'mechanism', 'implication', 'anti',
            'potential', 'occur', 'often','pathogenesi', 'observe', 'provide', 'em', '2019']

stopwords = list(set(stopwords.words('english')))
stopwords_manual = list(np.append(stopwords_manual, stopwords))

token_narrative_list = []

#Extract important narrative text
for i in range(len(Round_df)):
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    tokens = analyzer.analyze(Round_df['Narrative'][i])
    for i, word in enumerate(tokens):
        if ('exclud' in word):
            tokens = tokens[:i]
            break
    #Remove stopwords and duplicates from token
    tokens = [w for w in tokens if not w in stopwords_manual]
    tokens = list(set(tokens))
    #Remove all tokens after exclude
    
    token_narrative_list.append(tokens)

#Tokenize question
token_question_list = []

#Extract important question text - NOT USED YET
for i in range(len(Round_df)):
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    tokens = analyzer.analyze(Round_df['Question'][i])
    #Remove stopwords and duplicates from token
    tokens = [w for w in tokens if not w in stopwords_manual]
    tokens = list(set(tokens))
    token_question_list.append(tokens)

#Anserini searcher can take both query and keywords
#keywords_list = '2019-nCoV, SARS-CoV-2, COVID-19'
keywords_list = 'COVID-19'

#Manual keywords
manual = [
            'originated', #1
            'temperature, humidity',
            'cross-reactive, crossprotective, cross immunity, specific antibody response, neutralize, adaptive immunity',
            'harms, mortality, dead, risk factors',
            'non-human',
            'identification, detection, PCR, point of care',
            'serology, antigen, antibodies',
            'ascertainment, underestimate',
            'cov, canadian',
            'home, restriction, social-distancing', #10
            'resources, stratification',
            'closure, restricted, movement, gathering',
            'source, route, transmitted',
            'super-spreader',
            'aerosol, contact, droplet',
            'live, days, contaminated',
            'randomized, randomised, controlled',
            'personal, protective, equipment, PPE, face',
            'hygiene, alcohol-based',
            'ARBs, blocker', #20
            'death',
            'coronary',
            'blood, pressure',
            'mellitus',
            '',
            'onset, new, presentation',
            'SARS-CoV-2, 2019-nCoV',
            'chloroquine ',
            'binding',
            '', #30
            '', #31
            'type', #32
            '', #33
            'long-term, survivors', #34
            '', #35
            'S-protein, S Protein',
            '',
            'pathogenesis, immune',
            'treatment',
            '',
]

[nltk_data] Downloading package stopwords to C:\Users\Jimmy
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#Create SimpleSearcher for full text and paragraph indexes - Do this for Round 3 data
abstract_searcher = SimpleSearcher(R3_abstract)
full_searcher = SimpleSearcher(R3_fulltext)
paragraph_searcher = SimpleSearcher(R3_paragraphs)

#Tuned hyperparameters
abstract_searcher.set_bm25(k1=1.5, b=0.4)
abstract_searcher.set_qld(mu = 2000)
abstract_searcher.set_rm3(fb_terms=20, fb_docs=10, original_query_weight=0.7) #Pseudo-reranker

full_searcher.set_bm25(k1=1.5, b=0.4)
full_searcher.set_qld(mu = 2000)
full_searcher.set_rm3(fb_terms=20, fb_docs=10, original_query_weight=0.7)

paragraph_searcher.set_bm25(k1=1.5, b=0.4)
paragraph_searcher.set_qld(mu = 2000)
paragraph_searcher.set_rm3(fb_terms=20, fb_docs=10, original_query_weight=0.7)

In [96]:
input_queries = []

for ii, row in Round_df.iterrows():
    query = Round_df['Query'][ii]
    question = Round_df['Question'][ii]
    topic_num = Round_df['Topic'][ii]
    token_topic = ', '.join(token_narrative_list[ii])
    token_question = ','.join(token_question_list[ii])
    manual_query = manual[ii] 
    input_query = query + '. ' + token_question + '. ' +  token_topic + ', ' + manual_query + ' . ' + keywords_list 
    input_queries.append(input_query)

#Save
with open(r"C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Search Queries\Round3_QQNTokens.txt", "w") as outfile:
    outfile.write("\n".join(input_queries))

In [8]:
#Extract search results from the searcher
def get_search_res_list(index_searcher, n_papers, paragraph = False):
    docid_list = []
    rank_list = []
    score_list = []
    topic_id_list = []
    title_list = []
    doi_list = []
    publish_list = []
    
    if (paragraph == True):
        n_papers = n_papers * 20
    
    #Search all 3 indices (abstract, full text, and paragraphs) using question as a query
    for ii, row in Round_df.iterrows():
        query = Round_df['Query'][ii]
        question = Round_df['Question'][ii]
        topic_num = Round_df['Topic'][ii]
        token_topic = ', '.join(token_narrative_list[ii])
        token_question = ','.join(token_question_list[ii])
        manual_query = manual[ii] 
        input_query = query + '. ' + token_question + '. ' +  token_topic + ', ' + manual_query + ' . ' + keywords_list 
        
        hits = index_searcher.search(q = input_query, k=n_papers)
        print(topic_num)
        #Each key is a qid, value is the anserini search list
        for i in tqdm(range(0, n_papers), position = 0, leave = True):
            topic_id_list.append(topic_num)
            if (paragraph == True):
                docid_list.append(str(hits[i].docid).split('.')[0])
            else:
                docid_list.append(hits[i].docid)
            rank_list.append(str(i+1))
            score_list.append(hits[i].score)
            title_list.append(hits[i].lucene_document.get("title"))
            doi_list.append('https://doi.org/' + str(hits[i].lucene_document.get("doi")))  
            
            #Get published date
            doc_json = json.loads(hits[i].raw)
            metadata = json.loads(json.dumps(doc_json['csv_metadata']))
            publish_time_str = metadata['publish_time']
            if len(publish_time_str) > 0:
                try:
                    publish_time = datetime.strptime(publish_time_str, '%Y-%m-%d')
                except:
                    publish_time = datetime.strptime(publish_time_str, '%Y')
            else:
                publish_time = ''
            
            publish_list.append(publish_time)

    return topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, publish_list

In [9]:
#Search 2000 so that we have room to remove up to 1000 docs from Round 1
#Search extra - will drop duplicates and will use extra for BERT reranking
n_papers = 1200

#Each key is a qid, value is the anserini search list
full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish = get_search_res_list(full_searcher, n_papers)
paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish = get_search_res_list(paragraph_searcher, n_papers, paragraph = True)
abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish = get_search_res_list(abstract_searcher, n_papers)


1


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


1


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))


1


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=1200.0), HTML(value='')))




In [79]:
#Make dataframe from lists generated from search
def TREC_df(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, publish_list, run_param, date_param, drop_1000, drop_dups):
    #Run-tag for TREC run requirements
    Q0 = ['q0'] * len(topic_id_list) 
    qid = [run_param] * len(topic_id_list)

    df  = {'topic': topic_id_list , 'q0':Q0, 'docid':docid_list, 'rank':rank_list, 
                                 'score':score_list, 'title': title_list, 'doi':doi_list,'date':publish_list, 'qid':qid}
    df = pd.DataFrame(df)
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'title', 'doi', 'date', 'qid']]
    
    #Filter by time
    if (date_param == True):
        df = df[df['date'].dt.year >= 2020]
    
    #Remove duplicates
    if (drop_dups == True):
        df.drop_duplicates(subset=['topic', 'docid'], keep='first', inplace = True)
        df.reset_index(drop=True, inplace=True) #Reset indexes to match row
    
    #Drop judged docids in R1 + R2
    index_list = []
    qrel_topics = qrels_table.groupby('Topic')   
    for topic, group in tqdm(qrel_topics, position=0, leave = True):
        #Get all docids of that topic from qrels
        topic_docid = list(group['docid'])
        for i in range(len(df)):
            if(topic == int(df['topic'][i]) and df['docid'][i] in topic_docid):
                index_list.append(i)
    
    df.drop(df.index[index_list], inplace=True)
    
    #Make sure all docids are valid
    df.reset_index(inplace = True)
    index_list = []
    for i in range(len(df)):
        if(df['docid'][i] not in R3_valid):
            index_list.append(i)
    
    df.drop(df.index[index_list], inplace=True)
    #Re-rank
    df['rank'] = df.groupby('topic')['score'].rank(ascending=False)
    df['rank'] = df['rank'].astype(int)
    
    #For each topic, save up to 1000 searches (since we drop duplicates)
    if (drop_1000 == True):
        df = df[df['rank'] <= 1000] 
    #Reset index
    df.reset_index(drop=True, inplace=True)
    
    #Get columns for submission
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'qid']]
    
    return df

In [11]:
#Make dataframe from lists generated from search
#Instead of grouping qrels, just remove any qrels t
def TREC_df_v2(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, publish_list, run_param, date_param, drop_1000, drop_dups):
    #Run-tag for TREC run requirements
    Q0 = ['q0'] * len(topic_id_list) 
    qid = [run_param] * len(topic_id_list)

    df  = {'topic': topic_id_list , 'q0':Q0, 'docid':docid_list, 'rank':rank_list, 
                                 'score':score_list, 'title': title_list, 'doi':doi_list,'date':publish_list, 'qid':qid}
    df = pd.DataFrame(df)
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'title', 'doi', 'date', 'qid']]
    
    #Filter by time
    if (date_param == True):
        df = df[df['date'].dt.year >= 2020]
    
    #Remove duplicates
    if (drop_dups == True):
        df.drop_duplicates(subset=['topic', 'docid'], keep='first', inplace = True)
        df.reset_index(drop=True, inplace=True) #Reset indexes to match row
    
    #Drop judged docids in R1 + R2
    index_list = []
    judged_docids = qrels_table['docid'].tolist()
    for i in range(len(df)):
        if(df['docid'][i] in judged_docids):
            index_list.append(i)
    
    df.drop(df.index[index_list], inplace=True)
    
    #Make sure all docids are valid
    df.reset_index(inplace = True)
    index_list = []
    for i in range(len(df)):
        if(df['docid'][i] not in R3_valid):
            index_list.append(i)
    
    df.drop(df.index[index_list], inplace=True)
    #Re-rank
    df['rank'] = df.groupby('topic')['score'].rank(ascending=False)
    df['rank'] = df['rank'].astype(int)
    
    #For each topic, save up to 1000 searches (since we drop duplicates)
    if (drop_1000 == True):
        df = df[df['rank'] <= 1000] 
    #Reset index
    df.reset_index(drop=True, inplace=True)
    
    #Get columns for submission
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'qid']]
    
    return df

In [17]:
#v2 dfs
full_df_v2 = TREC_df_v2(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullTxt_R3_Orig', False, True, True)
full_df_time_v2 = TREC_df_v2(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullTxt_R3_Time', True, True, True)
paragraph_df_v2 = TREC_df_v2(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'paragraphTxt_R3_Orig', False, True, True)
paragraph_df_time_v2 = TREC_df_v2(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'paragraphTxt_R3_Time', True, True, True)
abstract_df_v2 = TREC_df_v2(abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish, 'abstractTxt_R3_Orig', False, True, True)
abstract_df_time_v2 = TREC_df_v2(abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish, 'abstractTxt_R3_Time', True, True, True)

In [59]:
full_df = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullTxt_R3_Orig', False, False, True)
full_df_time = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullTxt_R3_Time', True, False, True)
paragraph_df = TREC_df(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'paragraphTxt_R3_Orig', False, False, True)
paragraph_df_time = TREC_df(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'paragraphTxt_R3_Time', True, False, True)
abstract_df = TREC_df(abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish, 'abstractTxt_R3_Orig', False, False, True)
abstract_df_time = TREC_df(abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish, 'abstractTxt_R3_Time', True, False, True)

HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




In [71]:
full_df_1000 = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'OHSU_FullTxt_R3_Orig', False, True, True)
full_df_time_1000 = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'OHSU_FullTxt_R3_Time', True, True, True)
paragraph_df_1000 = TREC_df(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'OHSU_paragraphTxt_R3_Orig', False, True, True)
paragraph_df_time_1000 = TREC_df(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'OHSU_paragraphTxt_R3_Time', True, True, True)
abstract_df_1000 = TREC_df(abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish, 'OHSU_abstractTxt_R3_Orig', False, True, True)
abstract_df_time_1000 = TREC_df(abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish, 'OHSU_abstractTxt_R3_Time', True, True, True)

HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




In [72]:
Results_folder = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs'
if(os.path.exists(Results_folder) == False):
    os.mkdir(Results_folder)
    
full_df.to_csv(os.path.join(Results_folder, 'FullTxt.txt'), sep=' ', index=False, header=None)
full_df_time.to_csv(os.path.join(Results_folder, 'FullTxt_time.txt'), sep=' ', index=False, header=None)

paragraph_df.to_csv(os.path.join(Results_folder, 'ParaTxt.txt'), sep=' ', index=False, header=None)
paragraph_df_time.to_csv(os.path.join(Results_folder, 'ParaTxt_time.txt'), sep=' ', index=False, header=None)

abstract_df.to_csv(os.path.join(Results_folder, 'AbstractTxt.txt'), sep=' ', index=False, header=None)
abstract_df_time.to_csv(os.path.join(Results_folder, 'AbstractTxt_time.txt'), sep=' ', index=False, header=None)

full_df_time_1000.to_csv(os.path.join(Results_folder, 'FullTxt_1000.txt'), sep=' ', index=False, header=None)
full_df_time_1000.to_csv(os.path.join(Results_folder, 'FullTxt_time_1000.txt'), sep=' ', index=False, header=None)

paragraph_df_1000.to_csv(os.path.join(Results_folder, 'ParaTxt_1000.txt'), sep=' ', index=False, header=None)
paragraph_df_time_1000.to_csv(os.path.join(Results_folder, 'ParaTxt_time_1000.txt'), sep=' ', index=False, header=None)

abstract_df_1000.to_csv(os.path.join(Results_folder, 'AbstractTxt_1000.txt'), sep=' ', index=False, header=None)
abstract_df_time_1000.to_csv(os.path.join(Results_folder, 'AbstractTxt_time_1000.txt'), sep=' ', index=False, header=None)



In [22]:
#For v2 runs
Results_folder = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs_v2'
if(os.path.exists(Results_folder) == False):
    os.mkdir(Results_folder)
    
full_df_v2.to_csv(os.path.join(Results_folder, 'FullTxt_v2.txt'), sep=' ', index=False, header=None)
full_df_time_v2.to_csv(os.path.join(Results_folder, 'FullTxt_time_v2.txt'), sep=' ', index=False, header=None)

paragraph_df_v2.to_csv(os.path.join(Results_folder, 'ParaTxt_v2.txt'), sep=' ', index=False, header=None)
paragraph_df_time_v2.to_csv(os.path.join(Results_folder, 'ParaTxt_time_v2.txt'), sep=' ', index=False, header=None)

abstract_df_v2.to_csv(os.path.join(Results_folder, 'AbstractTxt_v2.txt'), sep=' ', index=False, header=None)
abstract_df_time_v2.to_csv(os.path.join(Results_folder, 'AbstractTxt_time_v2.txt'), sep=' ', index=False, header=None)

In [15]:
#Perform fusion on 3 df
def fuse_runs(path1, path2, path3, output_name, max_docs = 1000):
    r1 = TrecRun(path1)
    r2 = TrecRun(path2)
    r3 = TrecRun(path3)

    # Easy way to create new baselines by fusing existing runs:
    fused_run = fusion.reciprocal_rank_fusion([r1, r2, r3], max_docs=max_docs)
    
    fused_run.print_subset(os.path.join(Results_folder, str(output_name) + '.txt'), topics=fused_run.topics())

    return os.path.join(Results_folder, str(output_name) + '.txt')

#### Perform fusion on 3 df
def fuse_runs2(path1, path2, path3,path4, path5, path6, output_name, max_docs = 1000):
    r1 = TrecRun(path1)
    r2 = TrecRun(path2)
    r3 = TrecRun(path3)
    r4 = TrecRun(path4)
    r5 = TrecRun(path5)
    r6 = TrecRun(path6)

    # Easy way to create new baselines by fusing existing runs:
    fused_run = fusion.reciprocal_rank_fusion([r1, r2, r3], max_docs=max_docs)
    
    fused_run.print_subset(os.path.join(Results_folder, str(output_name) + '.txt'), topics=fused_run.topics())

    return os.path.join(Results_folder, str(output_name) + '.txt')

In [77]:
#Orig
orig_fusion = fuse_runs(
    os.path.join(Results_folder, 'FullTxt.txt'), os.path.join(Results_folder, 'ParaTxt.txt'), os.path.join(Results_folder, 'AbstractTxt.txt'),
    'orig_fusion', max_docs = 2000
)

#Time
time_fusion = fuse_runs(
    os.path.join(Results_folder, 'FullTxt_time.txt'), os.path.join(Results_folder, 'ParaTxt_time.txt'), os.path.join(Results_folder, 'AbstractTxt_time.txt'),
    'time_fusion', max_docs = 2000                                                                                                              
)
                                                                
#All 6 fusion
total_fusion = fuse_runs2(
    os.path.join(Results_folder, 'FullTxt.txt'), os.path.join(Results_folder, 'ParaTxt.txt'), os.path.join(Results_folder, 'AbstractTxt.txt'),
    os.path.join(Results_folder, 'FullTxt_time.txt'), os.path.join(Results_folder, 'ParaTxt_time.txt'), os.path.join(Results_folder, 'AbstractTxt_time.txt'),
    'total_fusion', max_docs = 2000
)

#Orig
orig_fusion_1000 = fuse_runs(
    os.path.join(Results_folder, 'FullTxt_1000.txt'), os.path.join(Results_folder, 'ParaTxt_1000.txt'), os.path.join(Results_folder, 'AbstractTxt_1000.txt'),
    'orig_fusion_1000', max_docs = 1000
)

#Time
time_fusion_1000 = fuse_runs(
    os.path.join(Results_folder, 'FullTxt_time_1000.txt'), os.path.join(Results_folder, 'ParaTxt_time_1000.txt'), os.path.join(Results_folder, 'AbstractTxt_time_1000.txt'),
    'time_fusion_1000', max_docs = 1000                                                                                                              
)
                                                                
#All 6 fusion
total_fusion_1000 = fuse_runs2(
    os.path.join(Results_folder, 'FullTxt_1000.txt'), os.path.join(Results_folder, 'ParaTxt_1000.txt'), os.path.join(Results_folder, 'AbstractTxt_1000.txt'),
    os.path.join(Results_folder, 'FullTxt_time_1000.txt'), os.path.join(Results_folder, 'ParaTxt_time_1000.txt'), os.path.join(Results_folder, 'AbstractTxt_time_1000.txt'),
    'total_fusion_1000', max_docs = 1000
)

File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs\orig_fusion.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs\time_fusion.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs\total_fusion.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs\orig_fusion_1000.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs\time_fusion_1000.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs\total_fusion_1000.txt writen.


In [23]:
#Fuse v2 runs
#Orig
orig_fusion = fuse_runs(
    os.path.join(Results_folder, 'FullTxt_v2.txt'), os.path.join(Results_folder, 'ParaTxt_v2.txt'), os.path.join(Results_folder, 'AbstractTxt_v2.txt'),
    'orig_fusion_v2', max_docs = 1000
)

File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs_v2\orig_fusion_v2.txt writen.


In [78]:
#Load fused files for 2000 docs
orig_fusion_csv = pd.read_csv(orig_fusion, sep = ' ', header = None)
time_fusion_csv = pd.read_csv(time_fusion, sep = ' ', header = None)
total_fusion_csv = pd.read_csv(time_fusion, sep = ' ', header = None)
#Load fused files for 1000 docs
orig_fusion_1000_csv = pd.read_csv(orig_fusion_1000, sep = ' ', header = None)
time_fusion_1000_csv = pd.read_csv(time_fusion_1000, sep = ' ', header = None)
total_fusion_1000_csv = pd.read_csv(time_fusion_1000, sep = ' ', header = None)


#Re-name fusion runtags
orig_fusion_csv[5] = 'OHSU_R3_origfusion'
time_fusion_csv[5] = 'OHSU_R3_timefusion'
total_fusion_csv[5] = 'OHSU_R3_totalfusion'
orig_fusion_1000_csv[5] = 'OHSU_R3_origfusion1000'
time_fusion_1000_csv[5] = 'OHSU_R3_timefusion1000'
total_fusion_1000_csv[5] = 'OHSU_R3_totalfusion1000'

#Save csvs again
orig_fusion_csv.to_csv(os.path.join(Results_folder, 'R3_orig_fusion.txt'), sep=' ', index=False, header=None)
time_fusion_csv.to_csv(os.path.join(Results_folder, 'R3_time_fusion.txt'), sep=' ', index=False, header=None)
total_fusion_csv.to_csv(os.path.join(Results_folder, 'R3_time_fusion.txt'), sep=' ', index=False, header=None)
orig_fusion_1000_csv.to_csv(os.path.join(Results_folder, 'R3_orig_fusion_1000.txt'), sep=' ', index=False, header=None)
time_fusion_1000_csv.to_csv(os.path.join(Results_folder, 'R3_time_fusion_1000.txt'), sep=' ', index=False, header=None)
total_fusion_1000_csv.to_csv(os.path.join(Results_folder, 'R3_time_fusion_1000.txt'), sep=' ', index=False, header=None)

# TREC Tools

In [24]:
def trectools_eval(res_fol, qrels_file):
    
    qrels = TrecQrel(qrels_file)

    #Generate metrics for all 3 indices (1000 docs retrieved for each)
    runs = procedures.list_of_runs_from_path(res_fol, "*.txt")
    results = procedures.evaluate_runs(runs, qrels, per_query=True)
    p5 = procedures.extract_metric_from_results(results, "P_5")
    p10 = procedures.extract_metric_from_results(results, "P_10")
    Bpref = procedures.extract_metric_from_results(results, "bpref")
    Mean_avgP = procedures.extract_metric_from_results(results, 'map')

    #Aggregate results to dataframe
    runs_names = [os.path.basename(str(x)).split('.')[0] for x in runs]
    p5_list = []
    p10_list = []
    map_list = []
    bpref_list = []
    ndcg_list = []

    for i in range(len(runs)):
        p5_list.append(p5[i][1])
        p10_list.append(p10[i][1])
        map_list.append(Mean_avgP[i][1])
        bpref_list.append(Bpref[i][1])

    Result_df  = {'Run':runs_names, 'P@5': p5_list, 'P@10': p10_list, 'MAP': map_list, 'Bpref': bpref_list}
    Result_df = pd.DataFrame(Result_df)
    
    print(Result_df.sort_values('MAP', ascending = False))

In [25]:
trectools_eval(res_fol= Results_folder,
               qrels_file=os.path.join(Pyserini_files, 'qrels-covid_d3_j2.5-3.txt'))

Found 11 runs in path C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs_v2
                    Run    P@5    P@10       MAP     Bpref
4          combo_fusion  0.790  0.7425  0.322697  0.582784
7      orig_fusion_1000  0.690  0.6025  0.240156  0.512049
0   AbstractTxt_time_v2  0.640  0.6075  0.200943  0.384932
9       ParaTxt_time_v2  0.635  0.5650  0.187657  0.446578
8        orig_fusion_v2  0.605  0.5175  0.176317  0.435190
2              anserini  0.380  0.3775  0.175438  0.520830
5       FullTxt_time_v2  0.615  0.5500  0.151317  0.331003
3              anserini  0.345  0.3625  0.147120  0.535081
1        AbstractTxt_v2  0.480  0.4450  0.141236  0.402371
10           ParaTxt_v2  0.485  0.4225  0.130786  0.390150
6            FullTxt_v2  0.545  0.4975  0.129651  0.343574


# BERT setup

In [97]:
#Use Pyjnius to extract full texts from collection
JString = autoclass('java.lang.String')
JIndexReaderUtils = autoclass('io.anserini.index.IndexReaderUtils')
reader = JIndexReaderUtils.getReader(R3_fulltext)

In [104]:
total_fusion_csv.columns = ['topic', 'q0', 'docid', 'rank', 'score', 'qid']

In [127]:
# Fetch raw document contents by docid:
rawdoc = JIndexReaderUtils.documentRaw(reader, JString(total_fusion_csv['docid'][12224]))
json_doc = json.dumps(rawdoc) # dict to string
doc_json = json.loads(json_doc)
doc_json = doc_json.replace('\n        ', '')
doc_json = json.loads(doc_json)

print(doc_json['csv_metadata']['abstract'])
print(doc_json['csv_metadata']['title'])

# try:
#     abstract_doc = doc_json['abstract'][0]['text']
# except:
#     abstract_doc = ''
# try:
#     title_doc = str(doc_json['metadata']['title']).strip("[]")
# except:
#     title_doc = str(doc_json['title']).strip("[]")

    
# print(abstract_doc)
# print(title_doc)

Background: Mounting evidence suggests that there is an undetected pool of COVID-19 asymptomatic but infectious cases. Estimating the number of asymptomatic infections has been crucial to understand the virus and contain its spread, which is, however, hard to be accurately counted. Methods: We propose an approach of machine learning based fine-grained simulator (MLSim), which integrates multiple practical factors including disease progress in the incubation period, cross-region population movement, undetected asymptomatic patients, and prevention and containment strength. The interactions among these factors are modeled by virtual transmission dynamics with several undetermined parameters, which are determined from epidemic data by machine learning techniques. When MLSim learns to match the real data closely, it also models the number of asymptomatic patients. MLSim is learned from the open Chinese global epidemic data. Findings: MLSim showed better forecast accuracy than the SEIR and 

In [99]:
for i in tqdm(range(len(full_df_BERT)), position = 0, leave = True):
    query = queries[int(full_df_BERT['topic'][i]) - 1]
    docid = full_df_BERT['docid'][i]
    rank = full_df_BERT['rank'][i]
    score = full_df_BERT['score'][i]
      
    # Fetch raw document contents by docid:
    rawdoc = JIndexReaderUtils.documentRaw(reader, JString(docid))
    json_doc = json.dumps(rawdoc) # dict to string
    doc_json = json.loads(json_doc)
    doc_json = doc_json.replace('\n        ', '')
    doc_json = json.loads(doc_json)
    
    
    
    paragraph_states = []
    #Load document as title + abstract + paragraph
    try:
        abstract_doc = doc_json['abstract'][0]['text']
    except:
        abstract_doc = ''
    try:
        title_doc = str(doc_json['metadata']['title']).strip("[]")
    except:
        title_doc = str(doc_json['title']).strip("[]")

Unnamed: 0,0,1,2,3,4,5
0,1,Q0,4dtk1kyh,1,0.046155,OHSU_R3_totalfusion
1,1,Q0,8ybfiz8f,2,0.043278,OHSU_R3_totalfusion
2,1,Q0,puqcbf8t,3,0.040643,OHSU_R3_totalfusion
3,1,Q0,yqeifpoy,4,0.040640,OHSU_R3_totalfusion
4,1,Q0,wim5q9a5,5,0.039433,OHSU_R3_totalfusion
...,...,...,...,...,...,...
61230,40,Q0,dxabs45r,1210,0.000947,OHSU_R3_totalfusion
61231,40,Q0,xkg0ylz8,1211,0.000946,OHSU_R3_totalfusion
61232,40,Q0,7zxuh0f1,1212,0.000945,OHSU_R3_totalfusion
61233,40,Q0,6wolrfvk,1213,0.000944,OHSU_R3_totalfusion
