In [1]:
import pandas as pd
import numpy as np
import torch
import os
from tqdm.auto import tqdm
import json
from datetime import datetime

#Pyserini search
from pyserini.search import SimpleSearcher, SimpleFusionSearcher
from jnius import autoclass

#XML parsing
import xml.etree.ElementTree as ET

#TREC_EVAL
from trectools import misc, TrecRun, TrecQrel, procedures, fusion


In [19]:
#Code can start here
Pyserini_files = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\Pyserini_Lucene_CORD_index'
TREC_COVID_root = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID'

#Round 2 indexes
R4_abstract = os.path.join(Pyserini_files, 'lucene-index-cord19-abstract-2020-06-19')
R4_fulltext = os.path.join(Pyserini_files, 'lucene-index-cord19-full-text-2020-06-19')
R4_paragraphs = os.path.join(Pyserini_files, 'lucene-index-cord19-paragraph-2020-06-19')
with open(os.path.join(Pyserini_files, 'docids-rnd4.txt')) as f:
    R4_valid = f.read().splitlines()
    
#Load R1 qrels and extract docids searched already for each topic
qrels_file1 = os.path.join(Pyserini_files, 'qrels-1.txt')
qrels_file2 = os.path.join(Pyserini_files, 'qrels-rnd2.txt')
qrels_file3 = os.path.join(Pyserini_files, 'qrels-covid_d3_j0.5-3.txt')
qrels_table1 = pd.read_csv(qrels_file1, sep = ' ', header = None)
qrels_table2 = pd.read_csv(qrels_file2, sep = ' ', header = None)
qrels_table3 = pd.read_csv(qrels_file3, sep = ' ', header = None)
qrels_table = qrels_table1.append(qrels_table2, ignore_index=True)
qrels_table.columns = ['Topic', 'Q0' , 'Unnamed', 'docid', 'relevance']
qrels_table.drop('Unnamed', axis=1, inplace=True)
qrels_table3.columns = ['Topic', 'Q0' , 'docid', 'relevance']
qrels_table = qrels_table.append(qrels_table3, ignore_index = True)


In [8]:
#Extract topics 
#XML files for TREC-COVID rounds
Topics = os.path.join(Pyserini_files, 'topics-rnd4.xml')
Tree = ET.parse(Topics)

Root = Tree.getroot()

topicid = []
query = []
question = []
narrative = []

for child in Root.iter():
    tag =child.tag
    text = child.text
    attrib = child.attrib
    if (tag == 'topic'):
        topicid.append(attrib['number'])
    if (tag == 'query'):
        query.append(text)
    if (tag == 'question'):
        question.append(text)
    if (tag == 'narrative'):
        narrative.append(text)

#Join to CSV
Round_dict  = {'Topic':topicid, 'Query':query, 'Question':question , 'Narrative':narrative}
Round_df = pd.DataFrame(Round_dict)
Round_df = Round_df[['Topic', 'Query', 'Question', 'Narrative']]
Round_df.to_csv(os.path.join(Pyserini_files, 'Round4_Topics.csv'), sep = ' ', header = True)
Round_df

Unnamed: 0,Topic,Query,Question,Narrative
0,1,coronavirus origin,what is the origin of COVID-19,seeking range of information about the SARS-Co...
1,2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,seeking range of information about the SARS-Co...
2,3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,seeking studies of immunity developed due to i...
3,4,how do people die from the coronavirus,what causes death from Covid-19?,Studies looking at mechanisms of death from Co...
4,5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,Papers that describe the results of testing d...
5,6,coronavirus test rapid testing,what types of rapid testing for Covid-19 have ...,Looking for studies identifying ways to diagno...
6,7,serological tests for coronavirus,are there serological tests that detect antibo...,Looking for assays that measure immune respons...
7,8,coronavirus under reporting,how has lack of testing availability led to un...,Looking for studies answering questions of imp...
8,9,coronavirus in Canada,how has COVID-19 affected Canada,"seeking data related to infections (confirm, s..."
9,10,coronavirus social distancing impact,has social distancing had an impact on slowing...,seeking specific information on studies that h...


In [33]:
#Udelq query processor
Udel4Queries = os.path.join(Pyserini_files, 'topics-rnd4-udel.xml')
Tree = ET.parse(Udel4Queries)
Root = Tree.getroot()
UDel_query = []
for child in Root.iter():
    tag =child.tag
    text = child.text
    attrib = child.attrib
    if (tag == 'topic number'):
        topicid.append(attrib['number'])
    if (tag == 'query'):
        UDel_query.append(text)
        
#Save
with open(r"C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Search Queries\Udel4Queries.txt", "w") as outfile:
    outfile.write("\n".join(UDel_query))

In [11]:
#OHSU Query design
from pyserini.analysis import Analyzer, get_lucene_analyzer
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

#Stopwords for tokenization - manual review
stopwords_manual = ['seek', 'seeking', 'look', 'looking', 'studies', 'study', 'information', 
             'about', 'range', 'studies', 'its', 'coronaviru',
            'other', '2', '19', 'well', ' will', 'from', 'have', 'more', 'covid', 'any', 'what', 
            'should', 'may', 'due', 'help', 'non', 's', 'those', 'people', 'ways', 'all', 'gain',
            'possible', 'toward', 'specifically', 'learned', 'number', 'proportion', 'including',
            'etc', 'still', 'while', 'human', 'specific', 'result', 'results', 'assess', 'need',
            'between', 'take', 'taking', 'patient', 'type', 'cause' ,'frequency', 'less', 'face',
            'likely', 'infect', 'upon', 'develop', 'represent', 'promising', 'step', 'related',
            'papers', 'describe', 'also', 'relevant', 'who', 'show', 'science', 'basic', 'complete',
            'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has', 'approach', 'Studies', 'Stud', 'Inst', 'Divi' ,'Thomae',
            'Brigham', 'Young', 'Univ', 'studies', 'volition', 'severe acute respiratory syndrome', 'affect', 'affected',
            'focus', 'discuss', 'speculative', 'must', 'include', 'draw', 'everyday', 'person', 'another', 'everyday', 'kind',
            'comparison', 'direct', 'previous', 'articles', 'among', 'ncov', 'mechanism', 'implication', 'anti',
            'potential', 'occur', 'often','pathogenesi', 'observe', 'provide', 'em', '2019', 'increase', 'decrease', 'rate',
            'association', 'contain', 'analyze', 'u.s', 'rest' , 'differ', 'reason', 'drug', 'burden',
            'explore', 'sar', 'cov', 'cryo', 'crystallography', 'reference', 'lab']

udel_stopwords = [
        'a', 'about', 'above', 'according', 'across', 'after', 'afterwards', 'again',
        'against', 'albeit', 'all', 'almost', 'alone', 'along', 'already', 'also',
        'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another',
        'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
        'apart', 'are', 'around', 'as', 'at', 'av', 'be', 'became', 'because',
        'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind',
        'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'but',
        'by', 'can', 'cannot', 'canst', 'certain', 'cf', 'choose', 'contrariwise',
        'cos', 'could', 'cu', 'day', 'do', 'does', 'doesn', 't', 'doing', 'dost',
        'doth', 'double', 'down', 'dual', 'during', 'each', 'either', 'else',
        'elsewhere', 'enough', 'et', 'etc', 'even', 'ever', 'every', 'everybody',
        'everyone', 'everything', 'everywhere', 'except', 'excepted', 'excepting',
        'exception', 'exclude', 'excluding', 'exclusive', 'far', 'farther',
        'farthest', 'few', 'ff', 'first', 'for', 'formerly', 'forth', 'forward',
        'from', 'front', 'further', 'furthermore', 'furthest', 'get', 'go', 'had',
        'halves', 'hardly', 'has', 'hast', 'hath', 'have', 'he', 'hence',
        'henceforth', 'her', 'here', 'hereabouts', 'hereafter', 'hereby', 'herein',
        'hereto', 'hereupon', 'hers', 'herself', 'him', 'himself', 'hindmost', 'his',
        'hither', 'hitherto', 'how','however', 'howsoever', 'i', 'ie', 'if', 'in',
        'inasmuch', 'inc', 'include', 'included', 'including', 'indeed', 'indoors',
        'inside', 'insomuch', 'instead', 'into', 'inward', 'inwards', 'is', 'it',
        'its', 'itself', 'just', 'kg', 'kind', 'km', 'last', 'latter', 'latterly',
        'less', 'lest', 'let', 'like', 'little', 'ltd', 'many', 'may', 'maybe', 'me',
        'meantime', 'meanwhile', 'might', 'more','moreover', 'most', 'mostly', 'mr',
        'mrs', 'ms', 'much', 'must', 'my', 'myself', 'namely', 'need', 'neither',
        'never', 'nevertheless', 'next', 'no', 'nobody', 'none', 'nonetheless',
        'noone', 'nope', 'nor', 'not', 'nothing', 'notwithstanding', 'now',
        'nowadays', 'nowhere', 'of', 'off', 'often', 'ok', 'on', 'once', 'one',
        'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our',
        'ours', 'ourselves', 'out', 'outside', 'over', 'own', 'per', 'perhaps',
        'plenty', 'provide', 'quite', 'rather', 'really', 'round', 'said', 'sake',
        'same', 'sang', 'save', 'saw', 'see', 'seeing', 'seem', 'seemed', 'seeming',
        'seems', 'seen', 'seldom', 'selves', 'sent', 'several', 'shalt', 'she',
        'should', 'shown', 'sideways', 'since', 'slept', 'slew', 'slung', 'slunk',
        'smote', 'so', 'some', 'somebody', 'somehow', 'someone', 'something',
        'sometime', 'sometimes', 'somewhat', 'somewhere', 'spake', 'spat', 'spoke',
        'spoken', 'sprang', 'sprung', 'stave', 'staves', 'still', 'such', 'supposing',
        'than', 'that', 'the', 'thee', 'their', 'them', 'themselves', 'then', 'thence',
        'thenceforth', 'there', 'thereabout', 'thereabouts', 'thereafter', 'thereby',
        'therefore', 'therein', 'thereof', 'thereon', 'thereto', 'thereupon', 'these',
        'they', 'this', 'those', 'thou', 'though', 'thrice', 'through', 'throughout',
        'thru', 'thus', 'thy', 'thyself', 'till', 'to', 'together', 'too', 'toward',
        'towards', 'ugh', 'unable', 'under', 'underneath', 'unless', 'unlike', 'until',
        'up', 'upon', 'upward', 'upwards', 'us', 'use', 'used', 'using', 'very', 'via',
        'vs', 'want', 'was', 'we', 'week', 'well', 'were', 'what', 'whatever',
        'whatsoever', 'when', 'whence', 'whenever', 'whensoever', 'where', 'whereabouts',
        'whereafter', 'whereas', 'whereat', 'whereby', 'wherefore', 'wherefrom',
        'wherein', 'whereinto', 'whereof', 'whereon', 'wheresoever', 'whereto', 'whereunto',
        'whereupon', 'wherever', 'wherewith', 'whether', 'whew', 'which', 'whichever',
        'whichsoever', 'while', 'whilst', 'whither', 'who', 'whoa', 'whoever', 'whole',
        'whom', 'whomever', 'whomsoever', 'whose', 'whosoever', 'why', 'will', 'wilt',
        'with', 'within', 'without', 'worse', 'worst', 'would', 'wow', 'ye', 'year', 'yet',
        'yippee', 'you', 'your', 'yours', 'yourself', 'yourselves'
]

stopwords = list(set(stopwords.words('english')))
stopwords_manual = list(np.append(stopwords_manual, stopwords))
stopwords_manual = list(np.append(stopwords_manual, udel_stopwords))

token_narrative_list = []

#Extract important narrative text
for i in range(len(Round_df)):
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    tokens = analyzer.analyze(Round_df['Narrative'][i])
    for i, word in enumerate(tokens):
        if ('exclud' in word):
            tokens = tokens[:i]
            break
    #Remove stopwords and duplicates from token
    tokens = [w for w in tokens if not w in stopwords_manual]
    tokens = list(set(tokens))
    #Remove all tokens after exclude
    
    token_narrative_list.append(tokens)

#Tokenize question
token_question_list = []

#Extract important question text - NOT USED YET
for i in range(len(Round_df)):
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    tokens = analyzer.analyze(Round_df['Question'][i])
    #Remove stopwords and duplicates from token
    tokens = [w for w in tokens if not w in stopwords_manual]
    tokens = list(set(tokens))
    token_question_list.append(tokens)

#Anserini searcher can take both query and keywords
#keywords_list = '2019-nCoV, SARS-CoV-2, COVID-19'
keywords_list = 'COVID-19'

#Manual keywords
manual = [
            'originated', #1
            'temperature, humidity',
            'cross-reactive, crossprotective, cross immunity, specific antibody response, neutralize, adaptive immunity',
            'harms, mortality, dead, risk factors',
            'non-human',
            'identification, detection, PCR, point of care',
            'serology, antigen, antibodies',
            'ascertainment, underestimate',
            'cov, canadian',
            'home, restriction, social-distancing', #10
            'resources, stratification',
            'closure, restricted, movement, gathering',
            'source, route, transmitted',
            'super-spreader',
            'aerosol, contact, droplet',
            'live, days, contaminated',
            'randomized, randomised, controlled',
            'personal, protective, equipment, PPE, face',
            'hygiene, alcohol-based',
            'ARBs, blocker', #20
            'death',
            'coronary',
            'blood, pressure',
            'mellitus',
            '',
            'onset, new, presentation',
            'SARS-CoV-2, 2019-nCoV',
            'chloroquine ',
            'binding',
            '', #30
            '', #31
            'type', #32
            '', #33
            'long-term, survivors', #34
            '', #35
            'S-protein, S Protein',
            '',
            'pathogenesis, immune',
            'treatment',
            '', #40
            '',
            '',
            'abuse',
            'facemask, face masks, face',
            '' #45
]

[nltk_data] Downloading package stopwords to C:\Users\Jimmy
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
#Combine preprocessed data to form OHSU queries

input_queries = []

for ii, row in Round_df.iterrows():
    query = Round_df['Query'][ii]
    question = Round_df['Question'][ii]
    topic_num = Round_df['Topic'][ii]
    token_topic = ' '.join(token_narrative_list[ii])
    token_question = ' '.join(token_question_list[ii])
    manual_query = manual[ii] 
    input_query = query + '. ' + token_question + '. ' +  token_topic + ' ' + manual_query + ' . ' + keywords_list 
    input_queries.append(input_query)

#Save
with open(r"C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Search Queries\Round4_QQNTokens.txt", "w") as outfile:
    outfile.write("\n".join(input_queries))

In [96]:
#Create SimpleSearcher for full text and paragraph indexes - Do this for Round 3 data
abstract_searcher = SimpleSearcher(R4_abstract)
full_searcher = SimpleSearcher(R4_fulltext)
paragraph_searcher = SimpleSearcher(R4_paragraphs)

#Tuned hyperparameters
abstract_searcher.set_bm25(k1=1.5, b=0.4)
abstract_searcher.set_qld(mu = 2000)
abstract_searcher.set_rm3(fb_terms=20, fb_docs=10, original_query_weight=0.9) #Pseudo-reranker

full_searcher.set_bm25(k1=1.5, b=0.4)
full_searcher.set_qld(mu = 2000)
full_searcher.set_rm3(fb_terms=20, fb_docs=10, original_query_weight=0.9)

paragraph_searcher.set_bm25(k1=1.5, b=0.4)
paragraph_searcher.set_qld(mu = 2000)
paragraph_searcher.set_rm3(fb_terms=20, fb_docs=10, original_query_weight=0.9)

In [29]:
#Extract search results from the searcher using OHSU query generator
def get_search_res_list(index_searcher, n_papers, paragraph = False):
    docid_list = []
    rank_list = []
    score_list = []
    topic_id_list = []
    title_list = []
    doi_list = []
    publish_list = []
    
    if (paragraph == True):
        n_papers = n_papers * 20
    
    #Search all 3 indices (abstract, full text, and paragraphs) using question as a query
    for ii, row in Round_df.iterrows():
        query = Round_df['Query'][ii]
        question = Round_df['Question'][ii]
        topic_num = Round_df['Topic'][ii]
        token_topic = ', '.join(token_narrative_list[ii])
        token_question = ','.join(token_question_list[ii])
        manual_query = manual[ii] 
        input_query = query + '. ' + token_question + '. ' +  token_topic + ', ' + manual_query + ' . ' + keywords_list 
        
        hits = index_searcher.search(q = input_query, k=n_papers)
        print(topic_num)
        #Each key is a qid, value is the anserini search list
        for i in tqdm(range(0, n_papers), position = 0, leave = True):
            topic_id_list.append(topic_num)
            if (paragraph == True):
                docid_list.append(str(hits[i].docid).split('.')[0])
            else:
                docid_list.append(hits[i].docid)
            rank_list.append(str(i+1))
            score_list.append(hits[i].score)
            title_list.append(hits[i].lucene_document.get("title"))
            doi_list.append('https://doi.org/' + str(hits[i].lucene_document.get("doi")))  
            
            #Get published date
            doc_json = json.loads(hits[i].raw)
            metadata = json.loads(json.dumps(doc_json['csv_metadata']))
            publish_time_str = metadata['publish_time']
            if len(publish_time_str) > 0:
                try:
                    publish_time = datetime.strptime(publish_time_str, '%Y-%m-%d')
                except:
                    publish_time = datetime.strptime(publish_time_str, '%Y')
            else:
                publish_time = ''
            
            publish_list.append(publish_time)

    return topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, publish_list

In [66]:
#Extract search results from the searcher using UDel query generator
def search_UDel(index_searcher, n_papers, paragraph = False):
    docid_list = []
    rank_list = []
    score_list = []
    topic_id_list = []
    title_list = []
    doi_list = []
    publish_list = []
    
    if (paragraph == True):
        n_papers = n_papers * 20
    
    #Search all 3 indices (abstract, full text, and paragraphs) using question as a query
    for ii, row in Round_df.iterrows():
        input_query = UDel_query[ii] 
        topic_num = Round_df['Topic'][ii]
        hits = index_searcher.search(q = input_query, k=n_papers)
        print(topic_num)
        #Each key is a qid, value is the anserini search list
        for i in tqdm(range(0, n_papers), position = 0, leave = True):
            topic_id_list.append(topic_num)
            if (paragraph == True):
                docid_list.append(str(hits[i].docid).split('.')[0])
            else:
                docid_list.append(hits[i].docid)
            rank_list.append(str(i+1))
            score_list.append(hits[i].score)
            title_list.append(hits[i].lucene_document.get("title"))
            doi_list.append('https://doi.org/' + str(hits[i].lucene_document.get("doi")))  
            
            #Get published date
            doc_json = json.loads(hits[i].raw)
            metadata = json.loads(json.dumps(doc_json['csv_metadata']))
            publish_time_str = metadata['publish_time']
            if len(publish_time_str) > 0:
                try:
                    publish_time = datetime.strptime(publish_time_str, '%Y-%m-%d')
                except:
                    publish_time = datetime.strptime(publish_time_str, '%Y')
            else:
                publish_time = ''
            
            publish_list.append(publish_time)

    return topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, publish_list

In [97]:
#Search 1400 so that we have room to remove up to 1000 docs from Round 1
n_papers = 1400

#Each key is a qid, value is the anserini search list
full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish = get_search_res_list(full_searcher, n_papers)
paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish = get_search_res_list(paragraph_searcher, n_papers, paragraph = True)
abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish = get_search_res_list(abstract_searcher, n_papers)


1


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


41


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


42


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


43


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


44


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


45


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


1


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


41


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


42


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


43


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


44


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


45


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


1


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


41


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


42


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


43


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


44


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


45


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))




In [98]:
#Search 1400 so that we have room to remove up to 1000 docs from Round 1
n_papers = 1400

#Each key is a qid, value is the anserini search list
full_topic_UDel, full_docid_UDel, full_rank_UDel, full_score_UDel, full_title_UDel, full_doi_UDel, full_publish_UDel = search_UDel(full_searcher, n_papers)
paragraph_topic_UDel, paragraph_docid_UDel, paragraph_rank_UDel, paragraph_score_UDel, paragraph_title_UDel, paragraph_doi_UDel, paragraph_publish_UDel = search_UDel(paragraph_searcher, n_papers, paragraph = True)
abstract_topic_UDel, abstract_docid_UDel, abstract_rank_UDel, abstract_score_UDel, abstract_title_UDel, abstract_doi_UDel, abstract_publish_UDel = search_UDel(abstract_searcher, n_papers)


1


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


41


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


42


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


43


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


44


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


45


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


1


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


41


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


42


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


43


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


44


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


45


HBox(children=(FloatProgress(value=0.0, max=28000.0), HTML(value='')))


1


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


36


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


37


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


38


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


39


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


40


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


41


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


42


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


43


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


44


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


45


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))




In [43]:
#Make dataframe from lists generated from search
def TREC_df(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, publish_list, run_param, date_param, drop_1000, drop_dups):
    #Run-tag for TREC run requirements
    Q0 = ['q0'] * len(topic_id_list) 
    qid = [run_param] * len(topic_id_list)

    df  = {'topic': topic_id_list , 'q0':Q0, 'docid':docid_list, 'rank':rank_list, 
                                 'score':score_list, 'title': title_list, 'doi':doi_list,'date':publish_list, 'qid':qid}
    df = pd.DataFrame(df)
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'title', 'doi', 'date', 'qid']]
    
    #Filter by time
    if (date_param == True):
        df = df[df['date'].dt.year >= 2020]
    
    #Remove duplicates
    if (drop_dups == True):
        df.drop_duplicates(subset=['topic', 'docid'], keep='first', inplace = True)
        df.reset_index(drop=True, inplace=True) #Reset indexes to match row
    
    #Drop judged docids in R1-3
    index_list = []
    qrel_topics = qrels_table.groupby('Topic')   
    for topic, group in tqdm(qrel_topics, position=0, leave = True):
        #Get all docids of that topic from qrels
        topic_docid = list(group['docid'])
        for i in range(len(df)):
            if(topic == int(df['topic'][i]) and df['docid'][i] in topic_docid):
                index_list.append(i)
    
    df.drop(df.index[index_list], inplace=True)
    
    #Make sure all docids are valid
    df.reset_index(inplace = True)
    index_list = []
    for i in range(len(df)):
        if(df['docid'][i] not in R4_valid):
            index_list.append(i)
    
    df.drop(df.index[index_list], inplace=True)
    #Re-rank
    df['rank'] = df.groupby('topic')['score'].rank(ascending=False)
    df['rank'] = df['rank'].astype(int)
    
    #For each topic, save up to 1000 searches (since we drop duplicates)
    if (drop_1000 == True):
        df = df[df['rank'] <= 1000] 
    #Reset index
    df.reset_index(drop=True, inplace=True)
    
    #Sort rank by topic
    df.sort_values(['topic','rank'],ascending=False)
    
    #Get columns for submission
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'qid']]
    
    return df

In [99]:
#OHSU
full_df_OHSU = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullR4_OHSU', False, False, True)
full_df_OHSU_time = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullR4_OHSU_Time', True, False, True)
paragraph_df_OHSU = TREC_df(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'paragraphR4_OHSU', False, False, True)
paragraph_df_OHSU_time = TREC_df(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'paragraphR4_OHSU_Time', True, False, True)
abstract_df_OHSU = TREC_df(abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish, 'abstractR4_OHSU', False, False, True)
abstract_df_OHSU_time = TREC_df(abstract_topic, abstract_docid, abstract_rank, abstract_score, abstract_title, abstract_doi, abstract_publish, 'abstractR4_OHSU_Time', True, False, True)

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




In [100]:
#UDel
full_df_UDel = TREC_df(full_topic_UDel, full_docid_UDel, full_rank_UDel, full_score_UDel, full_title_UDel, full_doi_UDel, full_publish_UDel, 'FullR4_UDel', False, False, True)
full_df_UDel_time = TREC_df(full_topic_UDel, full_docid_UDel, full_rank_UDel, full_score_UDel, full_title_UDel, full_doi_UDel, full_publish_UDel, 'FullR4_UDel_Time', True, False, True)
paragraph_df_UDel = TREC_df(paragraph_topic_UDel, paragraph_docid_UDel, paragraph_rank_UDel, paragraph_score_UDel, paragraph_title_UDel, paragraph_doi_UDel, paragraph_publish_UDel, 'paragraphR4_UDel', False, False, True)
paragraph_df_UDel_time = TREC_df(paragraph_topic_UDel, paragraph_docid_UDel, paragraph_rank_UDel, paragraph_score_UDel, paragraph_title_UDel, paragraph_doi_UDel, paragraph_publish_UDel, 'paragraphR4_UDel_Time', True, False, True)
abstract_df_UDel = TREC_df(abstract_topic_UDel, abstract_docid_UDel, abstract_rank_UDel, abstract_score_UDel, abstract_title_UDel, abstract_doi_UDel, abstract_publish_UDel, 'abstractR4_UDel', False, False, True)
abstract_df_UDel_time = TREC_df(abstract_topic_UDel, abstract_docid_UDel, abstract_rank_UDel, abstract_score_UDel, abstract_title_UDel, abstract_doi_UDel, abstract_publish_UDel, 'abstractR4_UDel_Time', True, False, True)

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




In [101]:
#For  runs
Results_folder = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round4_Runs_LessRM3'
if(os.path.exists(Results_folder) == False):
    os.mkdir(Results_folder)
    
full_df_OHSU.to_csv(os.path.join(Results_folder, 'FullOHSU.txt'), sep=' ', index=False, header=None)
full_df_OHSU_time.to_csv(os.path.join(Results_folder, 'FullOHSU_time.txt'), sep=' ', index=False, header=None)

paragraph_df_OHSU.to_csv(os.path.join(Results_folder, 'ParaOHSU.txt'), sep=' ', index=False, header=None)
paragraph_df_OHSU_time.to_csv(os.path.join(Results_folder, 'ParaOHSU_time.txt'), sep=' ', index=False, header=None)

abstract_df_OHSU.to_csv(os.path.join(Results_folder, 'AbstractOHSU.txt'), sep=' ', index=False, header=None)
abstract_df_OHSU_time.to_csv(os.path.join(Results_folder, 'AbstractOHSU_time.txt'), sep=' ', index=False, header=None)

In [102]:
full_df_UDel.to_csv(os.path.join(Results_folder, 'FullUDel.txt'), sep=' ', index=False, header=None)
full_df_UDel_time.to_csv(os.path.join(Results_folder, 'FullUDel_time.txt'), sep=' ', index=False, header=None)

paragraph_df_UDel.to_csv(os.path.join(Results_folder, 'ParaUDel.txt'), sep=' ', index=False, header=None)
paragraph_df_UDel_time.to_csv(os.path.join(Results_folder, 'ParaUDel_time.txt'), sep=' ', index=False, header=None)

abstract_df_UDel.to_csv(os.path.join(Results_folder, 'AbstractUDel.txt'), sep=' ', index=False, header=None)
abstract_df_UDel_time.to_csv(os.path.join(Results_folder, 'AbstractUDel_time.txt'), sep=' ', index=False, header=None)

In [48]:
#Perform fusion on 3 df
def fuse_runs(path1, path2, path3, output_name, max_docs = 1000):
    r1 = TrecRun(path1)
    r2 = TrecRun(path2)
    r3 = TrecRun(path3)

    # Easy way to create new baselines by fusing existing runs:
    fused_run = fusion.reciprocal_rank_fusion([r1, r2, r3], max_docs=max_docs)
    
    fused_run.print_subset(os.path.join(Results_folder, str(output_name) + '.txt'), topics=fused_run.topics())

    return os.path.join(Results_folder, str(output_name) + '.txt')

#Perform fusion on 3 df
def fuse_runs2(path1, path2, path3,path4, path5, path6, output_name, max_docs = 1000):
    r1 = TrecRun(path1)
    r2 = TrecRun(path2)
    r3 = TrecRun(path3)
    r4 = TrecRun(path4)
    r5 = TrecRun(path5)
    r6 = TrecRun(path6)

    # Easy way to create new baselines by fusing existing runs:
    fused_run = fusion.reciprocal_rank_fusion([r1, r2, r3], max_docs=max_docs)
    
    fused_run.print_subset(os.path.join(Results_folder, str(output_name) + '.txt'), topics=fused_run.topics())

    return os.path.join(Results_folder, str(output_name) + '.txt')

In [103]:
#OHSU
orig_fusion_OHSU = fuse_runs(
    os.path.join(Results_folder, 'FullOHSU.txt'), os.path.join(Results_folder, 'ParaOHSU.txt'), os.path.join(Results_folder, 'AbstractOHSU.txt'),
    'orig_fusion_OHSU', max_docs = 1000
)

#OHSU Time
time_fusion_OHSU = fuse_runs(
    os.path.join(Results_folder, 'FullOHSU_time.txt'), os.path.join(Results_folder, 'ParaOHSU_time.txt'), os.path.join(Results_folder, 'AbstractOHSU_time.txt'),
    'time_fusion_OHSU', max_docs = 1000                                                                                                              
)
                                                                
#OHSU All 6 fusion
total_fusion_OHSU = fuse_runs2(
    os.path.join(Results_folder, 'FullOHSU.txt'), os.path.join(Results_folder, 'ParaOHSU.txt'), os.path.join(Results_folder, 'AbstractOHSU.txt'),
    os.path.join(Results_folder, 'FullOHSU_time.txt'), os.path.join(Results_folder, 'ParaOHSU_time.txt'), os.path.join(Results_folder, 'AbstractOHSU_time.txt'),
    'total_fusion_OHSU', max_docs = 1000
)

File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round4_Runs_LessRM3\orig_fusion_OHSU.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round4_Runs_LessRM3\time_fusion_OHSU.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round4_Runs_LessRM3\total_fusion_OHSU.txt writen.


In [104]:
#UDel
orig_fusion_UDel = fuse_runs(
    os.path.join(Results_folder, 'FullUDel.txt'), os.path.join(Results_folder, 'ParaUDel.txt'), os.path.join(Results_folder, 'AbstractUDel.txt'),
    'orig_fusion_UDel', max_docs = 1000
)

#UDel Time
time_fusion_UDel = fuse_runs(
    os.path.join(Results_folder, 'FullUDel_time.txt'), os.path.join(Results_folder, 'ParaUDel_time.txt'), os.path.join(Results_folder, 'AbstractUDel_time.txt'),
    'time_fusion_UDel', max_docs = 1000                                                                                                              
)
                                                                
#UDel All 6 fusion
total_fusion_UDel = fuse_runs2(
    os.path.join(Results_folder, 'FullUDel.txt'), os.path.join(Results_folder, 'ParaUDel.txt'), os.path.join(Results_folder, 'AbstractUDel.txt'),
    os.path.join(Results_folder, 'FullUDel_time.txt'), os.path.join(Results_folder, 'ParaUDel_time.txt'), os.path.join(Results_folder, 'AbstractUDel_time.txt'),
    'total_fusion_UDel', max_docs = 1000
)

File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round4_Runs_LessRM3\orig_fusion_UDel.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round4_Runs_LessRM3\time_fusion_UDel.txt writen.
File C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round4_Runs_LessRM3\total_fusion_UDel.txt writen.


In [105]:
#Load fused files
orig_fusion_OHSU_csv = pd.read_csv(orig_fusion_OHSU, sep = ' ', header = None)
time_fusion_OHSU_csv = pd.read_csv(time_fusion_OHSU, sep = ' ', header = None)
total_fusion_OHSU_csv = pd.read_csv(total_fusion_OHSU, sep = ' ', header = None)
orig_fusion_UDel_csv = pd.read_csv(orig_fusion_UDel, sep = ' ', header = None)
time_fusion_UDel_csv = pd.read_csv(time_fusion_UDel, sep = ' ', header = None)
total_fusion_UDel_csv = pd.read_csv(total_fusion_UDel, sep = ' ', header = None)

#Re-name fusion runtags
orig_fusion_OHSU_csv[5] = 'OHSU_R4_origfusion'
time_fusion_OHSU_csv[5] = 'OHSU_R4_timefusion'
total_fusion_OHSU_csv[5] = 'OHSU_R4_totalfusion'
orig_fusion_UDel_csv[5] = 'OHSU_R4_origfusion_Udelgen'
time_fusion_UDel_csv[5] = 'OHSU_R4_timefusion_Udelgen'
total_fusion_UDel_csv[5] = 'OHSU_R4_totalfusion_Udelgen'

#Save csvs again
orig_fusion_OHSU_csv.to_csv(os.path.join(Results_folder, 'R4_orig_fusion.txt'), sep=' ', index=False, header=None)
time_fusion_OHSU_csv.to_csv(os.path.join(Results_folder, 'R4_time_fusion.txt'), sep=' ', index=False, header=None)
total_fusion_OHSU_csv.to_csv(os.path.join(Results_folder, 'R4_total_fusion.txt'), sep=' ', index=False, header=None)
orig_fusion_UDel_csv.to_csv(os.path.join(Results_folder, 'R4_orig_fusion_Udelgen.txt'), sep=' ', index=False, header=None)
time_fusion_UDel_csv.to_csv(os.path.join(Results_folder, 'R4_time_fusion_Udelgen.txt'), sep=' ', index=False, header=None)
total_fusion_UDel_csv.to_csv(os.path.join(Results_folder, 'R4_total_fusion_Udelgen.txt'), sep=' ', index=False, header=None)

In [108]:
#Add headers for reranking
orig_fusion_OHSU_csv.columns = ['topic', 'Q0' , 'docid', 'rank', 'score', 'qid']
time_fusion_OHSU_csv.columns = ['topic', 'Q0' , 'docid', 'rank', 'score', 'qid']
total_fusion_OHSU_csv.columns = ['topic', 'Q0' , 'docid', 'rank', 'score', 'qid']
orig_fusion_UDel_csv.columns = ['topic', 'Q0' , 'docid', 'rank', 'score', 'qid']
time_fusion_UDel_csv.columns = ['topic', 'Q0' , 'docid', 'rank', 'score', 'qid']
total_fusion_UDel_csv.columns = ['topic', 'Q0' , 'docid', 'rank', 'score', 'qid']

orig_fusion_OHSU_csv.to_csv(os.path.join(Results_folder, 'R4_orig_fusion_header.txt'), sep=' ', index=False, header=True)
time_fusion_OHSU_csv.to_csv(os.path.join(Results_folder, 'R4_time_fusion_header.txt'), sep=' ', index=False, header=True)
total_fusion_OHSU_csv.to_csv(os.path.join(Results_folder, 'R4_total_fusion_header.txt'), sep=' ', index=False, header=True)
orig_fusion_UDel_csv.to_csv(os.path.join(Results_folder, 'R4_orig_fusion_Udelgen_header.txt'), sep=' ', index=False, header=True)
time_fusion_UDel_csv.to_csv(os.path.join(Results_folder, 'R4_time_fusion_Udelgen_header.txt'), sep=' ', index=False, header=True)
total_fusion_UDel_csv.to_csv(os.path.join(Results_folder, 'R4_total_fusion_Udelgen_header.txt'), sep=' ', index=False, header=True)

In [95]:
doc = full_searcher.doc('5v1tpi9n')
doc.raw()

'{"paper_id":"0dc4c9c4851bb6fc41b70b449a9fc27dcec3f002","metadata":{"title":"Transmission dynamics of the COVID-19 epidemic in India, and evaluating the impact of asymptomatic carriers and role of expanded testing in the lockdown exit strategy: a modelling approach Original Research Article","authors":[{"first":"Mohak","middle":[],"last":"Gupta","suffix":"","affiliation":{},"email":"xmohakgupta@gmail.com"},{"first":"Saptarshi","middle":[],"last":"Soham","suffix":"","affiliation":{},"email":""},{"first":"Mohanta","middle":[],"last":"2#","suffix":"","affiliation":{"laboratory":"","institution":"Indian Institute of Science Education and Research (IISER)","location":{"settlement":"Pune"}},"email":""},{"first":"Aditi","middle":[],"last":"Rao","suffix":"","affiliation":{},"email":""},{"first":"Giridara","middle":[],"last":"Gopal Parameswaran","suffix":"","affiliation":{},"email":""},{"first":"Mudit","middle":[],"last":"Agarwal","suffix":"","affiliation":{},"email":""},{"first":"Mehak","middl