In [5]:
import pandas as pd
import numpy as np
#import torch
import os
from tqdm.auto import tqdm
import json

#Pyserini search
from pyserini import search
from jnius import autoclass

#XML parsing
import xml.etree.ElementTree as ET

#TREC_EVAL
from trectools import misc, TrecRun, TrecQrel, procedures

from datetime import datetime

from sklearn.preprocessing import MinMaxScaler

In [2]:
#Code can start here
Pyserini_files = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\Pyserini_Lucene_CORD_index'
TREC_COVID_root = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID'

#Round 2 indexes
R2_fulltext = os.path.join(Pyserini_files, 'lucene-index-cord19-full-text-2020-05-01')
R2_paragraphs = os.path.join(Pyserini_files, 'lucene-index-cord19-paragraph-2020-05-01')
R2_topics= pd.read_csv(os.path.join(Pyserini_files, 'Round2_Topics.csv'))
with open(os.path.join(Pyserini_files, 'docids-rnd2.txt')) as f:
    R2_valid = f.read().splitlines()


In [157]:
#Load qrels and extract docids searched already for each topic
qrels_file = os.path.join(TREC_COVID_root, 'Round_1_Results', 'qrels-1.txt')
qrels_table = pd.read_csv(qrels_file, sep = ' ')
qrels_table.columns = ['Topic', 'Q0' , 'Unnamed', 'docid', 'relevance']
qrels_table.drop('Unnamed', axis=1, inplace=True)
qrels_table

Unnamed: 0,Topic,Q0,docid,relevance
0,1,1.0,02f0opkr,1
1,1,1.0,04ftw7k9,0
2,1,1.0,05qglt1f,0
3,1,1.0,0604jed8,0
4,1,1.0,084o1dmp,0
...,...,...,...,...
8685,30,1.0,za3qypgg,0
8686,30,0.5,zb434ve3,1
8687,30,0.5,zio8yhuy,0
8688,30,1.0,zj34cb6f,0


In [3]:
#Query design
from pyserini.analysis.pyanalysis import get_lucene_analyzer, Analyzer
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

#Stopwords for tokenization - manual review
stopwords_manual = ['seek', 'seeking', 'look', 'looking', 'studies', 'study', 'information', 
             'about', 'range', 'studies', 'its', 'coronaviru',
            'other', '2', '19', 'well', ' will', 'from', 'have', 'more', 'covid', 'any', 'what', 
            'should', 'may', 'due', 'help', 'non', 's', 'those', 'people', 'ways', 'all', 'gain',
            'possible', 'toward', 'specifically', 'learned', 'number', 'proportion', 'including',
            'etc', 'still', 'while', 'human', 'specific', 'result', 'results', 'assess', 'need',
            'between', 'take', 'taking', 'patient', 'type', 'cause' ,'frequency', 'less', 'face',
            'likely', 'infect', 'upon', 'develop', 'represent', 'promising', 'step', 'related',
            'papers', 'describe', 'also', 'relevant', 'who', 'show', 'science', 'basic', 'complete',
            'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has', 'approach', 'Studies', 'Stud', 'Inst', 'Divi' ,'Thomae',
            'Brigham', 'Young', 'Univ', 'studies', 'volition', 'severe acute respiratory syndrome', 'affect', 'affected',
            'focus', 'discuss', 'speculative', 'must', 'include', 'draw', 'everyday', 'person', 'another', 'everyday', 'kind',
            'comparison', 'direct', 'previous', 'articles']

stopwords = list(set(stopwords.words('english')))
stopwords_manual = list(np.append(stopwords_manual, stopwords))

token_narrative_list = []

#Extract important narrative text
for i in range(len(R2_topics)):
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    tokens = analyzer.analyze(R2_topics['Narrative'][i])
    for i, word in enumerate(tokens):
        if ('exclud' in word):
            tokens = tokens[:i]
            break
    #Remove stopwords and duplicates from token
    tokens = [w for w in tokens if not w in stopwords_manual]
    tokens = list(set(tokens))
    #Remove all tokens after exclude
    
    token_narrative_list.append(tokens)

#Tokenize question
token_question_list = []

#Extract important question text - NOT USED YET
for i in range(len(R2_topics)):
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    tokens = analyzer.analyze(R2_topics['Question'][i])
    #Remove stopwords and duplicates from token
    tokens = [w for w in tokens if not w in stopwords_manual]
    tokens = list(set(tokens))
    token_question_list.append(tokens)

#Anserini searcher can take both query and keywords
#keywords_list = '2019-nCoV, SARS-CoV-2, COVID-19'
keywords_list = 'COVID-19'

#Manual keywords
manual = [
            'originated', #1
            'temperature, humidity',
            'cross-reactive, crossprotective, cross immunity, specific antibody response, neutralize, adaptive immunity',
            'harms, mortality, dead, risk factors',
            'non-human',
            'identification, detection, PCR, point of care',
            'serology, antigen, antibodies',
            'ascertainment, underestimate',
            'cov, canadian',
            'home, restriction, social-distancing', #10
            'resources, stratification',
            'closure, restricted, movement, gathering',
            'source, route, transmitted',
            'super-spreader',
            'aerosol, contact, droplet',
            'live, days, contaminated',
            'randomized, randomised, controlled',
            'personal, protective, equipment, PPE, face',
            'hygiene, alcohol-based',
            'ARBs, blocker', #20
            'death',
            'coronary',
            'blood, pressure',
            'mellitus',
            '',
            'onset, new, presentation',
            'SARS-CoV-2, 2019-nCoV',
            'chloroquine ',
            'binding',
            '', #30
            '', #31
            'type', #32
            '', #33
            'long-term, survivors', #34
            '', #35
            ]

[nltk_data] Downloading package stopwords to C:\Users\Jimmy
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Create SimpleSearcher for full text and paragraph indexes - Do this for Round 2 data
full_searcher = pysearch.SimpleSearcher(R2_fulltext)
paragraph_searcher = pysearch.SimpleSearcher(R2_paragraphs)

#Tuned hyperparameters
full_searcher.set_bm25_similarity(k1=1.5, b=0.4)
full_searcher.set_lm_dirichlet_similarity(mu = 2000)
full_searcher.set_rm3_reranker(fb_terms=20, fb_docs=10, original_query_weight=0.7)

#Tuned hyperparameters
paragraph_searcher.set_bm25_similarity(k1=1.5, b=0.4)
paragraph_searcher.set_lm_dirichlet_similarity(mu = 2000)
paragraph_searcher.set_rm3_reranker(fb_terms=20, fb_docs=10, original_query_weight=0.7)

In [5]:
#Extract search results from the searcher
def get_search_res_list(index_searcher, n_papers, paragraph = False):
    docid_list = []
    rank_list = []
    score_list = []
    topic_id_list = []
    title_list = []
    doi_list = []
    publish_list = []
    
    if (paragraph == True):
        n_papers = n_papers * 20
    
    #Search all 3 indices (abstract, full text, and paragraphs) using question as a query
    for ii, row in R2_topics.iterrows():
        query = R2_topics['Query'][ii]
        question = R2_topics['Question'][ii]
        topic_num = R2_topics['Topic'][ii]
        token_topic = ', '.join(token_narrative_list[ii])
        token_question = ','.join(token_question_list[ii])
        manual_query = manual[ii] 
        input_query = query + '. ' + token_question + '. ' +  token_topic + ', ' + manual_query + ' . ' + keywords_list 
    
        hits = index_searcher.search(q = input_query, k=n_papers)
        print(topic_num)
        #Each key is a qid, value is the anserini search list
        for i in tqdm(range(0, n_papers), position = 0, leave = True):
            topic_id_list.append(topic_num)
            if (paragraph == True):
                docid_list.append(str(hits[i].docid).split('.')[0])
            else:
                docid_list.append(hits[i].docid)
            rank_list.append(str(i+1))
            score_list.append(hits[i].score)
            title_list.append(hits[i].lucene_document.get("title"))
            doi_list.append('https://doi.org/' + str(hits[i].lucene_document.get("doi")))  
            
            #Get published date
            doc_json = json.loads(hits[i].raw)
            metadata = json.loads(doc_json['csv_metadata'])
            publish_time_str = metadata['publish_time']
            if len(publish_time_str) > 0:
                try:
                    publish_time = datetime.strptime(publish_time_str, '%Y-%m-%d')
                except:
                    publish_time = datetime.strptime(publish_time_str, '%Y')
            else:
                publish_time = ''
            
            publish_list.append(publish_time)

    return topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, publish_list

In [6]:
#Search 2000 so that we have room to remove up to 1000 docs from Round 1
#Search extra - will drop duplicates and will use extra for BERT reranking
n_papers = 2000

#Each key is a qid, value is the anserini search list
full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish = get_search_res_list(full_searcher, n_papers)
paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish = get_search_res_list(paragraph_searcher, n_papers, paragraph = True)


1


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


1


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


2


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


3


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


4


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


5


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


6


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


7


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


8


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


9


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


10


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


11


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


12


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


13


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


14


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


15


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


19


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


20


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


21


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


22


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


23


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


24


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


25


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


26


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


27


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


28


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


29


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


30


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


31


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


32


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


33


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


34


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))


35


HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))




In [172]:
#Make dataframe from lists generated from search
def TREC_df(topic_id_list, docid_list, rank_list, score_list, title_list, doi_list, publish_list, run_param, date_param, drop_1000, drop_dups):
    #Run-tag for TREC run requirements
    Q0 = ['q0'] * len(topic_id_list) 
    qid = [run_param] * len(topic_id_list)

    df  = {'topic': topic_id_list , 'q0':Q0, 'docid':docid_list, 'rank':rank_list, 
                                 'score':score_list, 'title': title_list, 'doi':doi_list,'date':publish_list, 'qid':qid}
    df = pd.DataFrame(df)
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'title', 'doi', 'date', 'qid']]
    
    #Filter by time
    if (date_param == True):
        df = df[df['date'].dt.year >= 2020]
    
    #Remove duplicates
    if (drop_dups == True):
        df.drop_duplicates(subset=['topic', 'docid'], keep='first', inplace = True)
        df.reset_index(drop=True, inplace=True) #Reset indexes to match row
    
    #Drop judged docids in R1
    index_list = []
    qrel_topics = qrels_table.groupby('Topic')   
    for topic, group in tqdm(qrel_topics, position=0, leave = True):
        #Get all docids of that topic from qrels
        topic_docid = list(group['docid'])
        for i in range(len(df)):
            if(topic == int(df['topic'][i]) and df['docid'][i] in topic_docid):
                index_list.append(i)
    
    df.drop(df.index[index_list], inplace=True)
    #Make sure all docids are valid
    df.reset_index(inplace = True)
    index_list = []
    for i in range(len(df)):
        if(df['docid'][i] not in R2_valid):
            index_list.append(i)
    
    df.drop(df.index[index_list], inplace=True)
    #Re-rank
    df['rank'] = df.groupby('topic')['score'].rank(ascending=False)
    df['rank'] = df['rank'].astype(int)
    
    #For each topic, save up to 1000 searches (since we drop duplicates)
    if (drop_1000 == True):
        df = df[df['rank'] <= 1000] 
    #Reset index
    df.reset_index(drop=True, inplace=True)
    
    #Get columns for submission
    df = df[['topic', 'q0', 'docid', 'rank', 'score', 'qid']]
    
    return df

In [173]:
full_df = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullTxt_R2_Orig', False, True, True)
full_df_time = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullTxt_R2_Time', True, True, True)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [174]:
#No date filter, don't drop to 1000, remove dups
full_df_copy = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullTxt_R2_OrigCopy', False, False, True)
#Date filter, don't drop to 1000, remove dups
full_df_time_copy = TREC_df(full_topic, full_docid, full_rank, full_score, full_title, full_doi, full_publish, 'FullTxt_R2_TimeCopy', True, False, True)
#No date filter, don't drop to 1000, Keep dups
paragraph_df = TREC_df(paragraph_topic, paragraph_docid, paragraph_rank, paragraph_score, paragraph_title, paragraph_doi, paragraph_publish, 'Paragraph_R2', False, False, False)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [175]:
#Join processed paragraph and dataframe based on score
def join_df(full_df, para_df, param):
    #In paragraph index, for each topic, average all scores for each paragraph for docids (already stripped of paragraph #)
    #Create df with columns topic, docid, score
    if (param == 'average'):
        new_df = para_df.groupby(['topic', 'docid'])['score'].mean()
        new_df = new_df.reset_index()
        
    #In paragraph index, For each topic, take the max score
    if (param == 'max'):
        new_df = para_df.drop_duplicates(subset=['topic', 'docid'], keep='first')
        new_df.reset_index(drop=True, inplace=True) #Reset indexes to match row
        new_df = new_df[['topic', 'docid', 'score']]
        
    #Merge column to full_df with score from paragraph
    new_df.columns = ['topic', 'docid', 'para_score']
    new_df.reset_index(inplace=True)
    
    full_df = full_df.merge(new_df, how='left', left_on=['topic', 'docid'],
         right_on = ['topic', 'docid'])
    full_df = full_df.drop('index', 1)

    return full_df

In [176]:
#Paragraph index - Calculate final score to re-rank for average
full_df_orig_para_avg = join_df(full_df_copy, paragraph_df, 'average')
full_df_time_para_avg = join_df(full_df_time_copy, paragraph_df, 'average')
#Paragraph index - Calculate final score to re-rank for max
full_df_orig_para_max = join_df(full_df_copy, paragraph_df, 'max')
full_df_time_para_max = join_df(full_df_time_copy, paragraph_df, 'max')

In [177]:
#Try different ways to combine scores

#1 Full + Paragraph score
full_df_orig_para_avg['Processed_score_1'] = full_df_orig_para_avg['score'] + full_df_orig_para_avg['para_score']
full_df_time_para_avg['Processed_score_1'] = full_df_time_para_avg['score'] + full_df_time_para_avg['para_score']
full_df_orig_para_max['Processed_score_1'] = full_df_orig_para_max['score'] + full_df_orig_para_max['para_score']
full_df_time_para_max['Processed_score_1'] = full_df_time_para_max['score'] + full_df_time_para_max['para_score']

#2. Full + half paragraph 
full_df_orig_para_avg['Processed_score_2'] = full_df_orig_para_avg['score'] + 0.5 * full_df_orig_para_avg['para_score']
full_df_time_para_avg['Processed_score_2'] = full_df_time_para_avg['score'] + 0.5 * full_df_time_para_avg['para_score']
full_df_orig_para_max['Processed_score_2'] = full_df_orig_para_max['score'] + 0.5 * full_df_orig_para_max['para_score']
full_df_time_para_max['Processed_score_2'] = full_df_time_para_max['score'] + 0.5 * full_df_time_para_max['para_score']

#3. Full + Quarter paragraph
full_df_orig_para_avg['Processed_score_3'] = full_df_orig_para_avg['score'] + 0.25 * full_df_orig_para_avg['para_score']
full_df_time_para_avg['Processed_score_3'] = full_df_time_para_avg['score'] + 0.25 * full_df_time_para_avg['para_score']
full_df_orig_para_max['Processed_score_3'] = full_df_orig_para_max['score'] + 0.25 * full_df_orig_para_max['para_score']
full_df_time_para_max['Processed_score_3'] = full_df_time_para_max['score'] + 0.25 * full_df_time_para_max['para_score']

#4. Half Full + paragraph
full_df_orig_para_avg['Processed_score_4'] = 0.5 * full_df_orig_para_avg['score'] + full_df_orig_para_avg['para_score']
full_df_time_para_avg['Processed_score_4'] = 0.5 * full_df_time_para_avg['score'] + full_df_time_para_avg['para_score']
full_df_orig_para_max['Processed_score_4'] = 0.5 * full_df_orig_para_max['score'] + full_df_orig_para_max['para_score']
full_df_time_para_max['Processed_score_4'] = 0.5 * full_df_time_para_max['score'] + full_df_time_para_max['para_score']

#5.Quarter Full + paragraph
full_df_orig_para_avg['Processed_score_5'] = 0.25 * full_df_orig_para_avg['score'] + full_df_orig_para_avg['para_score']
full_df_time_para_avg['Processed_score_5'] = 0.25 * full_df_time_para_avg['score'] + full_df_time_para_avg['para_score']
full_df_orig_para_max['Processed_score_5'] = 0.25 * full_df_orig_para_max['score'] + full_df_orig_para_max['para_score']
full_df_time_para_max['Processed_score_5'] = 0.25 * full_df_time_para_max['score'] + full_df_time_para_max['para_score']


In [178]:
#Re-rank 
def process_combo(df, col_name, run_tag):
    #Add run tag
    df['qid'] = run_tag
    
    #Drop duplicates
    df.drop_duplicates(subset=['topic', 'docid'], keep='first', inplace = True)
    df.reset_index(drop=True, inplace=True) #Reset indexes to match row
    
    #Re-rank, need to fill NAs
    df[col_name] = df[col_name].fillna(0)
    df['rank'] = df.groupby('topic')[col_name].rank(ascending=False)
    df['rank'] = df['rank'].astype(int)
    df = df.groupby('topic').apply(lambda x: x.sort_values('rank'))
    
    #Limit to 1000
    #For each topic, save up to 1000 searches (since we drop duplicates)
    df = df[df['rank'] <= 1000] 
    #Reset index
    df.reset_index(drop=True, inplace=True)
    
    #Select TREC-compliant columns
    df = df[['topic', 'q0', 'docid', 'rank', col_name, 'qid']]
    
    return df

In [179]:
pfull_df_orig_para_avg1 = process_combo(full_df_orig_para_avg, 'Processed_score_1', 'Orig_avg_ScorePara')
pfull_df_time_para_avg1 = process_combo(full_df_time_para_avg, 'Processed_score_1', 'Time_avg_ScorePara')
pfull_df_orig_para_max1 = process_combo(full_df_orig_para_max, 'Processed_score_1', 'Orig_max_ScorePara')
pfull_df_time_para_max1 = process_combo(full_df_time_para_max, 'Processed_score_1', 'Time_max_ScorePara')

#2. Full + half paragraph 
pfull_df_orig_para_avg2 = process_combo(full_df_orig_para_avg, 'Processed_score_2', 'Orig_avg_ScoreHalfPara')
pfull_df_time_para_avg2 = process_combo(full_df_time_para_avg, 'Processed_score_2', 'Time_avg_ScoreHalfPara')
pfull_df_orig_para_max2 = process_combo(full_df_orig_para_max, 'Processed_score_2', 'Orig_max_ScoreHalfPara')
pfull_df_time_para_max2 = process_combo(full_df_time_para_max, 'Processed_score_2', 'Time_max_ScoreHalfPara')

#3. Full + Quarter paragraph
pfull_df_orig_para_avg3 = process_combo(full_df_orig_para_avg, 'Processed_score_3', 'Orig_avg_ScoreQuarterPara')
pfull_df_time_para_avg3 = process_combo(full_df_time_para_avg, 'Processed_score_3', 'Time_avg_ScoreQuarterPara')
pfull_df_orig_para_max3 = process_combo(full_df_orig_para_max, 'Processed_score_3', 'Orig_max_ScoreQuarterPara')
pfull_df_time_para_max3 = process_combo(full_df_time_para_max, 'Processed_score_3', 'Time_max_ScoreQuarterPara')

#4. Half Full + paragraph
pfull_df_orig_para_avg4 = process_combo(full_df_orig_para_avg, 'Processed_score_4', 'Orig_avg_ScoreHalfFullPara')
pfull_df_time_para_avg4 = process_combo(full_df_time_para_avg, 'Processed_score_4', 'Time_avg_ScoreHalfFullPara')
pfull_df_orig_para_max4 = process_combo(full_df_orig_para_max, 'Processed_score_4', 'Orig_max_ScoreHalfFullPara')
pfull_df_time_para_max4 = process_combo(full_df_time_para_max, 'Processed_score_4', 'Time_max_ScoreHalfFullPara')

#5.Quarter Full + paragraph
pfull_df_orig_para_avg5 = process_combo(full_df_orig_para_avg, 'Processed_score_5', 'Orig_avg_ScoreQuarterFullPara')
pfull_df_time_para_avg5 = process_combo(full_df_time_para_avg, 'Processed_score_5', 'Time_avg_ScoreQuarterFullPara')
pfull_df_orig_para_max5 = process_combo(full_df_orig_para_max, 'Processed_score_5', 'Orig_max_ScoreQuarterFullPara')
pfull_df_time_para_max5 = process_combo(full_df_time_para_max, 'Processed_score_5', 'Time_max_ScoreQuarterFullPara')


In [180]:
Results_folder = r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round2_Runs'
full_df.to_csv(os.path.join(Results_folder, 'OHSU_R2_ORIG_v2.txt'), sep=' ', index=False, header=None)
full_df_time.to_csv(os.path.join(Results_folder, 'OHSU_R2_TIME_v2.txt'), sep=' ', index=False, header=None)

pfull_df_orig_para_avg1.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_avg1.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_avg1.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_avg1.txt'), sep=' ', index=False, header=None)
pfull_df_orig_para_max1.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_max1.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_max1.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_max1.txt'), sep=' ', index=False, header=None)

pfull_df_orig_para_avg2.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_avg2.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_avg2.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_avg2.txt'), sep=' ', index=False, header=None)
pfull_df_orig_para_max2.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_max2.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_max2.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_max2.txt'), sep=' ', index=False, header=None)

pfull_df_orig_para_avg3.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_avg3.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_avg3.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_avg3.txt'), sep=' ', index=False, header=None)
pfull_df_orig_para_max3.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_max3.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_max3.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_max3.txt'), sep=' ', index=False, header=None)


pfull_df_orig_para_avg4.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_avg4.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_avg4.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_avg4.txt'), sep=' ', index=False, header=None)
pfull_df_orig_para_max4.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_max4.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_max4.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_max4.txt'), sep=' ', index=False, header=None)


pfull_df_orig_para_avg5.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_avg5.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_avg5.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_avg5.txt'), sep=' ', index=False, header=None)
pfull_df_orig_para_max5.to_csv(os.path.join(Results_folder, 'pfull_df_orig_para_max5.txt'), sep=' ', index=False, header=None)
pfull_df_time_para_max5.to_csv(os.path.join(Results_folder, 'pfull_df_time_para_max5.txt'), sep=' ', index=False, header=None)


# TRECTOOLS

In [15]:
def trectools_eval(res_fol, qrels_file):
    
    qrels = TrecQrel(qrels_file)

    #Generate metrics for all 3 indices (1000 docs retrieved for each)
    runs = procedures.list_of_runs_from_path(res_fol, "*.txt")
    results = procedures.evaluate_runs(runs, qrels, per_query=True)
    p5 = procedures.extract_metric_from_results(results, "P_5")
    p10 = procedures.extract_metric_from_results(results, "P_10")
    Bpref = procedures.extract_metric_from_results(results, "bpref")
    Mean_avgP = procedures.extract_metric_from_results(results, 'map')

    #Aggregate results to dataframe
    runs_names = [os.path.basename(str(x)).split('.')[0] for x in runs]
    p5_list = []
    p10_list = []
    map_list = []
    bpref_list = []
    ndcg_list = []

    for i in range(len(runs)):
        p5_list.append(p5[i][1])
        p10_list.append(p10[i][1])
        map_list.append(Mean_avgP[i][1])
        bpref_list.append(Bpref[i][1])

    Result_df  = {'Run':runs_names, 'P@5': p5_list, 'P@10': p10_list, 'MAP': map_list, 'Bpref': bpref_list}
    Result_df = pd.DataFrame(Result_df)
    
    print(Result_df.sort_values('MAP', ascending = False))

In [18]:
trectools_eval(res_fol= r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs',
               qrels_file=r'C:\Users\Jimmy Chen\Box\COVID19\CORD19\Pyserini_Lucene_CORD_index\qrels-covid_d3_j2.5-3.txt')

Found 46 runs in path C:\Users\Jimmy Chen\Box\COVID19\CORD19\TREC_COVID\Round3_Runs
                          Run    P@5    P@10       MAP     Bpref
16               combo_fusion  0.790  0.7425  0.322697  0.582784
17        combo_IOWA_OHSU_run  0.790  0.7425  0.322697  0.582784
31  newrd3borda_alltopics_run  0.875  0.8400  0.318511  0.577768
42                time_fusion  0.690  0.6275  0.269141  0.571685
40             R3_time_fusion  0.690  0.6275  0.269141  0.571685
41        R3_time_fusion_1000  0.690  0.6275  0.269141  0.549705
43           time_fusion_1000  0.690  0.6275  0.269141  0.549705
6                       avg_0  0.610  0.5650  0.242025  0.514498
39        R3_orig_fusion_1000  0.690  0.6025  0.240156  0.512049
33           orig_fusion_1000  0.690  0.6025  0.240156  0.512049
45          total_fusion_1000  0.690  0.6025  0.240156  0.512049
7                       avg_0  0.665  0.5850  0.235632  0.506401
3       AbstractTxt_time_1000  0.645  0.6025  0.232578  0.481820
2     