In [1]:
import xml.etree.ElementTree as Et

from pyserini.search import pysearch
from pyserini.search import pyquerybuilder
from tqdm import tqdm
from collections import defaultdict
from pprint import pprint

import subprocess

from pyserini.pyclass import autoclass
from pyserini.analysis.pyanalysis import get_lucene_analyzer

In [2]:
# Open a searcher over the Lucene index stored at this directory.
index_dir = '/home/chris/data/anserini/lucene-index-cord19-paragraph-2020-05-01/'
searcher = pysearch.SimpleSearcher(index_dir)

In [3]:
# Build the set of allowed document ids: one id per line, surrounding
# whitespace stripped.
docid_path = 'docids-rnd2.txt'
with open(docid_path, 'r') as f:
    valid = {raw_line.strip() for raw_line in f}

In [4]:
# Input files read by the following cells.
# qrel_path: existing relevance judgments (loaded into `judged`).
qrel_path = '/home/chris/data/qrels/qrels-rnd1.txt'
# topics_path: topic definitions in XML (loaded into `topics` / `nars`).
topics_path = '/home/chris/data/topics/topics-rnd2.xml'

In [5]:
# Read the text of the first and second child element of each of the first
# 35 children of the XML root.
# NOTE(review): `nars` takes child index 1 of every topic element; confirm
# that this is the narrative field the name suggests.
tree = Et.parse(topics_path)
root = tree.getroot()
topics, nars = [], []
for i in range(35):
    topic_node = root[i]
    topics.append(topic_node[0].text)
    nars.append(topic_node[1].text)

In [6]:
# Map topic number -> set of document ids that already have a judgment.
judged = defaultdict(set)
with open(qrel_path, 'r') as f:
    for line in f:
        # Split on any run of whitespace. The previous parser split on single
        # spaces and required exactly five columns (one of them an unused
        # `empty` column), so it failed on a plain four-column or tab-separated
        # file. Taking the docid as the second-to-last field works for both.
        fields = line.split()
        if not fields:
            continue  # ignore blank lines
        if len(fields) < 4:
            raise ValueError(f'malformed qrels line: {line!r}')
        topicno, docid = fields[0], fields[-2]
        judged[int(topicno)].add(docid)

In [7]:
# Keyword lists used for query expansion, one list per task. Multi-word
# entries are split into single words before they are handed to build_query.
# Spelling fixes relative to the previous version (misspelled keywords cannot
# match the correctly spelled words in the documents):
#   'respitory' -> 'respiratory', 'cloroquine' -> 'chloroquine',
#   'immume respone' -> 'immune response',
#   'personal protective equirement' -> 'personal protective equipment'.
tasks = [
    ['environmental transmission', 'incubation', 'contagious', 'persistence', 'stability', 'physical', 'weather',
     'epidemiology', 'shedding', 'reproductive number', 'modes of transmission', 'virulent', 'asymptomatic', 'pathogen',
     'evolutionary host', 'transmission host'],
    ['smoking', 'risk', 'pulmonary', 'pre-condition', 'co-infection', 'high-risk', 'severe', 'susceptible', 'fatality',
     'neonates', 'respiratory', 'condition', 'pre-existing', 'pregnant', 'morbidities'],
    ['human-animal', 'origin', 'genetics', 'evolution', 'genome', 'sample sets', 'genomic', 'strain', 'livestock',
     'animal host', 'natural history', 'genetic drift', 'mutation', 'genomics', 'sequencing'],
    ['vaccine', 'therapeutic', 'treat', 'drugs', 'pharmaceuticals', 'recipients', 'ADE', 'complication', 'antiviral',
     'prophylaxis', 'chloroquine', 'vaccination', 'immune response'],
    ['medical care', 'surge capacity', 'nursing home', 'allocation', 'personal protective equipment',
     'clinical characterization', 'nursing', 'care', 'Extracorporeal membrane oxygenation', 'ECMO',
     'mechanical ventilation', 'extrapulmonary manifestations', 'cardiomyopathy', 'cardiac arrest',
     'regulatory standards', 'N95 masks', 'elastomeric respirators', 'telemedicine', 'steroids', 'high flow oxygen',
     'supportive interventions'],
    ['NPI', 'non-pharmaceutical intervention', 'school closure', 'travel ban', 'quarantine', 'mass gathering',
     'social distancing', 'public health advice', 'economic impact'],
    ['counties', 'geographic', 'geography', 'mortality rate', 'spread', 'mutations'],
    ['diagnostic', 'surveillance', 'detection', 'screening', 'ELISAs', 'capacity', 'testing', 'point-of-care',
     'rapid testing', 'pathogen', 'reagent', 'cytokines', 'response markers', 'swabs'],
    ['ethical', 'social science', 'principles', 'standards', 'ethics', 'psychological health', 'fear', 'anxiety',
     'stigma', 'sociology'],
    ['collaboration', 'nomenclature', 'data standards', 'information sharing', 'communication', 'collaborate',
     'coordination', 'misunderstanding', 'action plan']
]

# topic_task_no[i] is the index into `tasks` used for the i-th topic (0-based).
topic_task_no = [2,0,3,0,3,6,3,6,7,5,4,5,0,0,0,0,3,5,5,1,0,0,1,1,6,6,6,3,3,3,2,2,3,1,9]

In [8]:
# Tokens that build_query drops before creating term queries. Matching is
# exact, so the capitalised and punctuated variants are listed separately.
stops = {
    # lower-case words
    'an', 'and', 'are', 'at', 'be', 'but', 'by', 'for', 'if', 'in', 'is',
    'of', 'on', 'or', 'that', 'the', 'there', 'this', 'to', 'will', 'with',
    # capitalised variants
    'Are', 'Is',
    # token with trailing punctuation
    'they?',
}

def build_query(bm25, nars, task, bm25_split, nars_split, task_split):
    """Build one boolean query out of three weighted groups of terms.

    Every group is first filtered against the module-level ``stops`` set.
    Each remaining term is turned into a term query, wrapped in a boost query
    and added to the boolean query as a 'should' clause. Within a group the
    boost of a single term is ``<group split> / <number of terms left after
    stop filtering>``.

    Args:
        bm25: list of term strings (the caller passes the split topic text).
        nars: list of term strings (the caller passes the split ``nars`` entry).
        task: list of term strings (the caller passes the split task keywords).
        bm25_split: total boost distributed over the ``bm25`` terms.
        nars_split: total boost distributed over the ``nars`` terms.
        task_split: total boost distributed over the ``task`` terms.

    Returns:
        The query object produced by the boolean query builder.
    """
    should = pyquerybuilder.JBooleanClauseOccur['should'].value
    builder = pyquerybuilder.get_boolean_query_builder()

    def add_group(terms, group_boost):
        # Add one boosted 'should' clause per usable term of a group.
        kept = [term for term in terms if term not in stops]
        for term in kept:
            try:
                term_query = pyquerybuilder.get_term_query(term)
            except Exception as exc:
                # A term that cannot be turned into a term query is reported
                # and skipped. Previously only the `nars` group was protected
                # (with a bare `except:`); a failing topic or task term
                # aborted the whole query.
                print(f'skipping query term {term!r}: {exc}')
                continue
            # len(kept) deliberately includes terms skipped above, so the
            # per-term boost matches the previous computation.
            boostquery = pyquerybuilder.get_boost_query(term_query, group_boost / len(kept))
            builder.add(boostquery, should)

    # Clause order is unchanged: topic terms, then task terms, then nars terms.
    add_group(bm25, bm25_split)
    add_group(task, task_split)
    add_group(nars, nars_split)

    return builder.build()

In [9]:
# Total boost handed to build_query for each of the three term groups.
bm25_split = .60
nar_split = .25
task_split = .15

max_hits = 5000  # hits requested from the searcher per topic
run_depth = 100  # maximum number of documents written per topic

qrel_seen = 0
total = 0
filename = 'ru-tn-exp-rnd2.run'
run_tag = filename[:-4]  # file name without the '.run' suffix
with open(filename, 'w') as f:
    for topicno, topic in tqdm(enumerate(topics), total=len(topics)):
        topic_id = topicno + 1  # `judged` and the run file are keyed by topicno + 1
        # Break every keyword phrase of the topic's task into single words.
        task = [word for phrase in tasks[topic_task_no[topicno]] for word in phrase.split(' ')]
        topic = topic.split(' ')
        nar = nars[topicno].split(' ')
        query = build_query(topic, nar, task, bm25_split, nar_split, task_split)
        hits = searcher.search(query, max_hits)

        seen = set()
        i = 0  # documents written for this topic so far
        # Walk the hits in order and stop when they run out. The previous
        # `while i < 100: hit = hits[j]` loop raised IndexError whenever fewer
        # than 100 eligible documents were among the returned hits.
        for hit in hits:
            if i >= run_depth:
                break
            # Only the part of the hit id before the first '.' is used; the
            # `seen` set keeps the same truncated id from being written twice.
            base_docid = hit.docid.split('.')[0]
            if base_docid in seen or base_docid in judged[topic_id] or base_docid not in valid:
                continue
            f.write(f'{topic_id} Q0 {base_docid} {i+1} {hit.score} {run_tag}\n')
            i += 1
            seen.add(base_docid)

            # Sanity check: judged ids are skipped above, so this counter is
            # expected to remain 0.
            if base_docid in judged[topic_id]:
                qrel_seen += 1
            total += 1

        if i < run_depth:
            print(f'topic {topic_id}: only {i} of {run_depth} documents written')

15it [00:31,  2.41s/it]




35it [00:59,  1.25s/it]


In [10]:
# Number of written documents that also appear in `judged`. Judged documents
# are skipped before writing, so this is expected to be 0.
qrel_seen

0

In [11]:
# Show the first lines of the run file written by the loop above.
!head ru-tn-exp-rnd2.run

1 Q0 sqrn6kjy 1 1.7396999597549438 ru-tn-exp-rnd2
1 Q0 2054tkb7 2 1.7359000444412231 ru-tn-exp-rnd2
1 Q0 xvfl7ycj 3 1.6239999532699585 ru-tn-exp-rnd2
1 Q0 d6by9p41 4 1.62090003490448 ru-tn-exp-rnd2
1 Q0 958u08vb 5 1.610200047492981 ru-tn-exp-rnd2
1 Q0 16rgt4ca 6 1.5649000406265259 ru-tn-exp-rnd2
1 Q0 zpiaka80 7 1.5645999908447266 ru-tn-exp-rnd2
1 Q0 z9dolxky 8 1.55239999294281 ru-tn-exp-rnd2
1 Q0 9jb3w0zu 9 1.5455000400543213 ru-tn-exp-rnd2
1 Q0 0khg28ex 10 1.539199948310852 ru-tn-exp-rnd2


In [12]:
# Show the first lines of a second run file for comparison.
# NOTE(review): no cell visible here writes this file; it must already exist on disk.
!head ru-t-exp-rnd2.run

1 Q0 sqrn6kjy 1 1.8131999969482422 ru-t-exp-rnd2
1 Q0 ne5r4d4b 2 1.7706999778747559 ru-t-exp-rnd2
1 Q0 2054tkb7 3 1.7668999433517456 ru-t-exp-rnd2
1 Q0 d6by9p41 4 1.7547999620437622 ru-t-exp-rnd2
1 Q0 zqf351sv 5 1.7525999546051025 ru-t-exp-rnd2
1 Q0 4iwddq2u 6 1.7404999732971191 ru-t-exp-rnd2
1 Q0 jr255dwn 7 1.722100019454956 ru-t-exp-rnd2
1 Q0 djclli8n 8 1.7196999788284302 ru-t-exp-rnd2
1 Q0 imvbkt69 9 1.7128000259399414 ru-t-exp-rnd2
1 Q0 431ksdno 10 1.7055000066757202 ru-t-exp-rnd2
