In [1]:
import xml.etree.ElementTree as Et

from pyserini.search import pysearch
from pyserini.search import pyquerybuilder
from tqdm import tqdm
from collections import defaultdict
from pprint import pprint

import subprocess

from pyserini.pyclass import autoclass
from pyserini.analysis.pyanalysis import get_lucene_analyzer

In [2]:
# Open the pre-built Lucene index that every query below is run against.
# NOTE(review): absolute, machine-specific path; the directory name suggests a
# paragraph-level CORD-19 index (2020-05-01 snapshot) -- confirm before re-running elsewhere.
searcher = pysearch.SimpleSearcher('/home/chris/data/anserini/lucene-index-cord19-paragraph-2020-05-01/')

In [12]:
# Allow-list of document ids: only ids present in this file may be written to
# the run. The file is read as one id per line; surrounding whitespace is
# stripped and blank lines are ignored so that '' never ends up in the set.
docid_path = 'docids-rnd2.txt'
with open(docid_path, 'r') as f:
    valid = {docid for docid in (line.strip() for line in f) if docid}

In [3]:
# Input files. The topics file is parsed for the query text; the qrels file is
# used further down only to exclude documents that already have a judgment.
# NOTE(review): the topics are "rnd2" while the qrels are "rnd1" -- presumably
# intentional (skip documents judged in the previous round); confirm.
# NOTE(review): absolute, machine-specific paths.
topics_path = '/home/chris/data/topics/topics-rnd2.xml'
qrel_path = '/home/chris/data/qrels/qrels-rnd1.txt'

In [4]:
# Parse the topics XML and pull two text fields out of every child of the root.
# Iterating over the root (instead of a hard-coded range(35)) keeps this correct
# when the file holds a different number of topics: no IndexError on fewer,
# no silent truncation on more.
tree = Et.parse(topics_path)
root = tree.getroot()
# First sub-element of each topic: the text used as the query in the run loop.
topics = [topic[0].text for topic in root]
# Second sub-element of each topic.
# NOTE(review): the name suggests "narratives", but this is positional child 1;
# verify against the topics file that this is the intended field.
nars = [topic[1].text for topic in root]

In [5]:
# Map topic number -> set of document ids that already have a judgment.
# Lines are split on arbitrary whitespace, so the parser works whether fields
# are separated by single spaces, doubled spaces (which the original
# split(' ') relied on to produce an empty extra field) or tabs.
judged = defaultdict(set)
with open(qrel_path, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        if len(fields) < 4:
            raise ValueError(f'malformed qrels line: {line!r}')
        # Layout: topic, iteration, docid, relevance -> docid is second to last.
        topicno, docid = fields[0], fields[-2]
        judged[int(topicno)].add(docid)

In [6]:
# Query-expansion vocabulary: one keyword list per task. Each topic is assigned
# one of these lists (via topic_task_no below) and its keywords are added to the
# topic's query as lower-weighted terms.
# Misspelled keywords were corrected ('respiratory', 'chloroquine',
# 'immune response', 'personal protective equipment'): a misspelled expansion
# term cannot match the correctly spelled word in the indexed text.
tasks = [
    # 0: transmission / incubation / environmental stability
    ['environmental transmission', 'incubation', 'contagious', 'persistence', 'stability', 'physical', 'weather',
     'epidemiology', 'shedding', 'reproductive number', 'modes of transmission', 'virulent', 'asymptomatic', 'pathogen',
     'evolutionary host', 'transmission host'],
    # 1: risk factors
    ['smoking', 'risk', 'pulmonary', 'pre-condition', 'co-infection', 'high-risk', 'severe', 'susceptible', 'fatality',
     'neonates', 'respiratory', 'condition', 'pre-existing', 'pregnant', 'morbidities'],
    # 2: genetics / origin / evolution
    ['human-animal', 'origin', 'genetics', 'evolution', 'genome', 'sample sets', 'genomic', 'strain', 'livestock',
     'animal host', 'natural history', 'genetic drift', 'mutation', 'genomics', 'sequencing'],
    # 3: vaccines / therapeutics
    ['vaccine', 'therapeutic', 'treat', 'drugs', 'pharmaceuticals', 'recipients', 'ADE', 'complication', 'antiviral',
     'prophylaxis', 'chloroquine', 'vaccination', 'immune response'],
    # 4: medical care
    ['medical care', 'surge capacity', 'nursing home', 'allocation', 'personal protective equipment',
     'clinical characterization', 'nursing', 'care', 'Extracorporeal membrane oxygenation', 'ECMO',
     'mechanical ventilation', 'extrapulmonary manifestations', 'cardiomyopathy', 'cardiac arrest',
     'regulatory standards', 'N95 masks', 'elastomeric respirators', 'telemedicine', 'steroids', 'high flow oxygen',
     'supportive interventions'],
    # 5: non-pharmaceutical interventions
    ['NPI', 'non-pharmaceutical intervention', 'school closure', 'travel ban', 'quarantine', 'mass gathering',
     'social distancing', 'public health advice', 'economic impact'],
    # 6: geographic spread
    ['counties', 'geographic', 'geography', 'mortality rate', 'spread', 'mutations'],
    # 7: diagnostics / surveillance
    ['diagnostic', 'surveillance', 'detection', 'screening', 'ELISAs', 'capacity', 'testing', 'point-of-care',
     'rapid testing', 'pathogen', 'reagent', 'cytokines', 'response markers', 'swabs'],
    # 8: ethics / social science
    ['ethical', 'social science', 'principles', 'standards', 'ethics', 'psychological health', 'fear', 'anxiety',
     'stigma', 'sociology'],
    # 9: information sharing / collaboration
    ['collaboration', 'nomenclature', 'data standards', 'information sharing', 'communication', 'collaborate',
     'coordination', 'misunderstanding', 'action plan']
]

# topic_task_no[k] is the index into `tasks` used for the k-th topic (0-based
# position in `topics`). Must contain one entry per topic.
topic_task_no = [2,0,3,0,3,6,3,6,7,5,4,5,0,0,0,0,3,5,5,1,0,0,1,1,6,6,6,3,3,3,2,2,3,1,9]

In [7]:
# Words dropped from both term lists before a query is built.
stops = {'of', 'to', 'the', 'for', 'in', 'on', 'and'}

def build_query(original, extension, extension_split):
    """Build a boolean query of boosted SHOULD term clauses.

    Args:
        original: iterable of terms from the topic itself.
        extension: iterable of expansion terms.
        extension_split: share of the total boost given to ``extension``;
            ``original`` receives ``1 - extension_split``.

    Returns:
        Whatever ``builder.build()`` of the pyquerybuilder boolean builder returns.

    Terms found in ``stops`` are removed first (exact string match). Within each
    list the list's share is divided evenly, so every remaining term is boosted
    by ``share / number_of_remaining_terms``. Original terms are added before
    extension terms.
    """
    kept_original = [term for term in original if term not in stops]
    kept_extension = [term for term in extension if term not in stops]
    weighted_groups = (
        (kept_original, 1 - extension_split),
        (kept_extension, extension_split),
    )

    builder = pyquerybuilder.get_boolean_query_builder()
    for terms, group_share in weighted_groups:
        for term in terms:
            term_query = pyquerybuilder.get_term_query(term)
            boosted = pyquerybuilder.get_boost_query(term_query, group_share/len(terms))
            builder.add(boosted, pyquerybuilder.JBooleanClauseOccur['should'].value)
    return builder.build()

In [14]:
# Write the run file: for every topic, search with the expanded query and emit
# up to `docs_per_topic` documents in TREC run format
# (topic Q0 docid rank score tag), skipping documents that were already
# written for this topic, already have a judgment, or are not in `valid`.
split = .2              # share of the total boost given to the task (expansion) terms
max_hits = 5000         # hits requested per topic before filtering
docs_per_topic = 100    # maximum number of lines written per topic
qrel_seen = 0           # written documents that have a judgment (0 by construction, see below)
total = 0               # total number of lines written
filename = 'ru-t-exp-rnd2.run'
run_tag = filename[:-4]  # run file name without the '.run' extension

with open(filename, 'w') as f:
    for topicno, topic in tqdm(enumerate(topics), total=len(topics)):
        topic_id = topicno + 1  # topic numbers in the run/qrels are 1-based
        # split() (not split(' ')) so repeated spaces never yield '' terms.
        topic_terms = topic.split()
        # Flatten multi-word task keywords into single-word terms.
        task_terms = [word
                      for phrase in tasks[topic_task_no[topicno]]
                      for word in phrase.split()]
        query = build_query(topic_terms, task_terms, split)
        hits = searcher.search(query, max_hits)

        seen = set()
        rank = 0
        # Iterate over the hits that actually came back: the original
        # `while i < 100: hits[j]` raised IndexError as soon as the hit list was
        # exhausted before 100 documents had been accepted.
        for hit in hits:
            if rank >= docs_per_topic:
                break
            # Keep only the part of the id before the first '.'.
            # NOTE(review): presumably strips a paragraph suffix so that hits
            # collapse to one entry per document -- confirm against the index.
            docid = hit.docid.split('.')[0]
            if docid in seen or docid in judged[topic_id] or docid not in valid:
                continue
            rank += 1
            f.write(f'{topic_id} Q0 {docid} {rank} {hit.score} {run_tag}\n')
            seen.add(docid)

            # Sanity counter: judged documents are filtered out above, so this
            # can only stay at 0; a non-zero value would indicate a filter bug.
            if docid in judged[topic_id]:
                qrel_seen += 1
            total += 1

        if rank < docs_per_topic:
            print(f'topic {topic_id}: only {rank} of {docs_per_topic} documents written')

35it [00:59,  1.56s/it]


In [9]:
# Number of written documents that already had a judgment. The run loop skips
# judged documents before writing, so the expected value is 0.
qrel_seen

0

In [16]:
!head -n 30 ru-t-exp-rnd2.run

1 Q0 sqrn6kjy 1 1.8131999969482422 ru-t-exp-rnd2
1 Q0 ne5r4d4b 2 1.7706999778747559 ru-t-exp-rnd2
1 Q0 2054tkb7 3 1.7668999433517456 ru-t-exp-rnd2
1 Q0 d6by9p41 4 1.7547999620437622 ru-t-exp-rnd2
1 Q0 zqf351sv 5 1.7525999546051025 ru-t-exp-rnd2
1 Q0 4iwddq2u 6 1.7404999732971191 ru-t-exp-rnd2
1 Q0 jr255dwn 7 1.722100019454956 ru-t-exp-rnd2
1 Q0 djclli8n 8 1.7196999788284302 ru-t-exp-rnd2
1 Q0 imvbkt69 9 1.7128000259399414 ru-t-exp-rnd2
1 Q0 431ksdno 10 1.7055000066757202 ru-t-exp-rnd2
1 Q0 m8jf9n6x 11 1.7034000158309937 ru-t-exp-rnd2
1 Q0 wuegn0jg 12 1.7009999752044678 ru-t-exp-rnd2
1 Q0 pwvcwlh8 13 1.7000000476837158 ru-t-exp-rnd2
1 Q0 9sofxhj7 14 1.6996999979019165 ru-t-exp-rnd2
1 Q0 xxblr8qd 15 1.6959999799728394 ru-t-exp-rnd2
1 Q0 9dwpnvxf 16 1.690500020980835 ru-t-exp-rnd2
1 Q0 58hjolfz 17 1.687399983406067 ru-t-exp-rnd2
1 Q0 6pbaa6u4 18 1.6827991008758545 ru-t-exp-rnd2
1 Q0 rq9hmjsx 19 1.6825000047683716 ru-t-exp-rnd2
1 Q0 zcy4hpqp 20 1.6821000576019287 ru-t-ex

In [17]:
# Displays the whole allow-list of document ids.
# NOTE(review): this prints every element of the set; a size or a small sample
# would keep the notebook output readable.
valid

{'74c69z28',
 'hwow5eie',
 '3oo9u8k8',
 'p9r733t4',
 'zlupioxp',
 's2g5xqtz',
 'dimdlzwc',
 'yoxzf1yx',
 '6e2urzjm',
 'u9kv4113',
 'fxlbx84w',
 '6oca1mrm',
 'ujo72w06',
 'twwqo66v',
 'zqsi6k4q',
 'e00koo91',
 'qmo3o8cg',
 '6x39kaym',
 'p39caig9',
 'b49q7944',
 'h7ffv0xb',
 'n950aw6u',
 'zp7v1s9s',
 'yhzo0qqx',
 'f15rvjok',
 'hdc509ww',
 'gbmjuhgv',
 '9lkfjvwo',
 'ptuqsq0q',
 'vbqjgjah',
 'pgl7aebd',
 '218pgsd6',
 '4rhlkhji',
 'mneil7gn',
 'ftsmf9yh',
 '5qz12dcu',
 'j90jc0rj',
 '3g6uswt9',
 'n3muqhup',
 'qye9de5o',
 'nik7xizn',
 '5y0guel5',
 'kko3422g',
 'ch4pcy21',
 'ww0c2hac',
 'p2fjtg8l',
 't6b3bzq7',
 'j3h5krys',
 'tymoeyoo',
 '03c9rx3o',
 '4ba4h18s',
 'b62mk859',
 'l7iovqmg',
 'ntjngiem',
 'a47hcrim',
 'y4t4uu9l',
 'y9pzsoqa',
 'vr5hnzp8',
 '1udn20wo',
 '2k1nh74w',
 'lwx79950',
 'a4ynxgxz',
 'l7eq3oeb',
 'g5zdtqnb',
 'hx51zg4a',
 'np78ybqs',
 'uq2gvye9',
 'wf0zbd0j',
 'agnmfkoq',
 'h7wp91r8',
 'x672pd0y',
 'c9ts2g7w',
 'y2tcqh16',
 'jm60nxc2',
 'tm7krxxw',
 'gpabfr9o',
 '0cxdtr7r',