In [1]:
import xml.etree.ElementTree as Et

from pyserini.search import pysearch
from pyserini.search import pyquerybuilder
from tqdm import tqdm
from collections import defaultdict
from pprint import pprint

import subprocess

from pyserini.pyclass import autoclass
from pyserini.analysis.pyanalysis import get_lucene_analyzer

In [2]:
# Directory of the index that the searcher (and every query issued through it) reads.
index_path = '/home/chris/data/anserini/lucene-index-cord19-paragraph-2020-05-01/'
searcher = pysearch.SimpleSearcher(index_path)

In [3]:
# Read the document ids listed in the file (one per line, surrounding whitespace
# stripped) into a set so later membership checks are cheap.
docid_path = 'docids-rnd2.txt'
valid = set()
with open(docid_path, 'r') as docid_file:
    valid.update(entry.strip() for entry in docid_file)

In [4]:
# Input files: the topics XML that is parsed into query strings, and the qrels
# file whose documents are collected into `judged` and skipped when the run
# file is written.
# NOTE(review): the topics file is "rnd2" while the qrels file is "rnd1" —
# presumably intentional (excluding documents judged in the previous round); confirm.
topics_path = '/home/chris/data/topics/topics-rnd2.xml'
qrel_path = '/home/chris/data/qrels/qrels-rnd1.txt'

In [5]:
# Parse the topics XML and take the text of the first child element of each of
# the first 35 entries under the root, in document order.
tree = Et.parse(topics_path)
root = tree.getroot()
topics = []
for topic_index in range(35):
    topic_element = root[topic_index]
    topics.append(topic_element[0].text)


In [6]:
# Map each topic number (int) to the set of document ids that appear for it in
# the qrels file.
judged = defaultdict(set)
with open(qrel_path, 'r') as f:
    for line in f:
        # Split on any run of whitespace so parsing does not depend on the exact
        # number of spaces between columns (the previous `split(' ')` needed an
        # extra empty field between the iteration and docid columns).
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        if len(fields) < 4:
            raise ValueError(f'malformed qrels line: {line!r}')
        # The topic number is the first column; the docid is the second-to-last
        # column (the last one is the relevance label).
        topicno, docid = fields[0], fields[-2]
        judged[int(topicno)].add(docid)

In [7]:
# One dict per task, mapping a term to a numeric score. These scores are
# rescaled into `tasks_weights` and the resulting weights are used as per-term
# query boosts. The last two entries are empty, so topics assigned to them get
# no task terms added to their query.
# NOTE(review): the origin of the scores is not shown in this notebook — confirm.
tasks = [{'transmission': 0.033, 'mortality': 0.03, 'important': 0.027, 'common': 0.024, 'known': 0.023, 'one': 0.022, 'vaccine': 0.02}, {'clinical': 0.049, 'system': 0.045, 'risk': 0.028, 'studies': 0.027, 'evidence': 0.027, 'high': 0.023, 'large': 0.021, 'known': 0.02, 'factors': 0.02, 'highly': 0.02, 'common': 0.02, 'diagnosis': 0.019, 'identified': 0.017}, {'three': 0.035, 'host': 0.034, 'animal': 0.031, 'species': 0.03, 'early': 0.029, 'reported': 0.028, 'studies': 0.024, 'infected': 0.024, 'two': 0.022, 'animals': 0.021, 'mortality': 0.02, 'vaccine': 0.02, 'large': 0.019}, {'protein': 0.029, 'highly': 0.027, 'development': 0.024, 'vaccines': 0.024, 'although': 0.023, 'years': 0.021, 'effective': 0.021, 'assay': 0.021, 'related': 0.02}, {'clinical': 0.056, 'care': 0.046, 'diagnosis': 0.028, 'several': 0.025, 'provide': 0.025, 'studies': 0.022, 'data': 0.021, 'may': 0.019, 'critical': 0.019, 'lower': 0.019, 'confirmed': 0.018, 'identify': 0.018, 'information': 0.018, 'evidence': 0.018}, {'non': 0.035, 'important': 0.028, 'evidence': 0.025, 'control': 0.023, 'model': 0.023, 'care': 0.022, 'large': 0.022, 'although': 0.021, 'based': 0.019}, {'need': 0.046, 'control': 0.035, 'early': 0.032, 'care': 0.022, 'genome': 0.021, 'low': 0.021, 'years': 0.02, 'reported': 0.02, 'effective': 0.02, 'major': 0.02, 'developed': 0.02}, {'clinical': 0.05, 'diagnosis': 0.023, 'common': 0.019, 'major': 0.019, 'detection': 0.017, 'effective': 0.017}, {}, {}]

# topic_task_no[i] is the index into `tasks` (and `tasks_weights`) for the i-th
# topic (0-based). It has one entry for each of the 35 topics.
topic_task_no = [2,0,3,0,3,6,3,6,7,5,4,5,0,0,0,0,3,5,5,1,0,0,1,1,6,6,6,3,3,3,2,2,3,1,9]

In [8]:
# Rescale each task's term scores so the weights within one task add up to .2
# (up to float rounding). An empty task stays an empty dict: no term is ever
# divided by its zero sum.
tasks_weights = []
for term_scores in tasks:
    score_sum = sum(term_scores.values())
    tasks_weights.append(
        {term: score / score_sum * .2 for term, score in term_scores.items()}
    )
tasks_weights

[{'common': 0.026815642458100565,
  'important': 0.03016759776536314,
  'known': 0.025698324022346376,
  'mortality': 0.033519553072625705,
  'one': 0.024581005586592184,
  'transmission': 0.03687150837988828,
  'vaccine': 0.022346368715083803},
 {'clinical': 0.02916666666666667,
  'common': 0.011904761904761904,
  'diagnosis': 0.01130952380952381,
  'evidence': 0.01607142857142857,
  'factors': 0.011904761904761904,
  'high': 0.01369047619047619,
  'highly': 0.011904761904761904,
  'identified': 0.01011904761904762,
  'known': 0.011904761904761904,
  'large': 0.0125,
  'risk': 0.016666666666666666,
  'studies': 0.01607142857142857,
  'system': 0.026785714285714288},
 {'animal': 0.018397626112759642,
  'animals': 0.012462908011869434,
  'early': 0.017210682492581602,
  'host': 0.020178041543026704,
  'infected': 0.014243323442136496,
  'large': 0.011275964391691392,
  'mortality': 0.011869436201780414,
  'reported': 0.01661721068249258,
  'species': 0.01780415430267062,
  'studies': 0.

In [9]:
# Tokens dropped from a topic before its query is built. Matching is exact and
# case-sensitive, which is why capitalised forms and the punctuated 'they?'
# appear as separate entries.
stops = {
    'of', 'to', 'the', 'for', 'in', 'on',
    'and', 'is', 'will', 'Is', 'or', 'are',
    'there', 'that', 'an', 'with', 'at', 'by',
    'but', 'Are', 'be', 'this', 'if', 'they?',
}

def build_query(topic, task, bm25_split=.8):
    """Build a boolean query of boosted 'should' clauses from topic and task terms.

    Args:
        topic: iterable of topic tokens (strings). Tokens that appear in the
            module-level ``stops`` set (exact, case-sensitive match) are dropped.
        task: mapping of task term -> boost; each term is added with its boost.
        bm25_split: total boost shared evenly among the kept topic tokens.
            Defaults to .8, the value that used to be hard-coded here.

    Returns:
        The query object returned by the boolean query builder's ``build()``.
    """
    # Loop-invariant lookup, resolved once instead of once per clause.
    should = pyquerybuilder.JBooleanClauseOccur['should'].value
    builder = pyquerybuilder.get_boolean_query_builder()

    topic_terms = [token for token in topic if token not in stops]
    if topic_terms:  # guard keeps the division safe when every token was a stop word
        topic_boost = bm25_split / len(topic_terms)
        for term in topic_terms:
            term_query = pyquerybuilder.get_term_query(term)
            builder.add(pyquerybuilder.get_boost_query(term_query, topic_boost), should)

    for term, boost in task.items():
        term_query = pyquerybuilder.get_term_query(term)
        builder.add(pyquerybuilder.get_boost_query(term_query, boost), should)

    return builder.build()

In [10]:
# NOTE(review): these two values are not read by the code in this cell;
# build_query applies its own .8 and the weights cell uses a literal .2.
bm25_split = .8
task_split = .2

qrel_seen = 0  # sanity counter: written documents that were already judged
total = 0      # number of lines written across all topics
filename = 'ru-tw-exp-rnd2.run'
run_tag = filename[:-4]  # file name without the '.run' suffix
with open(filename, 'w') as f:
    for topicno, topic in tqdm(enumerate(topics), total=len(topics)):
        topic_id = topicno + 1  # topic numbers in the run file and in `judged` are 1-based
        task = tasks_weights[topic_task_no[topicno]]
        query = build_query(topic.split(' '), task)
        hits = searcher.search(query, 5000)

        rank = 0  # documents written so far for this topic
        j = 0     # position in `hits`
        seen = set()
        # Stop after 100 documents OR when the hits run out. The unbounded loop
        # raised IndexError when fewer than 100 acceptable hits were available.
        while rank < 100 and j < len(hits):
            hit = hits[j]
            j += 1
            # Hit ids look like '<docid>.<suffix>'; keep only the part before the first '.'.
            docid = hit.docid.split('.')[0]
            # Skip documents already written for this topic, already judged,
            # or not in the list of valid ids.
            if docid in seen or docid in judged[topic_id] or docid not in valid:
                continue
            rank += 1
            f.write(f'{topic_id} Q0 {docid} {rank} {hit.score} {run_tag}\n')
            seen.add(docid)

            # Judged documents are skipped above, so this should never fire;
            # a non-zero `qrel_seen` would mean the filter is broken.
            if docid in judged[topic_id]:
                qrel_seen += 1
            total += 1

35it [00:50,  1.17s/it]


In [11]:
# Sanity check: count of written documents that were already judged. Expected to
# be 0 because judged documents are skipped before anything is written.
qrel_seen

0

In [12]:
# Preview the first lines of a run file.
# NOTE(review): the run cell in this notebook writes 'ru-tw-exp-rnd2.run', not this
# file — presumably it was produced by an earlier variant of the notebook; confirm.
!head ru-tn-exp-rnd2.run

1 Q0 sqrn6kjy 1 1.7396999597549438 ru-tn-exp-rnd2
1 Q0 2054tkb7 2 1.7359000444412231 ru-tn-exp-rnd2
1 Q0 xvfl7ycj 3 1.6239999532699585 ru-tn-exp-rnd2
1 Q0 d6by9p41 4 1.62090003490448 ru-tn-exp-rnd2
1 Q0 958u08vb 5 1.610200047492981 ru-tn-exp-rnd2
1 Q0 16rgt4ca 6 1.5649000406265259 ru-tn-exp-rnd2
1 Q0 zpiaka80 7 1.5645999908447266 ru-tn-exp-rnd2
1 Q0 z9dolxky 8 1.55239999294281 ru-tn-exp-rnd2
1 Q0 9jb3w0zu 9 1.5455000400543213 ru-tn-exp-rnd2
1 Q0 0khg28ex 10 1.539199948310852 ru-tn-exp-rnd2


In [13]:
# Preview the first lines of a run file.
# NOTE(review): the run cell in this notebook writes 'ru-tw-exp-rnd2.run', not this
# file — presumably it was produced by an earlier variant of the notebook; confirm.
!head ru-t-exp-rnd2.run

1 Q0 sqrn6kjy 1 1.8131999969482422 ru-t-exp-rnd2
1 Q0 ne5r4d4b 2 1.7706999778747559 ru-t-exp-rnd2
1 Q0 2054tkb7 3 1.7668999433517456 ru-t-exp-rnd2
1 Q0 d6by9p41 4 1.7547999620437622 ru-t-exp-rnd2
1 Q0 zqf351sv 5 1.7525999546051025 ru-t-exp-rnd2
1 Q0 4iwddq2u 6 1.7404999732971191 ru-t-exp-rnd2
1 Q0 jr255dwn 7 1.722100019454956 ru-t-exp-rnd2
1 Q0 djclli8n 8 1.7196999788284302 ru-t-exp-rnd2
1 Q0 imvbkt69 9 1.7128000259399414 ru-t-exp-rnd2
1 Q0 431ksdno 10 1.7055000066757202 ru-t-exp-rnd2


In [14]:
# Preview the first lines of a run file.
# NOTE(review): this is a 'rnd3' file, while the run cell in this notebook writes
# 'ru-tw-exp-rnd2.run' — presumably produced by a different configuration; confirm.
!head ru-tw-exp-rnd3.run

1 Q0 sqrn6kjy 1 1.7417999505996704 ru-tw-exp-rnd3
1 Q0 ne5r4d4b 2 1.7325999736785889 ru-tw-exp-rnd3
1 Q0 h8cemq2n 3 1.7266000509262085 ru-tw-exp-rnd3
1 Q0 d6by9p41 4 1.7049000263214111 ru-tw-exp-rnd3
1 Q0 wuegn0jg 5 1.6937999725341797 ru-tw-exp-rnd3
1 Q0 pwvcwlh8 6 1.6830999851226807 ru-tw-exp-rnd3
1 Q0 zqf351sv 7 1.676200032234192 ru-tw-exp-rnd3
1 Q0 2054tkb7 8 1.6756999492645264 ru-tw-exp-rnd3
1 Q0 xxblr8qd 9 1.6660000085830688 ru-tw-exp-rnd3
1 Q0 3dlukfho 10 1.6649999618530273 ru-tw-exp-rnd3
