# PySearch - a simple script to query a Solr installation with a set of TREC-COVID topics

In [65]:
#imports
import json
import os.path
import xml.etree.ElementTree as ET
from urllib.parse import quote

from requests import get


In [66]:
# Solr select endpoint; the query string is appended directly after "q=".
BASE_URL = "http://localhost:8983/solr/trec/select?q="
# Name of the Solr field that is read as the document identifier.
DOCID = "id"
# Restrict the returned fields to the document id and the score.
# (Built from DOCID; the previously referenced name ID is not defined anywhere.)
FIELDS = "&fl=" + DOCID + ",score"
# Number of rows requested from Solr per query.
ROWS = "&rows=1000"
# File the run is written to.
RUN_FILE = 'baseline-title-abstract-query.run'
# Tag written as the last column of every run line.
TAG = 'solr-bm25'
# Local file name of the topic file.
TOPICFILE = 'topics-rnd5.xml'

In [67]:
# Check if the local Solr installation is online by printing the HTTP status code
# of a request to the select endpoint.
# Only `get` is imported from requests, so it is called directly; the name
# `requests` itself is not bound in this file.
print(get(BASE_URL).status_code)

200


In [68]:
# Download the TREC-COVID topic file (round 5) from the NIST archive and save a local copy.
# The download is skipped when the local copy already exists.

if not os.path.isfile(TOPICFILE):
    # Only `get` is imported from requests, so it is called directly.
    topicsfile = get('https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml', allow_redirects=True)
    # Do not store an HTTP error page as the topic file.
    topicsfile.raise_for_status()
    # Write to the same path that is checked above and close the file deterministically.
    with open(TOPICFILE, 'wb') as topics_out:
        topics_out.write(topicsfile.content)

In [69]:
# Query the title_txt and abstract_txt fields with the query text of every topic in
# the topic file and write the hits to RUN_FILE, one line per document:
# topic number, 'Q0', document id, rank, score and TAG, separated by tabs.

# Parse the topics before the run file is opened for writing, so that a malformed
# topic file does not leave a truncated run file behind.
root = ET.parse(TOPICFILE).getroot()

with open(RUN_FILE, 'w') as f_out:
    for topic in root.findall('topic'):

        query = topic.find('query').text
        topicId = topic.attrib['number']

        # We assume that there are two indexed fields: title_txt and abstract_txt - your mileage may vary...
        # Percent-encode the whole query text (not only the spaces) so that characters
        # such as '&', '#' or '+' cannot break the request URL.
        encoded_query = quote(query)
        q = "title_txt:(" + encoded_query + ")%20abstract_txt:(" + encoded_query + ")"

        url = ''.join([BASE_URL, q, FIELDS, ROWS])
        response = get(url)
        # Stop with a clear HTTP error instead of failing later while unpacking the body.
        response.raise_for_status()
        # Named `results` rather than `json` so the imported json module is not shadowed.
        results = response.json()

        # Ranks start at 1 and follow the order in which Solr returns the documents.
        for rank, doc in enumerate(results.get('response').get('docs'), start=1):
            docid = doc.get(DOCID)
            score = doc.get('score')
            out_str = '\t'.join([topicId, 'Q0', str(docid), str(rank), str(score), TAG])
            f_out.write(out_str + '\n')