In [1]:
from NER_processing import nlp, disease_service, chemical_service, genetic_service
import pysolr

# Replicate paragraphs in local

In [2]:
solr = pysolr.Solr('http://librairy.linkeddata.es/data/covid-paragraphs', timeout=10)

In [3]:
solr_local = pysolr.Solr('http://localhost:8984/solr/covid_paragraphs', timeout=10)

In [7]:
solr_local.ping()

'{\n  "responseHeader":{\n    "zkConnected":null,\n    "status":0,\n    "QTime":1,\n    "params":{\n      "q":"{!lucene}*:*",\n      "distrib":"false",\n      "df":"_text_",\n      "rows":"10",\n      "echoParams":"all",\n      "rid":"-3"}},\n  "status":"OK"}\n'

In [4]:
results = solr.search(q="*:*", rows=60000,sort="id asc")

In [None]:
j=0
paragraphs = []
for result in results:
    paragraph = {}
    if ('section_s' in result):
        paragraph['section_s'] = result['section_s']
    if ('id' in result):
        paragraph['id'] = result['id']
    if ('article_id_s' in result):
        paragraph['article_id_s'] = result['article_id_s']
    if ('text_t' in result):
        paragraph['text_t'] = result['text_t']
    if ('size_i' in result):
        paragraph['size_i'] = result['size_i']
    if ('name_s' in result):
        paragraph['name_s'] = result['name_s']
    paragraphs.append(paragraph)
    j+=1
    if j%10000 == 0:
        solr_local.add(paragraphs)
        print(j, 'paragraphs added')
        paragraphs = []

# Annotate in local

In [9]:
import time

In [51]:
solr_local = pysolr.Solr('http://localhost:8984/solr/covid_paragraphs', always_commit=True, timeout=50)

In [47]:
paragraphs = solr_local.search(q='*:*', rows=200, start=200, sort="id ASC")

In [48]:
print(len(paragraphs))

200


In [55]:
paragraphs_processed = []
print("reading from solr..")
counter = 0
completed = False
window_size= 100
cursor = "*"
while (not completed):
    old_counter = counter        
    
    try:
        paragraphs = solr_local.search(q="*:*", rows=window_size, start=counter, sort="id asc")
#         cursor = paragraphs.nextCursorMark
        
        for p in paragraphs:
            paragraph = {}
            paragraph['id'] = p['id']
            if ('section_s' in p):
                paragraph['section_s'] = p['section_s']
            if ('id' in p):
                paragraph['id'] = p['id']
            if ('article_id_s' in p):
                paragraph['article_id_s'] = p['article_id_s']
            if ('size_i' in p):
                paragraph['size_i'] = p['size_i']
            if ('name_s' in p):
                paragraph['name_s'] = p['name_s']
            if ('text_t' in p):
                paragraph['text_t'] = p['text_t']
                doc = nlp(str(paragraph['text_t']))            
                paragraph['disease_ents'] = [f.text for f in doc.ents if f.label_ == 'DISEASE']
                paragraph['chemical_ents'] = [f.text for f in doc.ents if f.label_ == 'CHEMICAL']
                paragraph['genetic_ents'] = [f.text for f in doc.ents if f.label_ == 'GENETIC']
            paragraphs_processed.append(paragraph)
            
        counter += len(paragraphs)
        print(counter,"docs evaluated")
        
        if counter % window_size == 0:
            print(paragraphs_processed[0])
            solr_local.add(paragraphs_processed)
            print(counter, 'paragraphs annotated')
            paragraphs_processed=[]
            
        if (old_counter == counter):
            print("done!")
            break
            
    except Exception as e:
        print(repr(e))
        print("Solr query error. Wait for 5secs..")
        time.sleep(5.0)

reading from solr..
100 docs evaluated
{'id': '0000017c2479d436783516b1fe7ff927', 'section_s': 'disease spreading in a quickly adaptive network structure', 'article_id_s': '6e48dbcd16f8a5ac136397a92e6a6bad4d52a6bb', 'size_i': 122, 'name_s': 'disease spreading in a quickly adaptive network structure', 'text_t': ['This quantity exceeds zero whenever Note that taking r D 0 yields the basic reproductive ratio R A 0 for both SIR and SIS:'], 'disease_ents': [], 'chemical_ents': [], 'genetic_ents': []}
100 paragraphs annotated
200 docs evaluated
{'id': '0002d778d037e737863800a22cc88f06', 'section_s': 'reference', 'article_id_s': 'e5d54917d149ed87572748aafa6b0847668ce5a6', 'size_i': 117, 'name_s': 'reference', 'text_t': ['Hiestand BC, Smith SW. Cocaine chest pain: between a (crack) rock and a hard place. Acad Emerg Med. 2011;18(1):68-71.'], 'disease_ents': ['chest pain'], 'chemical_ents': ['Cocaine'], 'genetic_ents': []}
200 paragraphs annotated
300 docs evaluated
{'id': '000601bccdd22fa74af9a

KeyboardInterrupt: 

In [44]:
class CORD19Processor:
    def __init__(self,window_size):
        self.covid_articles = pysolr.Solr('http://librairy.linkeddata.es/data/covid', timeout=10)
        self.covid_paragraphs = pysolr.Solr('http://librairy.linkeddata.es/data/covid-paragraphs', timeout=10)
        self.window_size = window_size
        self.annotated_paragraphs = self.annotate_paragraphs()
    
    
    def annotate_paragraphs(self):
        counter = 0
        completed = False
        cursor = "*"
        i=0
        j=0
        processed_paragraphs = []
        while (not completed):
            old_counter = counter
            try:
                results = self.covid_paragraphs.search(q="*:*",cursorMark=cursor, rows=self.window_size,sort="id asc")
                cursor = results.nextCursorMark
                counter += len(results)
                paragraphs = []
                print(len(results))
                for result in results:
                    paragraph = {}
                    if ('section_s' in result):
                        paragraph['section_s'] = result['section_s']
                    if ('id' in result):
                        paragraph['id'] = result['id']
                    if ('article_id_s' in result):
                        paragraph['article_id_s'] = result['article_id_s']
                    if ('text_t' in result):
                        paragraph['text_t'] = result['text_t']
                    if ('size_i' in result):
                        paragraph['size_i'] = result['size_i']
                    if ('name_s' in result):
                        paragraph['name_s'] = result['name_s']
                    paragraphs.append(paragraph)
                    j+=1
                    if j%100000 == 0:
                        print(j, 'paragraphs retrieved')
                print(len(paragraphs),'paragraphs retrieved')
                if (old_counter == counter):
                    print('Final')
                    break
        
                for paragraph in paragraphs:
                    doc = nlp(paragraph['text_t'])
                    paragraph['disease_ents'] = [f for f in doc.ents if f.label_ == 'DISEASE']
                    paragraph['chemical_ents'] = [f for f in doc.ents if f.label_ == 'CHEMICAL']
                    paragraph['genetic_ents'] = [f for f in doc.ents if f.label_ == 'GENETIC']
                    i+=1
                    if i%10000 == 0:
                        print(i, 'paragraphs processed')
                    processed_paragraphs.append(paragraph)
               
                completed = True
            except Exception as e:
                print(repr(e))
                        
        return paragraph

        
            
            
            
            

In [45]:
CORD19Processor(window_size=200)

2103891
100000 paragraphs retrieved
200000 paragraphs retrieved
300000 paragraphs retrieved
400000 paragraphs retrieved
500000 paragraphs retrieved
600000 paragraphs retrieved
700000 paragraphs retrieved
800000 paragraphs retrieved
900000 paragraphs retrieved
1000000 paragraphs retrieved
1100000 paragraphs retrieved
1200000 paragraphs retrieved
1300000 paragraphs retrieved
1400000 paragraphs retrieved
1500000 paragraphs retrieved
1600000 paragraphs retrieved
1700000 paragraphs retrieved
1800000 paragraphs retrieved
1900000 paragraphs retrieved
2000000 paragraphs retrieved
2100000 paragraphs retrieved
2103891 paragraphs retrieved
100 paragraphs processed
{'section_s': 'background', 'id': '0002d368d74a23c546db9bc6098c37fe', 'article_id_s': '744f2ba295872a8f27eae0204ba16b3afb3bd7c9', 'text_t': 'Aseptic meningitis, encephalitis, and myelitis are inflammatory conditions of the central nervous system (CNS) (meninges, brain, and spinal cord, respectively). Disease is caused by a variety of in

KeyboardInterrupt: 

In [4]:
process_cord19()

1000 paragraphs retrieved
100 paragraphs processed
{'section_s': 'subunit vaccines', 'id': '43c7274ceb6399f07a7e0a52bbcd30ba', 'article_id_s': 'a9754e66d1d118157221031daab60604bab2ad14', 'text_t': 'Both MERS-CoV and SARS-CoV RBDs consist of a core and a receptor-binding subdomain. They share a high degree of structural similarity in the core subdomains, but the receptor-binding subdomains are notably divergent [32] . Because of variation of the receptor-binding subdomain region within subgroups of the same CoV or across different CoV groups, it is advisable to rationally design MERS subunit vaccines using chimeric S protein containing several neutralizing epitopes from divergent subgroups. This approach will provide a strategic platform for the rational design of subunit vaccines against future emerging CoVs by focusing on the chimeric S protein containing neutralizing epitopes from multiple virus strains across different subgroups [94] .', 'size_i': 754, 'disease_ents': [MERS-CoV, SAR

In [4]:
doc = nlp('pneumonia')

0
0
