In [1]:
"""
Using a spaCy EntityRuler pipeline component to identify entities (virus terms, drug terms, etc.) in text passages.

Edit - Not sure if this is the direction we're going, so just used regex and pandas to filter the section texts for now.
"""

"\nUsing a spaCy EntityRuler pipeline component to identify entities (virus terms, drug terms, etc.) in text passages.\n\nEdit - Not sure if this is the direction we're going, so just used regex and pandas to filter the section texts for now.\n"

In [1]:
import json
import spacy
import en_core_sci_lg
from spacy.pipeline import EntityRuler
import re
import pandas as pd

In [18]:
"""
Input
"""

input_path = '../../../../resources/'
ent_ruler_jsonl_path = '%sspacy_entity_rulers/vt_task_entity_ruler_200501.jsonl' % input_path
merged_text_vec_csv_path = '%scord_titles_abstracts_conclusions.csv' % input_path
drug_terms_path = '%sspacy_entity_rulers/input/DrugNames.txt' % input_path

"""
Output
"""

output_path = '../../../../output/'

section_text_with_drugs_outpath = '%ssection_text_with_drug_mentions_200504.csv' % output_path
section_text_with_drugs_ann_outpath = '%ssection_texts_with_drug_mentions_ann_200507.csv' % output_path
section_sent_text_with_drugs_ann_outpath = '%ssection_texts_with_drug_mentions_ann_200504.csv' % output_path

In [3]:
nlp = en_core_sci_lg.load()

merged_text_vec_df = pd.read_csv(merged_text_vec_csv_path, index_col=0)

In [4]:
disabled = nlp.disable_pipes("tagger", "parser", "ner")
entity_ruler = EntityRuler(nlp)
entity_ruler.from_disk(ent_ruler_jsonl_path)
disabled.restore()

nlp.add_pipe(entity_ruler, before='ner')
nlp.remove_pipe('ner')


('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f87a8de2ac0>)

In [5]:
def annotate_text_with_named_entities(text):
    """
    Using an EntityRuler component, identify named entities and merge their component tokens.
    :param passage (str): a text passage.
    :return (Spacy.doc): a doc object that includes the identified named entities.
    """

    doc = nlp(text.lower())
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)

    return doc

In [27]:
"""
Merge all sentences belonging to each section of each paper into contiguous text passages.
"""

concat_sent_df = merged_text_vec_df.groupby(['cord_uid', 'section'], as_index=False).agg({'sentence': ' '.join})
concat_sent_df.loc[:, 'sentence'] = concat_sent_df.loc[:, 'sentence'].str.lower()

concat_sent_df= concat_sent_df.rename(columns={'sentence' :'text'})

#Adding a space before and after each text to allow regex matches at start and end of text.
concat_sent_df['text_spaced'] = ' ' + concat_sent_df.loc[:, 'text'] + ' '


In [7]:
"""
Read drug terms from reference file and generate regex pattern for matching.
"""

with open(drug_terms_path) as f:
    drug_terms = ['%s' % i.lower() for i in f.read().splitlines()]
    
drug_terms_pattern =  '\W' + '\W|\W'.join(drug_terms) + '\W'

drug_terms_pattern = drug_terms_pattern.replace('.', '\.')

In [8]:
"""
Filter to rows where section text contains drug term regex pattern.
"""

contain_drug_mask = concat_sent_df['text_spaced'].str.contains(drug_terms_pattern, case=False)
section_text_with_drugs_df = concat_sent_df[contain_drug_mask]

In [9]:
contain_drug_mask.value_counts()

False    9169
True      800
Name: text_spaced, dtype: int64

In [None]:
"""
Manual annotation of drug terms used.

"""

section_text_with_drugs_ann_df = section_text_with_drugs_df.copy()

section_text_with_drugs_ann_df['drug_terms_used'] = ''

for index, row in section_text_with_drugs_ann_df.iterrows():
    
    drugs_used = []
    for drug in drug_terms:
        if drug in row.text:
            drugs_used.append(drug)
    
    section_text_with_drugs_ann_df.at[index, 'drug_terms_used'] = ','.join(drugs_used)

In [21]:
"""
Writing section texts to outfile.

"""

section_text_with_drugs_ann_df = section_text_with_drugs_ann_df.drop(columns=['text'])
section_text_with_drugs_ann_df.to_csv(section_text_with_drugs_ann_outpath)

In [19]:

#section_text_with_drugs_ann_df = pd.read_csv(section_text_with_drugs_ann_outpath, index_col=0)
section_sent_text_with_drugs_ann_outpath

'../../../../output/section_texts_with_drug_mentions_ann_200504.csv'

In [29]:
section_sent_texts_with_drugs_df = pd.merge(merged_text_vec_df, section_text_with_drugs_ann_df, how='inner', left_on=['cord_uid', 'section'], right_on=['cord_uid', 'section'])

section_sent_texts_with_drugs_df = section_sent_texts_with_drugs_df.drop(columns=['text_spaced'])

section_sent_texts_with_drugs_df.to_csv(section_sent_text_with_drugs_ann_outpath)

In [52]:
#Verify that final cord_uid : sections included matches section_text_with_drugs_ann_df
section_sent_cord_uid_section_dict = section_sent_texts_with_drugs_df.groupby('cord_uid')['section'].apply(set).to_dict()

section_passage_cord_uid_section_dict = section_text_with_drugs_ann_df.groupby('cord_uid')['section'].apply(set).to_dict()

print(section_passage_cord_uid_section_dict == section_sent_cord_uid_section_dict)

True


In [49]:
prior_section_sent_texts_with_drugs_df = pd.read_csv('../../../../output/section_sent_texts_with_drug_mentions_ann_200502.csv')

In [65]:
#Comparing prior and current outputs manually

test_df = section_sent_texts_with_drugs_df.merge(prior_section_sent_texts_with_drugs_df, how='outer', indicator=True).loc[lambda x : x['_merge'] == 'left_only']
#test_df = prior_section_sent_texts_with_drugs_df.merge(section_sent_texts_with_drugs_df, how='outer', indicator=True).loc[lambda x : x['_merge'] == 'left_only']
test_df.sentence.tolist()

['TH17 responses in cytokine storm of COVID-19: An emerging target of JAK2 inhibitor Fedratinib',
 'Nelfinavir was predicted to be a potential inhibitor of 2019-nCov main protease by an integrative approach combining homology modelling, molecular docking and binding free energy calculation',
 'Favorable changes of CT findings in a patient with COVID-19 pneumonia after treatment with tocilizumab',
 'Lactate dehydrogenase, a Risk Factor of Severe COVID-19 Patients',
 'Hydrogen Peroxide Vapor sterilization of N95 respirators for reuse',
 'Angiotensin II Receptor Blockers and Angiotensin-Converting Enzyme Inhibitors Usage is Associated with Improved Inflammatory Status and Clinical Outcomes in COVID-19 Patients With Hypertension',
 'Remdesivir inhibits renal fibrosis in obstructed kidneys',
 'Atazanavir inhibits SARS-CoV-2 replication and pro-inflammatory cytokine production',
 'Amantadine disrupts lysosomal gene expression; potential therapy for COVID19',
 'Nelfinavir inhibits replication

In [43]:
"""
Manual spot-checking.

"""

merged_text_cord_uids = set(merged_text_vec_df.cord_uid.tolist())

section_text_cord_uids = set(section_text_with_drugs_ann_df.cord_uid.tolist())

cord_uid_section_tuples = list(zip(section_text_with_drugs_ann_df.cord_uid.tolist(), section_text_with_drugs_ann_df.section.tolist()))
cord_uid_section_tuples

[('02q9y011', 'RESULTS AND DISCUSSION'),
 ('08vsaov7', 'DISCUSSION'),
 ('0a49okho', 'Discussion'),
 ('0d77ojnb', 'Discussion and Conclusion'),
 ('0d77ojnb', 'abstract'),
 ('0d77ojnb', 'title'),
 ('0euaaspo',
  'Complications, treatment and clinical outcomes during hospitalization and follow-up 296'),
 ('0gier0lu', 'abstract'),
 ('0hrmk77p', 'Conclusion'),
 ('0hrmk77p', 'Discussion'),
 ('0lk8eujq', 'Discussion'),
 ('0lk8eujq', 'abstract'),
 ('0lk8eujq', 'title'),
 ('0nhgxoim', 'Added value of this study'),
 ('0nhgxoim', 'Discussion'),
 ('0rk2dw4e', 'DISCUSSION'),
 ('0uengr9t', 'Conclusion'),
 ('0uengr9t', 'abstract'),
 ('0uengr9t', 'title'),
 ('11060ijh',
  'Docking Results of Compounds in Garlic Essential Oil into the PDB6LU7 Protein of SARS-CoV-2 ::: Docking Simulation ::: Results and Discussion'),
 ('11060ijh', 'abstract'),
 ('125o0o7x',
  'Prediction of the clinical outcome of COVID-19 patients using T lymphocyte subsets with 340 cases from Wuhan, China: a retrospective cohort study

In [23]:
test_df.loc[test_df.cord_uid == '647zcjgu' ].loc[test_df.section == 'Discussion'].text.tolist()[0]

'since its initial outbreak, the 2019-ncov infection is much more contagious than it was originally thought. we know that the virus is capable of spreading quickly from human to human and that people can spread the virus even before they become symptomatic [1] . this makes it harder to contain the virus, and many are concerned about the possibility of a new pandemic. our study suggests unique structural features of the spike glycoprotein rbd of 2019-ncov that confers potentially higher affinity binding for its receptor than found with sars-cov. with a higher affinity binding capability, the number of viruses required to infect a cell is much reduced. this partly explains why 2019-ncov appears to be more aggressive than sars-cov. this also reminds us of a lesser-known coronavirus hcov-nl63 that also uses ace2 also as a receptor. hcov-nl63 was initially isolated from a child with bronchiolitis in the netherlands [21] . it belongs to the alphacoronavirus subfamily. the rbd of sars-cov sha

In [232]:
"""
Manual checking for drug mentions excluded by regex search, using direct string matches.

"""

sections_texts = concat_sent_df.sentence.tolist()

cord_uids_contain_drugs_manual_search_dict = {}

idx = 0
for idx, row in concat_sent_df.iterrows():
    text = row.sentence
    for drug in drug_terms:
        if drug in text:
            cord_uids_contain_drugs_manual_search_dict.setdefault(row.cord_uid, []).append(drug)
            
    idx +=1
    
cord_uids_with_manual_search_drugs = set(cord_uids_contain_drugs_manual_search_dict.keys())

#Difference in cord_uid lists
diff_cord_uids = cord_uids_with_manual_search_drugs - set(cord_uids_with_drugs)

In [267]:
diff_cord = 'ela022bo'

drugs_used = cord_uids_contain_drugs_manual_search_dict[diff_cord]
test_df = concat_sent_df.loc[concat_sent_df.cord_uid == diff_cord]

for sent in test_df.text.tolist():
    if 'lustra' in sent:
        print(sent)


the sars-cov-2 epidemic is one of the biggest challenges healthcare systems worldwide have ever had to face. to curb transmission many countries have adopted social distancing measures and travel restrictions. estimating the effect of these measures in each context is challenging and requires mathematical models of the transmission dynamics. projections for the future course of the epidemic strongly rely on model predictions and accurate representation of real-time data as they accumulate. here i develop an seir modeling framework for covid-19, to evaluate reported cases and fatalities, and to enable forecasting using evidence-based bayesian parameter estimation. this bayesian framework offers a tool to parametrize real-time dynamics of covid-19 cases, and explore the effect of control as it unfolds in any setting. i apply the model to covid-19 data from albania, where drastic control measures were put in place already on the day of the first confirmed case. evaluating the dynamics of 

In [248]:
test_df = concat_sent_df.loc[concat_sent_df.cord_uid == 'ela022bo']

for sent in test_df.sentence.tolist():
    if 'lustra' in sent:
        print(sent)

the sars-cov-2 epidemic is one of the biggest challenges healthcare systems worldwide have ever had to face. to curb transmission many countries have adopted social distancing measures and travel restrictions. estimating the effect of these measures in each context is challenging and requires mathematical models of the transmission dynamics. projections for the future course of the epidemic strongly rely on model predictions and accurate representation of real-time data as they accumulate. here i develop an seir modeling framework for covid-19, to evaluate reported cases and fatalities, and to enable forecasting using evidence-based bayesian parameter estimation. this bayesian framework offers a tool to parametrize real-time dynamics of covid-19 cases, and explore the effect of control as it unfolds in any setting. i apply the model to covid-19 data from albania, where drastic control measures were put in place already on the day of the first confirmed case. evaluating the dynamics of 

In [276]:
len(section_text_with_drugs_df.cord_uid.unique())

544

In [272]:
concat_sent_df.loc[concat_sent_df.cord_uid == 'zph6r4il'].loc[concat_sent_df.section == 'abstract'].text.tolist()[0]

'objective: the sars-cov-2-infected disease (covid-19) outbreak is a major threat to human beings. previous studies mainly focused on wuhan and typical symptoms. we analysed 74 confirmed covid-19 cases with gi symptoms in the zhejiang province to determine epidemiological, clinical and virological characteristics. design: covid-19 hospital patients were admitted in the zhejiang province from 17 january 2020 to 8 february 2020. epidemiological, demographic, clinical, laboratory, management and outcome data of patients with gi symptoms were analysed using multivariate analysis for risk of severe/critical type. bioinformatics were used to analyse features of sars-cov-2 from zhejiang province. results: among enrolled 651 patients, 74 (11.4%) presented with at least one gi symptom (nausea, vomiting or diarrhoea), average age of 46.14 years, 4-day incubation period and 10.8% had pre-existing liver disease. of patients with covid-19 with gi symptoms, 17 (22.97%) and 23 (31.08%) had severe/cri

In [278]:
section_text_with_drugs_ann_df

Unnamed: 0,cord_uid,section,text,drug_terms_used
14,02q9y011,RESULTS AND DISCUSSION,hypervariability in the nsp3 macro x domain-th...,"amino acids,glycine,protein c,tandem"
56,08vsaov7,DISCUSSION,the predominant pathological features of covid...,angiotensin ii
66,0a49okho,Discussion,"the term 'flatten the curve', originating from...","stimate,tandem"
82,0d77ojnb,Discussion and Conclusion,the rapid spread of sars-cov-2 represents a si...,"heparin,cosamin,heparin,protein s"
83,0d77ojnb,abstract,[{'text': 'many pathogens take advantage of th...,"heparin,cosamin,heparin"
...,...,...,...,...
9828,ziepfnpz,abstract,the epidemiological and clinical characteristi...,tacrolimus
9872,zn87f1lk,title,"hydroxychloroquine, a less toxic derivative of...","chloroquine,chloroquine,hydroxychloroquine"
9893,zph6r4il,abstract,objective: the sars-cov-2-infected disease (co...,"lactate,ultiva"
9949,zwqci59h,Discussion,although recent studies have reported epidemio...,"compro,creatinine,cyclosporine,methylprednisol..."


In [76]:
with open('../../../../resources/drug_terms_regex_pattern_200504.txt', 'w') as f:
    f.write(drug_terms_pattern)

In [22]:
section_text_df = pd.read_csv(section_text_with_drugs_ann_outpath)

In [23]:
section_text_df

Unnamed: 0.1,Unnamed: 0,cord_uid,section,text_spaced,drug_terms_used
0,14,02q9y011,RESULTS AND DISCUSSION,hypervariability in the nsp3 macro x domain-t...,"amino acids,glycine,protein c,tandem"
1,21,03eifdr1,title,chloroquine as prophylactic agent against cov...,"chloroquine,chloroquine"
2,56,08vsaov7,DISCUSSION,the predominant pathological features of covi...,angiotensin ii
3,66,0a49okho,Discussion,"the term 'flatten the curve', originating fro...","stimate,tandem"
4,82,0d77ojnb,Discussion and Conclusion,the rapid spread of sars-cov-2 represents a s...,"heparin,cosamin,heparin,protein s"
...,...,...,...,...,...
795,9828,ziepfnpz,abstract,the epidemiological and clinical characterist...,tacrolimus
796,9872,zn87f1lk,title,"hydroxychloroquine, a less toxic derivative o...","chloroquine,chloroquine,hydroxychloroquine"
797,9893,zph6r4il,abstract,objective: the sars-cov-2-infected disease (c...,"lactate,ultiva"
798,9949,zwqci59h,Discussion,although recent studies have reported epidemi...,"compro,creatinine,cyclosporine,methylprednisol..."
