In [2]:
import pandas as pd
import datetime
import time
import os

In [None]:
"""
This notebook prepares the input for MedNLI and post-processes the output it produces.

The steps in this workflow are:

-Extract sentences of interest from an existing dataframe.
-Write tab-separated pairs of sentences to a text file.
-Run the "predict.py" script in the MedNLI folder (https://github.com/jgc128/mednli_baseline) on the sentence file
and pretrained models, generating a score/prediction file.
-Merge the original sentences with the score/prediction table.

Todo:

    -The pretrained models all predate covid-19 by a year.  Try replacing all covid-19 synonyms in the input sentence with "influenza" and see how this affects results.
    -Write a bash script for the automatic execution of predict.py on all input sentence files in a directory.
    -Research if it's possible to retrain or modify the existing models to direct attention exclusively to covid-19 and drug terms.
    
"""

In [50]:
# Date stamp (YYYYMMDD) used to name this run's folders and files.
timestr = time.strftime("%Y%m%d")

# ---- Input ----

input_dir = '../../../../resources/mednli/input/'

sent_df_path = '%s/sents_with_cov19_and_drugs_200505.csv' % input_dir
claims_df_path = '%s/claims_flag_sentence_cord_uid.csv' % input_dir

# Sentence files are written to a date-specific folder
date_input_dir = '%s%s' % (input_dir, timestr)

# makedirs creates any missing parent folders as well, and exist_ok makes the
# cell safe to re-run (os.mkdir failed when the parent folder did not exist).
os.makedirs(date_input_dir, exist_ok=True)

chloro_sents_outpath = '%s/chloro_sents_%s.txt' % (date_input_dir, timestr)
chloro_claims_outpath = '%s/chloro_claim_sents_%s.txt' % (date_input_dir, timestr)

# ---- MedNLI script directory ----

mednli_dir = '../../../../notebooks/mednli_baseline/'

# ---- Output ----

output_dir = '../../../../resources/mednli/output/'

# Sentence/score tables are written to a date-specific folder
date_output_dir = '%s%s' % (output_dir, timestr)

os.makedirs(date_output_dir, exist_ok=True)



In [102]:
# Sentences whose tagged entities mention chloroquine.
sents_df = pd.read_csv(sent_df_path)
# na=False: a row with a missing `entities` value counts as "no match" instead of
# putting NaN into the boolean mask.
chloro_sent_df = sents_df.loc[sents_df.entities.str.contains('chloroquine', na=False)]
chloro_sents = chloro_sent_df.text.tolist()

# Claim sentences: drop near-empty claims, de-duplicate, then keep those whose
# drug terms mention chloroquine.
claims_df = pd.read_csv(claims_df_path)
# Filter the frame that was just loaded. The previous version filtered `df`, a
# name that only exists after a later cell has been run, so this cell raised
# NameError on a fresh kernel.
claims_df = claims_df.loc[claims_df.claims.str.len() > 2]
claims_df = claims_df.drop_duplicates(subset=['claims'])
chloro_claim_sents = claims_df.loc[
    claims_df.drug_terms_used.str.contains('chloroquine', na=False)
].claims.tolist()

In [103]:
# Display the chloroquine claim sentences selected above (the whole list is shown).
chloro_claim_sents

[' this study confirms that fever, cough, and breathing difficulties are the major symptoms of covid-19 infection among pregnant women, which are similar to general infected patients.',
 'this would be the first successful use of cq in humans for treating acute viral disease.',
 'it is also recommended to further research on the role of cq for covid-19 prevention.',
 'it has also been urged in an article that the above cq regimens be investigated in parallel with the mass deployment of this drug, without unnecessary regulatory delays, in an attempt to contain the global pandemic as the benefits of cq far outweigh the risks or costs.',
 'more data from ongoing and future trials will add more insight into the role of chloroquine and hydroxychloroquine in covid-19 infection.',
 '48 however, absolute risk in our study should be interpreted cautiously since patients with ra are likely different from those with covid-19.',
 'as the world awaits the results of clinical trials for the anti-vir

In [104]:
def pair_sentences(sents, outpath):
    """Write every ordered pair of sentences to a tab-separated text file.

    All ``len(sents) ** 2`` ordered pairs are produced, including each sentence
    paired with itself. The first sentence of the pair varies slowest, so the
    n-th line of the file corresponds to the n-th returned pair.

    Args:
        sents: Iterable of sentences. Items are formatted with ``str`` when written.
        outpath: Path of the text file to create or overwrite (UTF-8).

    Returns:
        list of ``(sent_i, sent_j)`` tuples holding the original, unmodified
        sentences, in the same order as the lines written.
    """
    from itertools import product

    # Materialise once so one-shot iterables (generators) can be paired too.
    sents = list(sents)

    # A tab or line break inside a sentence would add an extra column or row and
    # shift every later line relative to the returned pairs, so those characters
    # are replaced by a space in the file only.
    field_breakers = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})
    file_safe = [str(sent).translate(field_breakers) for sent in sents]

    # Explicit encoding: sentences can contain non-ASCII characters and the
    # platform default is not UTF-8 everywhere.
    with open(outpath, 'w', encoding='utf-8') as f:
        f.writelines('%s\t%s\n' % pair for pair in product(file_safe, repeat=2))

    return list(product(sents, repeat=2))
    

In [105]:
# Write one pair file per sentence set and keep the pairs in memory; the scoring
# cells further down line them up with the rows of the score files.
chloro_pairs = pair_sentences(sents=chloro_sents, outpath=chloro_sents_outpath)
chloro_claim_pairs = pair_sentences(sents=chloro_claim_sents, outpath=chloro_claims_outpath)

In [106]:
"""
Run pretrained MedNLI models on tab-separated sentence files here, and use output for following cells.

Ex command:

python predict.py data/models/mednli.infersent.glovebioasqmimic.128.n8d0l13c.pkl data/chloro_claim_sents_200505.txt ../../resources/mednli/mednli_output/chloro_claim_sent_entail_glove_asq_mimic_200507.csv


"""


'\nRun pretrained MedNLI models on tab-separated sentence files here, and use output for following cells.\n\nEx command:\n\npython predict.py data/models/mednli.infersent.glovebioasqmimic.128.n8d0l13c.pkl data/chloro_claim_sents_200505.txt ../../resources/mednli/mednli_output/chloro_claim_sent_entail_glove_asq_mimic_200507.csv\n\n\n'

In [107]:
def format_scores_as_dict(sent_pairs, score_file):
    """Combine sentence pairs with the rows of a score file, matched by position.

    The first line of ``score_file`` is treated as a header and skipped. Each
    remaining non-blank row is read as comma-separated
    ``prediction, contradiction, entailment, neutral`` and attached to the
    sentence pair at the same position.

    Args:
        sent_pairs: Sequence of ``(sent_1, sent_2)`` tuples, in the order the
            pairs were written to the file that was scored.
        score_file: Path to the comma-separated score file.

    Returns:
        dict mapping a running integer index (from 0) to a dict with the keys
        ``sent_1``, ``sent_2``, ``prediction``, ``contradiction``,
        ``entailment`` and ``neutral``. Score values are kept as the strings
        read from the file.

    Raises:
        IndexError: If a non-blank score row has fewer than four fields.

    Warns:
        UserWarning: If the number of score rows differs from the number of
            pairs; only the overlapping prefix is returned in that case.
    """
    import csv
    import warnings

    sent_pairs = list(sent_pairs)

    with open(score_file, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # header line
        # Blank lines carry no scores; indexing into them used to raise IndexError.
        score_rows = [row for row in reader if any(cell.strip() for cell in row)]

    # Pairs and scores are matched purely by position, so a count mismatch means
    # the result may be misaligned. zip() would hide that silently.
    if len(score_rows) != len(sent_pairs):
        warnings.warn(
            '%s has %d score rows but %d sentence pairs were given; '
            'only the first %d are combined'
            % (score_file, len(score_rows), len(sent_pairs),
               min(len(score_rows), len(sent_pairs)))
        )

    res_dict = {}
    for idx, (sent_pair, score_res) in enumerate(zip(sent_pairs, score_rows)):
        res_dict[idx] = {
            'sent_1': sent_pair[0],
            'sent_2': sent_pair[1],
            'prediction': score_res[0],
            'contradiction': score_res[1],
            'entailment': score_res[2],
            'neutral': score_res[3],
        }

    return res_dict

def convert_scores_dict_to_df(scores_dict, outpath):
    """Turn a nested score dictionary into a table and save it as CSV.

    Args:
        scores_dict: dict mapping a row key to a dict that contains at least
            ``sent_1``, ``sent_2``, ``prediction``, ``contradiction``,
            ``entailment`` and ``neutral``.
        outpath: Path of the CSV file to write; the row keys are written as
            the index column.

    Returns:
        pandas.DataFrame with one row per key of ``scores_dict`` and the six
        columns above, in that order.
    """
    column_order = ['sent_1', 'sent_2', 'prediction', 'contradiction', 'entailment', 'neutral']

    # The outer keys come in as columns; flip the frame to get one row per key.
    table = pd.DataFrame(scores_dict).T
    table = table.loc[:, column_order]

    table.to_csv(outpath)
    return table



In [136]:
# Model variants whose score files are turned into sentence/score tables.
models = ['glove', 'glovebioasqmimic']

# Every pass rebinds the same names, so once the loop ends the *_df variables
# hold the tables of the last model in the list.
for model in models:

    # All chloroquine sentences: read the score file, then write the merged table.
    chloro_score_file = '%s/chloro_sent_score_%s_%s.csv' % (date_output_dir, model, timestr)
    chloro_results_outpath = '%s/chloro_sent_entail_%s_%s.csv' % (date_output_dir, model, timestr)
    chloro_results_dict = format_scores_as_dict(chloro_pairs, chloro_score_file)
    chloro_results_df = convert_scores_dict_to_df(chloro_results_dict, chloro_results_outpath)

    # Chloroquine claim sentences: same two steps.
    chloro_claim_score_file = '%s/chloro_claim_score_%s_%s.csv' % (date_output_dir, model, timestr)
    chloro_claim_results_outpath = '%s/chloro_claim_entail_%s_%s.csv' % (date_output_dir, model, timestr)
    chloro_claim_results_dict = format_scores_as_dict(chloro_claim_pairs, chloro_claim_score_file)
    chloro_claims_df = convert_scores_dict_to_df(chloro_claim_results_dict, chloro_claim_results_outpath)
        

In [137]:
# Examining results

# Predicted label to look at; also the name of the score column used for ranking.
category = 'contradiction'

# Claim-sentence table of the glovebioasqmimic model from the 20200507 run.
entail_df = pd.read_csv('../../../../resources/mednli/output/20200507/chloro_claim_entail_glovebioasqmimic_20200507.csv')
contra_df = entail_df[entail_df['prediction'] == category]

print(entail_df['prediction'].value_counts())

entailment       9310
neutral          1788
contradiction     351
Name: prediction, dtype: int64


In [139]:
# Sort once up front; the frame used to be re-sorted three times per printed pair.
ranked = contra_df.sort_values(by=category, ascending=False)

# Positions 1..19 of the ranking. Slicing simply stops early when the frame has
# fewer rows, where positional indexing raised IndexError.
# NOTE(review): position 0 (the highest-scoring pair) is skipped, exactly as
# before - confirm that this is intended.
for _, row in ranked.iloc[1:20].iterrows():
    print(row['sent_1'])
    print('\n')
    print(row['sent_2'])
    print(row[category])
    print('\n')

 no evidence of rapid antiviral clearance or clinical benefit with the combination of hydroxychloroquine and azithromycin in patients with severe covid-19 infection


this map would be helpful for personalized medicine in the future.
0.99382


hypertensive patients who received acei/arb therapy had a lower risk of developing severe pneumonia compared with those on non-acei/arb antihypertensive therapy, which suggested that acei/arb therapy may be protective from severe pulmonary injury for hypertensive patients.


 no evidence of rapid antiviral clearance or clinical benefit with the combination of hydroxychloroquine and azithromycin in patients with severe covid-19 infection
0.99234


these results can contribute to guide the design of new functional and high specific mabs providing a cost-and-time-effective computational framework towards the development of better diagnostic strategies and an effective treatment and/or vaccine for covid-19.


we postulate here that the acidotropic an

In [73]:
# Keep only the sentence pairs that were labelled as contradictions.
chloro_contradictions_df = chloro_results_df[chloro_results_df['prediction'] == 'contradiction']

In [75]:
# chloro_results_df keeps the scores as the text read from the score file, so
# rank on their numeric value rather than on string order. The stored values are
# left untouched and print exactly as before.
ranking = pd.to_numeric(chloro_contradictions_df['contradiction']).sort_values(ascending=False).index

# Print each contradicting pair with its score, strongest first.
for idx, row in chloro_contradictions_df.loc[ranking].iterrows():
    print(row.sent_1)
    print(row.sent_2)
    print(row.contradiction)

    #print(chloro_results_df.loc[chloro_results_df.sent_1 == row.sent_2].loc[chloro_results_df.sent_2 == row.sent_1])
    print('\n')
    

chloroquine and hydroxychloroquine in the treatment of covid-19 with or without diabetes:
a narrative letter by chinese authors reported that a news briefing from the state council of china had indicated that "chloroquine phos-phate… had demonstrated marked efficacy and acceptable safety in treating covid-19 associated pneumonia in multicentre clinical trials conducted in china" [5] .
0.99838


this issue has rapidly reviewed and published articles describing covid-19, including the drug treatment options for sars-cov-2, its clinical characteristics, and therapies involving a combination of chinese and western medicine, the efficacy of chloroquine phosphate in the treatment of covid-19 associated pneumonia according to clinical studies, and reflections on the system of reserve medical supplies for public health emergencies.
some covid-19 patients are responding well to chloroquine whereas others are not.
0.99834


we will include randomised controlled trials evaluating the effect of ch

0.79667


efficacy of hydroxychloroquine in patients with covid-19: results of a randomized clinical trial
we reviewed all interventional covid-19 trials on remdesivir, lopinavir/ritonavir and hydroxychloroquine registered in the national library of medicine (nlm) at the national institutes of health (nih
0.79635


in vitro antiviral activity and projection of optimized dosing design of hydroxychloroquine for the treatment of severe acute respiratory syndrome coronavirus 2 (sars-cov-2
chloroquine phosphate and hydroxychloroquine were reported in this review and showed favorable outcomes on the recovery of covid-19 patients (6) (7) (57) (58) (59) (60) .
0.79609


a narrative letter by chinese authors reported that a news briefing from the state council of china had indicated that "chloroquine phos-phate… had demonstrated marked efficacy and acceptable safety in treating covid-19 associated pneumonia in multicentre clinical trials conducted in china" [5] .
chloroquine and hydroxychloroqu

chloroquine and hydroxychloroquine in the treatment of covid-19 with or without diabetes:
0.60872


quantifying treatment effects of hydroxychloroquine and azithromycin for covid-19: a secondary analysis of an open label non-randomized clinical trial (gautret et al, 2020)
chloroquine and hydroxychloroquine in the treatment of covid-19 with or without diabetes:
0.60837


the research letter, written by a group of chinese researchers, studied the effect of chloroquine in vitro, using vero e6 cells infected by sars-cov-2 at a multiplicity of infection (moi) of 0.05.
of chloroquine and covid-19
0.60740


a number of nonspecific antiviral drugs, including ifn, lopinavir-ritonavir (hiv protease inhibitors), chloroquine, favipiravir (t-705) and remdesivir (gs-5734), have been used in clinics in china to treat sars-cov-2 infection.31 their in vivo efficacies still require further confirmation.
no evidence of rapid antiviral clearance or clinical benefit with the combination of hydroxychloroqui

In [47]:
# Sentence to look up as the second sentence of scored pairs.
test_sent = 'no evidence of rapid antiviral clearance or clinical benefit with the combination of hydroxychloroquine and azithromycin in patients with severe covid-19 infection'

In [50]:
# All scored pairs whose second sentence is exactly test_sent.
test_df = chloro_results_df[chloro_results_df['sent_2'] == test_sent]

In [52]:
# Show the pairs for test_sent, highest contradiction score first.
test_df.sort_values(by='contradiction', ascending=False)

Unnamed: 0,sent_1,sent_2,prediction,contradiction,entailment,neutral
730,two medications often used for treatment of im...,no evidence of rapid antiviral clearance or cl...,contradiction,0.79144,0.20855,0.00001
9507,the pharmaceutical interventions found for cov...,no evidence of rapid antiviral clearance or cl...,contradiction,0.75227,0.24766,0.00006
337,rapid reporting and dissemination of results a...,no evidence of rapid antiviral clearance or cl...,contradiction,0.68502,0.31487,0.00011
12651,recent studies suggest that hydroxychloroquine...,no evidence of rapid antiviral clearance or cl...,contradiction,0.66036,0.33917,0.00047
4136,therapeutic application of chloroquine in clin...,no evidence of rapid antiviral clearance or cl...,contradiction,0.64726,0.35263,0.00010
13175,"several drugs such as chloroquine, arbidol, re...",no evidence of rapid antiviral clearance or cl...,contradiction,0.62030,0.37968,0.00002
15402,of chloroquine and covid-19,no evidence of rapid antiviral clearance or cl...,contradiction,0.58796,0.40958,0.00246
1385,caution and clarity required in the use of chl...,no evidence of rapid antiviral clearance or cl...,contradiction,0.57954,0.41988,0.00058
11734,hydroxychloroquine is being used globally for ...,no evidence of rapid antiviral clearance or cl...,contradiction,0.57592,0.42383,0.00024
2040,case report of chloroquine therapy and hypogly...,no evidence of rapid antiviral clearance or cl...,entailment,0.49828,0.50165,0.00007


In [54]:
# Save the chloroquine sentence/score table.
# NOTE(review): hard-coded relative path and date stamp; this does not use the
# dated output folder set up in the configuration cell - confirm the target.
chloro_results_df.to_csv('../output/chloroquine_sentences_nli_200505.csv')

In [72]:
# Number of chloroquine sentence pairs per predicted label.
chloro_results_df.prediction.value_counts()

entailment       13405
neutral           2378
contradiction     1378
Name: prediction, dtype: int64

In [43]:
# Load the claims table under the generic name `df`; other cells in this
# notebook read `df` directly, so this cell has to run before them.
df = pd.read_csv('../../../../resources/mednli/input/claims_flag_sentence_cord_uid.csv')

In [49]:
# Dump every claim with its Python type, followed by two blank lines.
for claim in df['claims']:
    print(type(claim), claim, '\n', sep='\n')

<class 'str'>
the recombinant protein reported here, together with the detailed structural information, might also be useful to others developing sars-cov-2 diagnostics and/or therapeutics.


<class 'str'>
the results suggest that sars-cov infection may target the cell types that are important for the protection of airway mucosa and their damage may lead to deterioration of epithelial cell function, finally leading to a more severe lung disease with accumulation of alveolar exudate and inflammatory cells and lung edema, the signs of pneumonia recently described in the lung specimens of two patients with covid-19 infection (60) .


<class 'str'>
the results show putative binding sites for several transcription factors which have a strong correlation of expression with ace2 in colon, kidney, and ileum.


<class 'str'>
gene ontology annotation results for the processes associated with genes strongly coexpressed (≥0.5) with ace2 across all tissues in gtex dataset.


<class 'str'>
however, 

In [59]:
# Claims longer than two characters, as a plain list. The .str.len() mask is the
# same filter applied to claims_df earlier. It yields NaN for a missing claim,
# which compares False, whereas len() on a NaN raised TypeError.
claims = df.claims[df.claims.str.len() > 2].tolist()

In [82]:
# (rows, columns) of the filtered, de-duplicated claims table.
claims_df.shape

(701, 5)