In [1]:
# Written to read and explore the initial data
# associating drug labels to adverse events
# written 12-13-18 JLW, adapted 8-28-19

import pickle,os,csv
from collections import defaultdict

# Resource and data directories, relative to the notebook's working directory.
rscs_dir = '../rscs/'
data_dir = '../data/'

# DrugBank ID -> drug name lookup.
# NOTE: unpickling can execute arbitrary code; only load files you generated or trust.
db2nf = os.path.join(rscs_dir,'drugbankid_to_name.pkl')
with open(db2nf,'rb') as fh: # context manager closes the handle instead of leaking it
    db2n = pickle.load(fh)

# Reverse lookup keyed by lower-cased drug name.
# If two DrugBank IDs share a name, the one iterated last wins.
n2db = {nm.lower(): db for (db,nm) in db2n.items()}

In [2]:
# methods for phenotype matching
# Synonym table; check_synonyms indexes it by DME term and iterates the stored value.
# NOTE: unpickling can execute arbitrary code; only load files you generated or trust.
with open(os.path.join(data_dir,'synonyms.pkl'),'rb') as fh: # close the handle after loading
    dme_syns = pickle.load(fh)
def strip_string(s):
    """Return ``s`` with every comma and every round bracket removed."""
    unwanted = (',', '(', ')')
    return ''.join(ch for ch in s if ch not in unwanted)

def exact_overlap(t,p): # check if all tox words are contained in phenotype
    """Return True when every word of ``t`` also occurs as a word of ``p``.

    Both strings are split on single spaces and compared case-insensitively.
    Commas and parentheses are removed from ``p`` (via ``strip_string``)
    before splitting; ``t`` is split as given.
    """
    t_words = set(x.lower() for x in t.split(' '))
    p_words = set(x.lower() for x in strip_string(p).split(' '))
    # Subset test on the unique words. Comparing the overlap size with the raw
    # word count of ``t`` rejected any term that repeats a word, because the
    # set intersection can never be as large as a list containing duplicates.
    return t_words.issubset(p_words)

def check_synonyms(t,p):
    """Return True when any non-empty synonym stored for ``t`` in ``dme_syns``
    satisfies ``exact_overlap`` against phenotype ``p``; False otherwise.
    """
    synonyms = dme_syns[t]
    if synonyms != []:
        # Evaluate every non-empty synonym, then report whether any one matched.
        hits = [exact_overlap(syn,p) for syn in synonyms if syn != '']
        return any(hits)
    return False

In [3]:
# load drug-DME data from drug labels
f = os.path.join(data_dir,'Drugs_labeled_for_AEs.txt') #Each column header is a DME, and the rows are drugs associated with the DME

# gather starting data
unique_drugs = set()               # lower-cased drug names seen in any column
drugs_by_dme = defaultdict(list)   # DME -> lower-cased drug names
dmes_by_drugs = defaultdict(list)  # lower-cased drug name -> DMEs
with open(f,'r') as label_fh: # context manager closes the handle instead of leaking it
    dR = csv.DictReader(label_fh,delimiter='\t')
    for r in dR:
        for (dme,drug) in r.items():
            # Ragged rows: DictReader fills missing cells with None and stores
            # surplus cells as a list under the key None. Skip both, and empty
            # cells, rather than crashing on .lower().
            if dme is None or not drug:
                continue
            drug_name = drug.lower()
            drugs_by_dme[dme].append(drug_name)
            unique_drugs.add(drug_name)
            dmes_by_drugs[drug_name].append(dme)
print('extracted drug-dme relationships')

# create set of false positives
fpdmes_by_drugs = defaultdict(list)
unique_dmes = list(drugs_by_dme)
for (drug,dme_list) in dmes_by_drugs.items():
    labeled = set(dme_list)
    # DMEs NOT on the drug label, kept in column order so the result does not
    # depend on set iteration order.
    fp_dmes = [d for d in unique_dmes if d not in labeled]
    fpdmes_by_drugs[drug] = fp_dmes # keys of dmes_by_drugs are already lower-cased

# map to DrugBank IDs
un_in_DB = [n2db[n] for n in unique_drugs if n in n2db] #1136 drugs

extracted drug-dme relationships


In [4]:
# check how many are in drug bank and modeled
# NOTE: unpickling can execute arbitrary code; only load files you generated or trust.
db_with_targf = os.path.join(rscs_dir,'drugBank_with_at_least_one_intom_targ.pkl')
with open(db_with_targf,'rb') as fh: # close each pickle handle after loading
    db_with_targ = pickle.load(fh)
un_with_targ = [n for n in un_in_DB if n in db_with_targ] #997 drugs
un_with_targ_dname = [db2n[d].lower() for d in un_with_targ] #997 drugs
num_drug_analyzed = len(un_with_targ)
un_w_t_in_dme = [d for d in un_with_targ if db2n[d].lower() in dmes_by_drugs] # drug with targets and dmes
print('loaded drug bank mappings')

# dictionary to look up drug network results
# key: text before the first '_' in the file name; value: the file name
asfdir = os.path.join(data_dir,'all_drugbank_network_association_files/')
asf = os.listdir(asfdir)
asf_dic = {fname.split('_')[0]: fname for fname in asf if 'merged_neighborhood__assoc_table_.txt' in fname}
print('loaded association files')

# other data needed for analysis
# expected p-values for each dme, and number of drug targets
back_phmeds_f = os.path.join(rscs_dir,'back_phmeds.pkl')
with open(back_phmeds_f,'rb') as fh:
    back_phmeds = pickle.load(fh) # to find expected p-value for phenotype
dintf = os.path.join(rscs_dir,'drug_intome_targets.pkl')
with open(dintf,'rb') as fh:
    dint = pickle.load(fh) # to find number of drug targets

loaded drug bank mappings
loaded association files


In [7]:
# run analysis, store data for calculating specificity
# For every drug with targets and label DMEs, scan its network association
# table and record which label DMEs (true positives) and which off-label DMEs
# (false positives) match a significant phenotype.
drugs_matched_to_tox = defaultdict(set) # DME -> drug names with a matching significant phenotype
drugs_fp_to_tox = defaultdict(set)      # off-label DME -> drug names with a matching significant phenotype
# The context manager closes both output files even if a drug fails part-way through.
with open('drugs_to_dmes_true_positive.txt','w') as of1, \
     open('drugs_to_dmes_false_positives.txt','w') as of2:
    hdr = ['Drug Name','DrugBankID','Label DME','PathFX Phenotype','PathFX Phen. CUI','Corrected P-value','Network genes assoc to phen','\n']
    of1.write('\t'.join(hdr))
    hdr = ['Drug Name','DrugBankID','False Positive DME','PathFX Phenotype','PathFX Phen. CUI','Corrected P-value','Network genes assoc to phen','\n']
    of2.write('\t'.join(hdr))
    for dbid in un_w_t_in_dme:
        print('analyzing drug: '+dbid)
        # Separate name so the file listing held in `asf` is not overwritten.
        assoc_path = os.path.join(asfdir,asf_dic[dbid])
        nt = len(dint[dbid]) # number of drug targets; selects the background table
        dname = db2n[dbid].lower()
        drug_dmes = dmes_by_drugs[dname]
        fp_dmes = fpdmes_by_drugs[dname]
        # Same (terms, result dict, output file) handling for label and off-label DMEs.
        targets = ((drug_dmes,drugs_matched_to_tox,of1),(fp_dmes,drugs_fp_to_tox,of2))
        with open(assoc_path,'r') as assoc_fh: # one handle per drug; close it before the next
            dR = csv.DictReader(assoc_fh,delimiter='\t')
            for row in dR: # get each phenotype for that drug
                ph,cui,genes = row['phenotype'],row['cui'],row['genes']
                BH = float(row['Benjamini-Hochberg'])
                # expected p-value for this phenotype; 1 when the background has no entry
                exp_BH = back_phmeds[nt][cui] if cui in back_phmeds[nt] else 1
                if BH < exp_BH: # only look for matches with the significant phenotypes
                    for (tox_terms,matched,out_fh) in targets:
                        for tox in tox_terms:
                            if exact_overlap(tox,ph) or check_synonyms(tox,ph):
                                matched[tox].add(dname)
                                # NOTE(review): the 'Corrected P-value' column receives exp_BH
                                # (the background threshold), not the observed BH -- confirm
                                # which value is intended before changing the output.
                                out_data = [dname,dbid,tox,ph,cui,str(exp_BH),genes,'\n']
                                out_fh.write('\t'.join(out_data))

analyzing drug: DB00991
analyzing drug: DB00703
analyzing drug: DB01080
analyzing drug: DB01130
analyzing drug: DB01081
analyzing drug: DB01324
analyzing drug: DB08897
analyzing drug: DB01119
analyzing drug: DB00019
analyzing drug: DB00471
analyzing drug: DB01167
analyzing drug: DB00257
analyzing drug: DB00947
analyzing drug: DB00206
analyzing drug: DB06201
analyzing drug: DB00503
analyzing drug: DB01197
analyzing drug: DB00736
analyzing drug: DB00554
analyzing drug: DB00424
analyzing drug: DB00441
analyzing drug: DB01623
analyzing drug: DB00553
analyzing drug: DB00868
analyzing drug: DB00512
analyzing drug: DB00348
analyzing drug: DB00390
analyzing drug: DB00580
analyzing drug: DB01656
analyzing drug: DB00950
analyzing drug: DB00433
analyzing drug: DB04572
analyzing drug: DB08875
analyzing drug: DB01609
analyzing drug: DB09035
analyzing drug: DB00111
analyzing drug: DB00324
analyzing drug: DB00458
analyzing drug: DB00841
analyzing drug: DB01098
analyzing drug: DB00591
analyzing drug: 

analyzing drug: DB00937
analyzing drug: DB00621
analyzing drug: DB06706
analyzing drug: DB00594
analyzing drug: DB00978
analyzing drug: DB04868
analyzing drug: DB00428
analyzing drug: DB00988
analyzing drug: DB01129
analyzing drug: DB00690
analyzing drug: DB00108
analyzing drug: DB00748
analyzing drug: DB00419
analyzing drug: DB00704
analyzing drug: DB09043
analyzing drug: DB01394
analyzing drug: DB05676
analyzing drug: DB00476
analyzing drug: DB06212
analyzing drug: DB01032
analyzing drug: DB01221
analyzing drug: DB00513
analyzing drug: DB00669
analyzing drug: DB01190
analyzing drug: DB00445
analyzing drug: DB09102
analyzing drug: DB00008
analyzing drug: DB01369
analyzing drug: DB00530
analyzing drug: DB00997
analyzing drug: DB06204
analyzing drug: DB06401
analyzing drug: DB00366
analyzing drug: DB06413
analyzing drug: DB01202
analyzing drug: DB09083
analyzing drug: DB00050
analyzing drug: DB00608
analyzing drug: DB00185
analyzing drug: DB06335
analyzing drug: DB00550
analyzing drug: 

analyzing drug: DB00349
analyzing drug: DB00794
analyzing drug: DB01356
analyzing drug: DB06209
analyzing drug: DB00275
analyzing drug: DB01137
analyzing drug: DB01097
analyzing drug: DB00263
analyzing drug: DB00753
analyzing drug: DB00006
analyzing drug: DB01307
analyzing drug: DB08901
analyzing drug: DB00607
analyzing drug: DB00099
analyzing drug: DB01281
analyzing drug: DB01020
analyzing drug: DB01039
analyzing drug: DB01438
analyzing drug: DB00322
analyzing drug: DB00862
analyzing drug: DB00881
analyzing drug: DB00404
analyzing drug: DB00254
analyzing drug: DB09063
analyzing drug: DB00582
analyzing drug: DB09488
analyzing drug: DB01597
analyzing drug: DB00697
analyzing drug: DB00438
analyzing drug: DB00377
analyzing drug: DB01151
analyzing drug: DB00987
analyzing drug: DB00373
analyzing drug: DB00576
analyzing drug: DB00586
analyzing drug: DB09048
analyzing drug: DB00489
analyzing drug: DB06243
analyzing drug: DB09278
analyzing drug: DB00595
analyzing drug: DB00184
analyzing drug: 

In [8]:
# calculate specificity and sensitivity
# For each DME: positives are eligible drugs labeled with the DME, negatives
# are the remaining eligible drugs; sensitivity = TP/positives and
# specificity = TN/negatives.
hdr = ['DME','Sensitivity','Specificity','True Positive Count','True Positives','False Negative Count','False Negatives','False Positive Count','False Positives','True Negative Count','True Negatives','\n']
eligible = set(un_with_targ_dname) # built once instead of once per DME
with open('dme_label_specificity_sensitivity.txt','w') as of3: # closed even if a DME fails
    of3.write('\t'.join(hdr))
    for dme in drugs_by_dme:
        print('TP,FP calculation for: '+dme)
        all_positives = drugs_by_dme[dme]
        positives = set(all_positives).intersection(eligible) # remove positives that aren't in the 997 drugs that are eligible for analysis
        print('num positives: '+str(len(positives)))
        # .get avoids inserting empty entries into the defaultdicts while reading them
        true_positives = drugs_matched_to_tox.get(dme,set())
        print('num TP: '+str(len(true_positives)))
        false_negatives = list(positives.difference(true_positives)) # drugs with dmes not discovered by PathFX
        print('num FN: '+str(len(false_negatives)))

        negatives = list(eligible.difference(positives)) # drugs with targets but dme not on label
        print('num negatives: '+str(len(negatives)))
        false_positives = drugs_fp_to_tox.get(dme,set())
        print('num FP: '+str(len(false_positives)))
        true_negatives = list(set(negatives).difference(false_positives))
        print('num TN: '+str(len(true_negatives)))

        tpc = len(true_positives) # true positive count
        tps = ','.join(sorted(true_positives)) # true positive list
        fnc = len(false_negatives) # false negative count
        fns = ','.join(sorted(false_negatives)) # false negative list
        fpc = len(false_positives) # false positive count
        fps = ','.join(sorted(false_positives)) # false positive list
        tnc = len(true_negatives) # true negative count
        tns = ','.join(sorted(true_negatives)) # true negative list
        # A DME with no eligible positives (or negatives) has an undefined rate;
        # write nan instead of raising ZeroDivisionError and losing the whole table.
        sens = tpc/float(len(positives)) if positives else float('nan')
        spec = tnc/float(len(negatives)) if negatives else float('nan')
        outdata = [dme,str(sens),str(spec),str(tpc),tps,str(fnc),fns,str(fpc),fps,str(tnc),tns,'\n']
        of3.write('\t'.join(outdata))

TP,FP calculation for: Myocardial infarction
num positives: 364
num TP: 144
num FN: 220
num negatives: 633
num FP: 187
num TN: 446
TP,FP calculation for: Hypertension
num positives: 513
num TP: 218
num FN: 295
num negatives: 484
num FP: 159
num TN: 325
TP,FP calculation for: Pancreatitis
num positives: 307
num TP: 80
num FN: 227
num negatives: 690
num FP: 197
num TN: 493
TP,FP calculation for: Hyperlipidemia
num positives: 99
num TP: 28
num FN: 71
num negatives: 898
num FP: 253
num TN: 645
TP,FP calculation for: Tardive dyskinesia
num positives: 37
num TP: 35
num FN: 2
num negatives: 960
num FP: 175
num TN: 785
TP,FP calculation for: Sepsis
num positives: 210
num TP: 29
num FN: 181
num negatives: 787
num FP: 152
num TN: 635
TP,FP calculation for: Cardiac arrest
num positives: 205
num TP: 7
num FN: 198
num negatives: 792
num FP: 27
num TN: 765
TP,FP calculation for: Cerebral infarction
num positives: 38
num TP: 7
num FN: 31
num negatives: 959
num FP: 176
num TN: 783
TP,FP calculation fo