In [31]:
import os
import sys
import re
import numpy as np
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_notebook

In [32]:
output_notebook()

In [8]:
ikey2idx={}
idx2ikey={}
ikey2smi={}
with open('integrated_chemicals.tsv','r') as f:
    for line in f:
        line=line.strip().split('\t')
        idx=int(line[0])
        ikey=line[1]
        smi=line[2]
        ikey2idx[ikey]=idx
        idx2ikey[idx]=ikey
        ikey2smi[ikey]=smi
        
chemsimpairs={ikey:{} for ikey in ikey2idx.keys()}
with open('integrated_chem_chem_sim_threshold03_reduced.csv','r') as f:
    for line in f:
        line=line.strip().split(',')
        ikey1=idx2ikey[int(line[0])]
        ikey2=idx2ikey[int(line[1])]
        sim=float("{:.4f}".format(float(line[2])))
        chemsimpairs[ikey1].update({ikey2:sim})
        chemsimpairs[ikey2].update({ikey1:sim})

In [71]:
input_activity_file='../activities/tsv/integrated_kinase_continuous.tsv'
activities=[]
kinase_ligands=[]
kinases=[]
with open(kinasefile,'r') as f:
    next(f)
    for line in f:
        line=line.strip().split('\t')
        ikey=line[1].strip()
        uniprot=line[2].strip()
        val_type=line[3].strip()
        rel=line[4].strip()
        val=line[5].strip()
        try:
            val=float(val)
            if np.isinf(val):
                continue
            if np.isnan(val):
                continue
        except:
            #activity value cannot be converted into float; skip
            continue
        activities.append(val)
        if ikey not in kinase_ligands:
            kinase_ligands.append(ikey)
        if uniprot not in kinases:
            kinases.append(uniprot)

In [76]:
len(kinase_ligands)

134755

In [81]:
max_sim_cutoff=0.6 #chemicals more similar than cutoff cannot appear in train/test set
#chemicals are included into same cluster if more similar than this,
# and separated if less similar than cutoff
max_num_chem_in_test=int(np.ceil(len(kinase_ligands)/20))
print("Cutoff {}, max_num_chem_in_test {}".format(max_sim_cutoff,max_num_chem_in_test))
arr = np.arange(len(kinase_ligands))
np.random.shuffle(arr)
seed_ikeys=[kinase_ligands[idx] for idx in arr[:115000]]
query_ikeys=[kinase_ligands[idx] for idx in arr[115000:]]
print("{} seed chemicals randomly chosen".format(len(seed_ikeys)))
print("{} chemicals not in the seed chemicals are being processed...".format(len(query_ikeys)))
disjoint_ikeys=[]
disjoint_ikeys_rest=[]
for i,query_ikey in enumerate(query_ikeys):
    num_test_chems=len(disjoint_ikeys)
    if i%1000==0:
        print("{} chemicals out of {} are processed. test chemical size {}".format(i,len(query_ikeys),num_test_chems))
    if query_ikey not in chemsimpairs.keys(): #similarity information not found
        disjoint_ikeys.append(query_ikey)
        continue
    if num_test_chems>=max_num_chem_in_test:
        #already max number of disjoint chemicals chosen
        disjoint_ikeys_rest.append(query_ikey)
    max_sim_to_seed=0.0
    for sikey in seed_ikeys:
        if sikey in chemsimpairs[query_ikey].keys():
            sim=chemsimpairs[query_ikey][sikey]
            if sim>max_sim_to_seed:
                max_sim_to_seed=sim
    if max_sim_to_seed>=max_sim_cutoff:
        #query chem too similar to seed chemicals -- include into seed set
        seed_ikeys.append(query_ikey)
    else:
        #max sim to seed set is below cutoff -- distinct chemical from seed
        disjoint_ikeys.append(query_ikey)
        
print("{} seed chemicals, {} disjoint chemicals, and {} spare disjoint chemicals".format(
    len(seed_ikeys),len(disjoint_ikeys),len(disjoint_ikeys_rest)))

Cutoff 0.6, max_num_chem_in_test 6738
115000 seed chemicals randomly chosen
19755 chemicals not in the seed chemicals are being processed...
0 chemicals out of 19755 are processed. test chemical size 0
1000 chemicals out of 19755 are processed. test chemical size 158
2000 chemicals out of 19755 are processed. test chemical size 332
3000 chemicals out of 19755 are processed. test chemical size 493
4000 chemicals out of 19755 are processed. test chemical size 654
5000 chemicals out of 19755 are processed. test chemical size 820
6000 chemicals out of 19755 are processed. test chemical size 989
7000 chemicals out of 19755 are processed. test chemical size 1159
8000 chemicals out of 19755 are processed. test chemical size 1298
9000 chemicals out of 19755 are processed. test chemical size 1447
10000 chemicals out of 19755 are processed. test chemical size 1599
11000 chemicals out of 19755 are processed. test chemical size 1760
12000 chemicals out of 19755 are processed. test chemical size 19

In [90]:
#double check if two chemical sets are disjoint in similarity measure
chems_to_move=[]
for i,tchem in enumerate(disjoint_ikeys):
    max_sim_to_train=0.0
    if tchem in chemsimpairs.keys():
        for chem in seed_ikeys:
            if chem in chemsimpairs[tchem].keys():
                sim=chemsimpairs[tchem][chem]
                if sim>max_sim_to_train:
                    max_sim_to_train=sim
    if max_sim_to_train>0.6:
        chems_to_move.append(tchem)

disjoint_ikeys=list(set(disjoint_ikeys)-set(chems_to_move))
seed_ikeys=list(set(seed_ikeys+chems_to_move))

max_sims=[]
for i,tchem in enumerate(disjoint_ikeys):
    max_sim_to_train=0.0
    if tchem in chemsimpairs.keys():
        for chem in seed_ikeys:
            if chem in chemsimpairs[tchem].keys():
                sim=chemsimpairs[tchem][chem]
                if sim>max_sim_to_train:
                    max_sim_to_train=sim
    max_sims.append(max_sim_to_train)

print("Finally, {} seed chemicals and {} disjoint chemicals.".format(len(seed_ikeys),len(disjoint_ikeys)))
hist, edges = np.histogram(max_sims, density=False, range=(0,1),bins=20)
p = figure(tools='', background_fill_color="#fafafa",x_range=(0,1))
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="navy", line_color="white", alpha=0.5)
show(p)

Finally, 131631 seed chemicals and 3124 disjoint chemicals.


In [85]:
print(hist)
print(edges)

[2685    0    0    0    0    0    2    8   31   55  139  204    7    5
    7    6    4    0    0    0]
[0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95 1.  ]


In [49]:
kinasefile='../activities/tsv/integrated_kinases_continuous.tsv'
activities=[]
kinase_ligands=[]
kinases=[]
with open(kinasefile,'r') as f:
    next(f)
    for line in f:
        line=line.strip().split('\t')
        ikey=line[1].strip()
        uniprot=line[2].strip()
        val_type=line[3].strip()
        rel=line[4].strip()
        val=line[5].strip()
        try:
            val=float(val)
            if np.isinf(val):
                continue
            if np.isnan(val):
                continue
        except:
            #activity value cannot be converted into float; skip
            continue
        activities.append(val)
        if ikey not in kinase_ligands:
            kinase_ligands.append(ikey)
        if uniprot not in kinases:
            kinases.append(uniprot)

hist, edges = np.histogram(activities, density=False, range=(0,15),bins=150)
p = figure(tools='', background_fill_color="#fafafa",x_range=(3.0,12))
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="navy", line_color="white", alpha=0.5)
p.y_range.start = 0
p.legend.location = "center_right"
p.legend.background_fill_color = "#fefefe"
show(p)

In [70]:
np.sum(np.where(np.array(activities)<4.18,1,0))
#4.17~4.18 and 5.0 are overrepresented

74172

In [98]:
kinasefile='../activities/tsv/integrated_kinases_continuous.tsv'
#print("Finally, {} seed chemicals and {} disjoint chemicals.".format(len(seed_ikeys),len(disjoint_ikeys)))
pkipkd_cont=[]
pic50_cont=[]
kinase_binary=[]
trainfile_cont_pkipkd='kinase_activities_by_disjoint_chemicals/kinase_pki_pkd_train.tsv'
devfile_cont_pkipkd='kinase_activities_by_disjoint_chemicals/kinase_pki_pkd_dev.tsv'
testfile_cont_pkipkd='kinase_activities_by_disjoint_chemicals/kinase_pki_pkd_test.tsv'
trainfile_cont_pic50='kinase_activities_by_disjoint_chemicals/kinase_pic50_train.tsv'
devfile_cont_pic50='kinase_activities_by_disjoint_chemicals/kinase_pic50_dev.tsv'
testfile_cont_pic50='kinase_activities_by_disjoint_chemicals/kinase_pic50_test.tsv'
trainfile_bin='kinase_activities_by_disjoint_chemicals/kinase_binary_train.tsv'
devfile_bin='kinase_activities_by_disjoint_chemicals/kinase_binary_dev.tsv'
testfile_bin='kinase_activities_by_disjoint_chemicals/kinase_binary_test.tsv'
tr_pkipkd=open(trainfile_cont_pkipkd,'w')
dv_pkipkd=open(devfile_cont_pkipkd,'w')
ts_pkipkd=open(testfile_cont_pkipkd,'w')
tr_pic50=open(trainfile_cont_pic50,'w')
dv_pic50=open(devfile_cont_pic50,'w')
ts_pic50=open(testfile_cont_pic50,'w')
tr_bin=open(trainfile_bin,'w')
dv_bin=open(devfile_bin,'w')
ts_bin=open(testfile_bin,'w')

with open(kinasefile,'r') as f:
    next(f)
    for line in f:
        line=line.strip().split('\t')
        ikey=line[1].strip()
        uniprot=line[2].strip()
        val_type=line[3].strip().lower()
        rel=line[4].strip()
        val=line[5].strip()
        try:
            val=float(val)
        except:
            #activity value cannot be converted into float; skip
            continue
        
        if rel!='=':
            continue
#             #relation inequality value; only for binary data
#             if rel in ['>=','>>','>']: #activity stronger
#                 if val_type=='pIC50':
#                     if val>=6.0:
#                         #active
#                         tr_bin.write("{}\t{}\t1\n".format(ikey,uniprot))
#                     else:
#                         #undecided
#                         continue
#                 else:
#                     if val>=7.0:
#                         #active
#                         tr_bin.write("{}\t{}\t1\n".format(ikey,uniprot))
#                     else:
#                         #undecided
#                         continue
                        
#             elif rel in ['<=','<<','<']: #activity weaker
#                 if val_type=='pIC50':
#                     if val<=6.0:
#                         #inactive
#                         tr_bin.write("{}\t{}\t0\n".format(ikey,uniprot))
#                     else:
#                         #undecided
#                         continue
#                 else:
#                     if val<=7.0:
#                         #inactive
#                         tr_bin.write("{}\t{}\t0\n".format(ikey,uniprot))
#                     else:
#                         #undecided
#                         continue
        else:
            #activity in equality
            if ikey in seed_ikeys:
                #train chemicals
                
                if val_type=='pic50':
                    tr_pic50.write("{}\t{}\t{}\n".format(ikey,uniprot,val))
                    if val>=6.0:
                        #active
                        tr_bin.write("{}\t{}\t1\n".format(ikey,uniprot))
                    else:
                        #inactive
                        tr_bin.write("{}\t{}\t0\n".format(ikey,uniprot))
                else:
                    tr_pkipkd.write("{}\t{}\t{}\n".format(ikey,uniprot,val))
                    if val>=7.0:
                        #active
                        tr_bin.write("{}\t{}\t1\n".format(ikey,uniprot))
                    else:
                        #inactive
                        tr_bin.write("{}\t{}\t0\n".format(ikey,uniprot))

            elif ikey in disjoint_ikeys:
                #dev/test chemicals
                prob=np.random.random()
                if val_type=='pic50': #ic50
                    if prob<0.8: #dev
                        dv_pic50.write("{}\t{}\t{}\n".format(ikey,uniprot,val))
                        if val>=6.0: #active
                            dv_bin.write("{}\t{}\t1\n".format(ikey,uniprot))
                        else: #inactive
                            dv_bin.write("{}\t{}\t0\n".format(ikey,uniprot))
                    else: #test
                        ts_pic50.write("{}\t{}\t{}\n".format(ikey,uniprot,val))
                        if val>=6.0:
                            ts_bin.write("{}\t{}\t1\n".format(ikey,uniprot))
                        else:
                            ts_bin.write("{}\t{}\t0\n".format(ikey,uniprot))
                else: #pkipkd
                    if prob<0.8: #dev
                        dv_pkipkd.write("{}\t{}\t{}\n".format(ikey,uniprot,val))
                        if val>=7.0:#active
                            dv_bin.write("{}\t{}\t1\n".format(ikey,uniprot))
                        else:#inactive
                            dv_bin.write("{}\t{}\t0\n".format(ikey,uniprot))
                    else: #test
                        ts_pkipkd.write("{}\t{}\t{}\n".format(ikey,uniprot,val))
                        if val>=7.0:#active
                            ts_bin.write("{}\t{}\t1\n".format(ikey,uniprot))
                        else:#inactive
                            ts_bin.write("{}\t{}\t0\n".format(ikey,uniprot))
                            
tr_pkipkd.close()
dv_pkipkd.close()
ts_pkipkd.close()
tr_pic50.close()
dv_pic50.close()
ts_pic50.close()
tr_bin.close()
dv_bin.close()
ts_bin.close()

KeyboardInterrupt: 

In [99]:
!pwd

/Users/hansaimlim/DrugTargetInteraction/data/Integrated/chemicals
