In [12]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from DeepPurpose.pybiomed_helper import _GetPseudoAAC, CalculateAADipeptideComposition, \
calcPubChemFingerAll, CalculateConjointTriad, GetQuasiSequenceOrder
import torch
from torch.utils import data
from torch.autograd import Variable
# descriptastorus is an optional third-party dependency; fail early with an
# installation hint when it is missing.
try:
	from descriptastorus.descriptors import rdDescriptors, rdNormalizedDescriptors
except ImportError as err:
	# Catch only ImportError (a bare `except:` would also intercept
	# KeyboardInterrupt and unrelated failures) and chain the original error
	# so the real cause stays visible in the traceback.
	raise ImportError("Please install pip install git+https://github.com/bp-kelley/descriptastorus.") from err
from DeepPurpose.chemutils import get_mol, atom_features, bond_features, MAX_NB, ATOM_FDIM, BOND_FDIM
from subword_nmt.apply_bpe import BPE
import codecs
import pickle
import wget
from zipfile import ZipFile 
import os
import sys

from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import SequentialSampler
from torch import nn 

from tqdm import tqdm
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import mean_squared_error, roc_auc_score, average_precision_score, f1_score, log_loss
from lifelines.utils import concordance_index
from scipy.stats import pearsonr
import pickle 
# Seed the torch and NumPy global random number generators with fixed values.
torch.manual_seed(2)
np.random.seed(3)
import copy
from prettytable import PrettyTable

import os

from DeepPurpose.utils import *
from DeepPurpose.model_helper import Encoder_MultipleLayers, Embeddings        
from DeepPurpose.encoders import *

In [13]:
# Load the raw BindingDB dump (tab-separated) and keep only rows that describe
# a single-chain target and carry a ligand SMILES string.
data_path = './data//BindingDB_All.tsv'

# Malformed rows (wrong field count) are skipped with a warning.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
# of `on_bad_lines`; try the current keyword first and fall back to the legacy
# one on older pandas, where the unknown keyword raises TypeError before any
# data is read.
try:
    df = pd.read_csv(data_path, sep='\t', on_bad_lines='warn')
except TypeError:
    df = pd.read_csv(data_path, sep='\t', error_bad_lines=False)

single_chain = df['Number of Protein Chains in Target (>1 implies a multichain complex)'] == 1.0
has_smiles = df['Ligand SMILES'].notnull()
# One combined mask avoids materialising an intermediate filtered frame.
df = df[single_chain & has_smiles]

b'Skipping line 772572: expected 193 fields, saw 205\nSkipping line 772598: expected 193 fields, saw 205\n'
b'Skipping line 805291: expected 193 fields, saw 205\n'
b'Skipping line 827961: expected 193 fields, saw 265\n'
b'Skipping line 1231688: expected 193 fields, saw 241\n'
b'Skipping line 1345591: expected 193 fields, saw 241\nSkipping line 1345592: expected 193 fields, saw 241\nSkipping line 1345593: expected 193 fields, saw 241\nSkipping line 1345594: expected 193 fields, saw 241\nSkipping line 1345595: expected 193 fields, saw 241\nSkipping line 1345596: expected 193 fields, saw 241\nSkipping line 1345597: expected 193 fields, saw 241\nSkipping line 1345598: expected 193 fields, saw 241\nSkipping line 1345599: expected 193 fields, saw 241\n'
b'Skipping line 1358864: expected 193 fields, saw 205\n'
b'Skipping line 1378087: expected 193 fields, saw 241\nSkipping line 1378088: expected 193 fields, saw 241\nSkipping line 1378089: expected 193 fields, saw 241\nSkipping line 1378090: e

In [20]:
# Keep only the columns used downstream and give them short names.
# The selection and rename are chained so the rename produces a new frame
# instead of mutating a slice in place (the in-place form emits pandas'
# SettingWithCopyWarning).
selected_columns = [
    'BindingDB Reactant_set_id',
    'Ligand InChI',
    'Ligand SMILES',
    'PubChem CID',
    'UniProt (SwissProt) Primary ID of Target Chain',
    'Target Source Organism According to Curator or DataSource',
    'BindingDB Target Chain  Sequence',
    'Kd (nM)',
    'IC50 (nM)',
    'Ki (nM)',
    'EC50 (nM)',
    'kon (M-1-s-1)',
    'koff (s-1)',
    'pH',
    'Temp (C)',
]

# 'pH' is intentionally absent: it keeps its original name.
column_renames = {
    'BindingDB Reactant_set_id': 'ID',
    'Ligand SMILES': 'SMILES',
    'Ligand InChI': 'InChI',
    'PubChem CID': 'PubChem_ID',
    'UniProt (SwissProt) Primary ID of Target Chain': 'UniProt_ID',
    'BindingDB Target Chain  Sequence': 'Target Sequence',
    'Target Source Organism According to Curator or DataSource': 'Organism',
    'Kd (nM)': 'Kd',
    'IC50 (nM)': 'IC50',
    'Ki (nM)': 'Ki',
    'EC50 (nM)': 'EC50',
    'kon (M-1-s-1)': 'kon',
    'koff (s-1)': 'koff',
    'Temp (C)': 'Temp',
}

df = df[selected_columns].rename(columns=column_renames)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [21]:
# Preview the first rows of the column-trimmed, renamed frame.
df.head()

Unnamed: 0,ID,InChI,SMILES,PubChem_ID,UniProt_ID,Organism,Target Sequence,Kd,IC50,Ki,EC50,kon,koff,pH,Temp
0,1,InChI=1S/C22H24BrFN4O2/c1-28-7-5-14(6-8-28)12-...,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,3081361.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.24,,,,5.5,37.00 C
1,2,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,5327236.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.25,,,,5.5,37.00 C
2,3,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,5327235.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.41,,,,5.5,37.00 C
3,4,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,5327234.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.8,,,,5.5,37.00 C
4,5,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,3009319.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.99,,,,5.5,37.00 C


In [36]:
# Temperature values look like "37.00 C". Strip both the unit letter and the
# space before it so the remaining text is a clean number (rstrip('C') alone
# leaves a trailing space). `assign` returns a new frame, which avoids writing
# into a slice of the original DataFrame.
df = df.assign(Temp=df['Temp'].str.rstrip(' C'))

# Cache the trimmed frame so the large TSV does not have to be re-parsed.
df.to_pickle("./df.pkl")

# Non-null counts per column; placed last so the cell actually displays them.
df.count()

In [65]:
# Build the label matrix `y` plus the raw drug / target inputs.
idx_str = ['Kd', 'IC50', 'Ki','EC50','Temp']
convert_to_log = 0

# have at least uniprot or pubchem ID, and an InChI string
has_identifier = df.PubChem_ID.notnull() | df.UniProt_ID.notnull()
# Explicit copy: the label columns are rewritten below, and writing into a
# filtered view of `df` triggers SettingWithCopyWarning.
df_want = df[has_identifier & df.InChI.notnull()].copy()

# Censored measurements are stored as text such as ">10000" or "<0.1";
# drop the inequality signs so the values can be parsed as numbers.
for label in idx_str:
    df_want[label] = (
        df_want[label]
        .str.replace('>', '', regex=False)
        .str.replace('<', '', regex=False)
    )

# Parse to floats first; anything unparsable becomes NaN. `apply` returns a
# new frame, so later column writes on `y` do not touch `df_want`.
y = df_want[idx_str].apply(pd.to_numeric, errors='coerce')

if convert_to_log:
    print('Default set to logspace (nM -> p) for easier regression')
    for label in idx_str:
        # 'Temp' is a temperature, not a concentration in nM, so it must not
        # go through the nM -> p conversion.
        if label == 'Temp':
            continue
        # The conversion now runs on numeric values (it previously ran on the
        # raw strings, before numeric coercion).
        y[label] = convert_y_unit(y[label].values, 'nM', 'p')

X_drugs = df_want.SMILES.values
X_targets = df_want['Target Sequence'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [82]:
# Number of SMILES entries taken from df_want (one per retained row).
X_drugs.size

1730866

In [74]:
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer

# Fill missing label values column by column with that column's mean.
# The arguments below are SimpleImputer's defaults, spelled out for clarity.
#TODO: better imputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
y_i = imputer.fit_transform(y)

In [75]:
# The imputer result is re-wrapped as a DataFrame with the label names as
# column headers, then displayed.
y_i = pd.DataFrame(y_i, columns=idx_str)
y_i

Unnamed: 0,Kd,IC50,Ki,EC50,Temp
0,872669.450093,1.264663e+08,2.400000e-01,362878.629538,37.000000
1,872669.450093,1.264663e+08,2.500000e-01,362878.629538,37.000000
2,872669.450093,1.264663e+08,4.100000e-01,362878.629538,37.000000
3,872669.450093,1.264663e+08,8.000000e-01,362878.629538,37.000000
4,872669.450093,1.264663e+08,9.900000e-01,362878.629538,37.000000
...,...,...,...,...,...
1730861,872669.450093,1.264663e+08,1.941028e+06,152.000000,27.936324
1730862,872669.450093,1.264663e+08,1.941028e+06,601.000000,27.936324
1730863,872669.450093,1.264663e+08,1.941028e+06,12.000000,27.936324
1730864,872669.450093,1.264663e+08,1.941028e+06,402.000000,27.936324


In [86]:
# Combine the imputed labels with the drug and target inputs.
# Work on a copy: plain `df_data = y_i` only aliases the frame, so adding the
# columns below would silently add them to `y_i` as well.
df_data = y_i.copy()
df_data['SMILES'] = X_drugs
df_data['Target Sequence'] = X_targets

print('in total: ' + str(len(df_data)) + ' drug-target pairs')
df_data

in total: 1730866 drug-target pairs


Unnamed: 0,Kd,IC50,Ki,EC50,Temp,SMILES,Target Sequence
0,872669.450093,1.264663e+08,2.400000e-01,362878.629538,37.000000,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
1,872669.450093,1.264663e+08,2.500000e-01,362878.629538,37.000000,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
2,872669.450093,1.264663e+08,4.100000e-01,362878.629538,37.000000,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
3,872669.450093,1.264663e+08,8.000000e-01,362878.629538,37.000000,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
4,872669.450093,1.264663e+08,9.900000e-01,362878.629538,37.000000,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...
...,...,...,...,...,...,...,...
1730861,872669.450093,1.264663e+08,1.941028e+06,152.000000,27.936324,Oc1ccc(Br)cc1Cn1c(nc2ccc(cc12)[N+]([O-])=O)-c1...,MWRCGGRQGLCVLRRLSGGHAHHRAWRWNSNRACERALQYKLGDKI...
1730862,872669.450093,1.264663e+08,1.941028e+06,601.000000,27.936324,Oc1ccc(Br)cc1CN1C(N(Cc2cc(Br)ccc2O)c2cc(ccc12)...,MWRCGGRQGLCVLRRLSGGHAHHRAWRWNSNRACERALQYKLGDKI...
1730863,872669.450093,1.264663e+08,1.941028e+06,12.000000,27.936324,Oc1ccc(Br)cc1Cn1c(nc2ccc(cc12)[N+]([O-])=O)-c1...,MWRCGGRQGLCVLRRLSGGHAHHRAWRWNSNRACERALQYKLGDKI...
1730864,872669.450093,1.264663e+08,1.941028e+06,402.000000,27.936324,Oc1ccc(Br)cc1CN1C(N(Cc2cc(Br)ccc2O)c2cc(ccc12)...,MWRCGGRQGLCVLRRLSGGHAHHRAWRWNSNRACERALQYKLGDKI...


In [None]:
import time

# Compute every drug encoding once per distinct SMILES string and broadcast
# the result back onto all rows, storing it in a column named after the
# encoder function.
# Note: `time` here is the module imported at the top of this cell, which
# shadows the `from time import time` done in the first cell.
drug_func_list = [smiles2morgan, smiles2daylight, trans_drug, drug2emb_encoder, smiles2mpnnfeature]
#TODO: add calcPubChemFingerAll back in when it's not broken
#TODO: smiles2rdkit2d takes forever and can be added later
column_name = 'SMILES'
start = time.time()

# The set of distinct inputs does not depend on the encoder, so compute it
# once instead of twice per loop iteration.
unique_values = df_data[column_name].unique()

for func in drug_func_list:
    save_column_name = func.__name__
    unique_dict = {value: func(value) for value in unique_values}
    df_data[save_column_name] = [unique_dict[i] for i in df_data[column_name]]
    end = time.time()
    # Cumulative seconds elapsed since the loop started.
    print(end - start)

In [None]:
# Compute every protein encoding once per distinct target sequence and
# broadcast the result back onto all rows, storing it in a column named after
# the encoder function.
prot_func_list = [CalculateConjointTriad, GetQuasiSequenceOrder, trans_protein, protein2emb_encoder]
#TODO: run CalculateAADipeptideComposition and _GetPseudoAAC when time permits
column_name = 'Target Sequence'
# `time` must be the module here (this cell calls time.time()); that holds
# once the drug-encoding cell's `import time` has run.
start = time.time()

# The set of distinct sequences does not depend on the encoder, so compute it
# once instead of twice per loop iteration.
unique_sequences = df_data[column_name].unique()

for func in prot_func_list:
    save_column_name = func.__name__
    AA_dict = {sequence: func(sequence) for sequence in unique_sequences}
    df_data[save_column_name] = [AA_dict[i] for i in df_data[column_name]]
    end = time.time()
    # Cumulative seconds elapsed since the loop started.
    print(end - start)

In [9]:
# dti split
# Splits df_data into train / val / test frames according to `split_method`.
# NOTE(review): `split_method`, `random_seed` and `frac` are not defined in
# any cell visible here -- they must be set before this cell is run.

print('splitting dataset...')

#TODO: what is HTS

if split_method == 'random': 
    train, val, test = create_fold(df_data, random_seed, frac)
elif split_method == 'cold_drug':
    train, val, test = create_fold_setting_cold_drug(df_data, random_seed, frac)
elif split_method == 'HTS':
    train, val, test = create_fold_setting_cold_drug(df_data, random_seed, frac)
    # Deduplicate positives by SMILES in val/test, keep all negatives.
    # NOTE(review): this branch reads a `Label` column, which the cells that
    # build df_data above never create -- confirm before using 'HTS'.
    val = pd.concat([val[val.Label == 1].drop_duplicates(subset = 'SMILES'), val[val.Label == 0]])
    test = pd.concat([test[test.Label == 1].drop_duplicates(subset = 'SMILES'), test[test.Label == 0]])        
elif split_method == 'cold_protein':
    train, val, test = create_fold_setting_cold_protein(df_data, random_seed, frac)
elif split_method == 'repurposing_VS':
    train = df_data
    val = df_data
    test = df_data
elif split_method == 'no_split':
    print('do not do train/test split on the data for already splitted data')
    # A `return` is a SyntaxError at the top level of a cell (this code was
    # lifted from a function body). Bind the unsplit frame to all three names
    # instead so the statements below still work.
    train = val = test = df_data
else:
    raise AttributeError("Please select one of the supported split methods: random, cold_drug, HTS, cold_protein, repurposing_VS, no_split!")
    
print('Done.')

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)
    

b'Skipping line 772572: expected 193 fields, saw 205\nSkipping line 772598: expected 193 fields, saw 205\n'
b'Skipping line 805291: expected 193 fields, saw 205\n'
b'Skipping line 827961: expected 193 fields, saw 265\n'
b'Skipping line 1231688: expected 193 fields, saw 241\n'
b'Skipping line 1345591: expected 193 fields, saw 241\nSkipping line 1345592: expected 193 fields, saw 241\nSkipping line 1345593: expected 193 fields, saw 241\nSkipping line 1345594: expected 193 fields, saw 241\nSkipping line 1345595: expected 193 fields, saw 241\nSkipping line 1345596: expected 193 fields, saw 241\nSkipping line 1345597: expected 193 fields, saw 241\nSkipping line 1345598: expected 193 fields, saw 241\nSkipping line 1345599: expected 193 fields, saw 241\n'
b'Skipping line 1358864: expected 193 fields, saw 205\n'
b'Skipping line 1378087: expected 193 fields, saw 241\nSkipping line 1378088: expected 193 fields, saw 241\nSkipping line 1378089: expected 193 fields, saw 241\nSkipping line 1378090: e

KeyboardInterrupt: 

In [None]:
# Model / training hyper-parameters.
# Each value is a plain assignment: the trailing commas carried over from a
# function signature would have turned every value into a one-element tuple
# (and made the first line a syntax error).
drug_encoding = None
target_encoding = None
result_folder = "./result/"
input_dim_drug = 1024        # not used in the config below (overridden with 881)
input_dim_protein = 8420     # not used in the config below (overridden with 30)
hidden_dim_drug = 256        # not used in the config below (transformer size is used)
hidden_dim_protein = 256     # not used in the config below (transformer size is used)
cls_hidden_dims = [1024, 1024, 512]
mlp_hidden_dims_drug = [1024, 256, 64]
mlp_hidden_dims_target = [1024, 256, 64]
batch_size = 256
train_epoch = 10
test_every_X_epoch = 20
LR = 1e-4
decay = 0
transformer_emb_size_drug = 128
transformer_intermediate_size_drug = 512
transformer_num_attention_heads_drug = 8
transformer_n_layer_drug = 8
transformer_emb_size_target = 64
transformer_intermediate_size_target = 256
transformer_num_attention_heads_target = 4
transformer_n_layer_target = 2
transformer_dropout_rate = 0.1
transformer_attention_probs_dropout = 0.1
transformer_hidden_dropout_rate = 0.1
mpnn_hidden_size = 50
mpnn_depth = 3
cnn_drug_filters = [32,64,96]
cnn_drug_kernels = [4,6,8]
cnn_target_filters = [32,64,96]
cnn_target_kernels = [4,8,12]
rnn_Use_GRU_LSTM_drug = 'GRU'
rnn_drug_hid_dim = 64
rnn_drug_n_layers = 2
rnn_drug_bidirectional = True
rnn_Use_GRU_LSTM_target = 'GRU'
rnn_target_hid_dim = 64
rnn_target_n_layers = 2
rnn_target_bidirectional = True
num_workers = 0

# Single dict literal holding the final value of every key (the original
# built the dict and then re-assigned several keys, some of them twice).
base_config = {
    # --- general ---
    'input_dim_drug': 881,  # could be 2048 or 200 or 2586
    'input_dim_protein': 30,  # could be 343 or 100 or 4114
    'hidden_dim_drug': transformer_emb_size_drug,  # hidden dim of drug; could also be hidden_dim_drug
    'hidden_dim_protein': transformer_emb_size_target,  # hidden dim of protein
    'cls_hidden_dims': cls_hidden_dims,  # decoder classifier dim 1
    'batch_size': batch_size,
    'train_epoch': train_epoch,
    'test_every_X_epoch': test_every_X_epoch,
    'LR': LR,
    'drug_encoding': drug_encoding,
    'target_encoding': target_encoding,
    'result_folder': result_folder,
    'binary': False,
    'num_workers': num_workers,
    # --- drug encoders ---
    'mlp_hidden_dims_drug': mlp_hidden_dims_drug,  # MLP classifier dim 1
    'cnn_drug_filters': cnn_drug_filters,
    'cnn_drug_kernels': cnn_drug_kernels,
    'rnn_Use_GRU_LSTM_drug': rnn_Use_GRU_LSTM_drug,
    'rnn_drug_hid_dim': rnn_drug_hid_dim,
    'rnn_drug_n_layers': rnn_drug_n_layers,
    'rnn_drug_bidirectional': rnn_drug_bidirectional,
    'transformer_emb_size_drug': transformer_emb_size_drug,
    'transformer_num_attention_heads_drug': transformer_num_attention_heads_drug,
    'transformer_intermediate_size_drug': transformer_intermediate_size_drug,
    'transformer_n_layer_drug': transformer_n_layer_drug,
    # --- shared transformer dropout ---
    'transformer_dropout_rate': transformer_dropout_rate,
    'transformer_attention_probs_dropout': transformer_attention_probs_dropout,
    'transformer_hidden_dropout_rate': transformer_hidden_dropout_rate,
    # --- MPNN ---
    'mpnn_hidden_size': mpnn_hidden_size,
    'mpnn_depth': mpnn_depth,
    # --- target encoders ---
    'mlp_hidden_dims_target': mlp_hidden_dims_target,  # MLP classifier dim 1
    'cnn_target_filters': cnn_target_filters,
    'cnn_target_kernels': cnn_target_kernels,
    'rnn_Use_GRU_LSTM_target': rnn_Use_GRU_LSTM_target,
    'rnn_target_hid_dim': rnn_target_hid_dim,
    'rnn_target_n_layers': rnn_target_n_layers,
    'rnn_target_bidirectional': rnn_target_bidirectional,
    'transformer_emb_size_target': transformer_emb_size_target,
    'transformer_num_attention_heads_target': transformer_num_attention_heads_target,
    'transformer_intermediate_size_target': transformer_intermediate_size_target,
    'transformer_n_layer_target': transformer_n_layer_target,
}

# Make sure the output directory exists (no error if it already does).
os.makedirs(base_config['result_folder'], exist_ok=True)

# `config` is the same dict object as `base_config`, not a copy.
config = base_config

In [None]:

    # NOTE(review): this cell looks like the body of a model class' __init__
    # pasted in as-is. `self` is not defined in any visible cell, and the
    # five indented assignments below have no enclosing statement, so the
    # cell cannot run in this form.
    # NOTE(review): the conditions that presumably selected ONE drug encoder
    # (cf. the target_encoding chain further down) are missing; read
    # top-to-bottom, each assignment simply overwrites the previous one.
    self.model_drug = MLP(config['input_dim_drug'], config['hidden_dim_drug'], config['mlp_hidden_dims_drug'])

    self.model_drug = CNN('drug', **config)

    self.model_drug = CNN_RNN('drug', **config)

    self.model_drug = transformer('drug', **config)

    self.model_drug = MPNN(config['hidden_dim_drug'], config['mpnn_depth'])

# Pick the protein encoder from `target_encoding`; any value other than the
# ones listed falls through to the AttributeError below.
if target_encoding == 'AAC' or target_encoding == 'PseudoAAC' or  target_encoding == 'Conjoint_triad' or target_encoding == 'Quasi-seq':
    self.model_protein = MLP(config['input_dim_protein'], config['hidden_dim_protein'], config['mlp_hidden_dims_target'])
elif target_encoding == 'CNN':
    self.model_protein = CNN('protein', **config)
elif target_encoding == 'CNN_RNN':
    self.model_protein = CNN_RNN('protein', **config)
elif target_encoding == 'Transformer':
    self.model_protein = transformer('protein', **config)
else:
    raise AttributeError('Please use one of the available encoding method.')

# Combine both encoders into the joint model and record the run settings.
self.model = Classifier(self.model_drug, self.model_protein, **config)
self.config = config
# Use the GPU when torch reports one, otherwise the CPU.
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

self.drug_encoding = drug_encoding
self.target_encoding = target_encoding
self.result_folder = config['result_folder']
# Create the result folder if it does not exist yet.
if not os.path.exists(self.result_folder):
    os.mkdir(self.result_folder)            
self.binary = False
# Fill in defaults for optional config entries.
if 'num_workers' not in self.config.keys():
    self.config['num_workers'] = 0
if 'decay' not in self.config.keys():
    self.config['decay'] = 0