In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from DeepPurpose.pybiomed_helper import _GetPseudoAAC, CalculateAADipeptideComposition, \
calcPubChemFingerAll, CalculateConjointTriad, GetQuasiSequenceOrder
import torch
from torch.utils import data
from torch.autograd import Variable
try:
	from descriptastorus.descriptors import rdDescriptors, rdNormalizedDescriptors
except:
	raise ImportError("Please install pip install git+https://github.com/bp-kelley/descriptastorus.")
from DeepPurpose.chemutils import get_mol, atom_features, bond_features, MAX_NB, ATOM_FDIM, BOND_FDIM
from subword_nmt.apply_bpe import BPE
import codecs
import pickle
import wget
from zipfile import ZipFile 
import os
import sys

from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import SequentialSampler
from torch import nn 

from tqdm import tqdm
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import mean_squared_error, roc_auc_score, average_precision_score, f1_score, log_loss
from lifelines.utils import concordance_index
from scipy.stats import pearsonr
import pickle 
torch.manual_seed(2)
np.random.seed(3)
import copy
from prettytable import PrettyTable

import os

from DeepPurpose.utils import *
from DeepPurpose.model_helper import Encoder_MultipleLayers, Embeddings        
from DeepPurpose.encoders import *
from DeepPurpose import DTI

In [2]:
data_path = './data//BindingDB_All.tsv'
df = pd.read_csv(data_path, sep = '\t', error_bad_lines=False)
df = df[df['Number of Protein Chains in Target (>1 implies a multichain complex)'] == 1.0]
df = df[df['Ligand SMILES'].notnull()]

KeyboardInterrupt: 

In [20]:
df = df[['BindingDB Reactant_set_id', 'Ligand InChI', 'Ligand SMILES',\
                  'PubChem CID', 'UniProt (SwissProt) Primary ID of Target Chain',\
                  'Target Source Organism According to Curator or DataSource',\
                  'BindingDB Target Chain  Sequence', 'Kd (nM)', 'IC50 (nM)', 'Ki (nM)',\
                  'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)','pH','Temp (C)']]
df.rename(columns={'BindingDB Reactant_set_id':'ID',
                        'Ligand SMILES':'SMILES',
                        'Ligand InChI':'InChI',
                        'PubChem CID':'PubChem_ID',
                        'UniProt (SwissProt) Primary ID of Target Chain':'UniProt_ID',
                        'BindingDB Target Chain  Sequence': 'Target Sequence',
                        'Target Source Organism According to Curator or DataSource': 'Organism',
                        'Kd (nM)':'Kd',
                        'IC50 (nM)':'IC50',
                        'Ki (nM)':'Ki',
                        'EC50 (nM)':'EC50',
                        'kon (M-1-s-1)':'kon',
                        'koff (s-1)':'koff',
                        'Temp (C)':'Temp'}, 
                        inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [21]:
df.head()

Unnamed: 0,ID,InChI,SMILES,PubChem_ID,UniProt_ID,Organism,Target Sequence,Kd,IC50,Ki,EC50,kon,koff,pH,Temp
0,1,InChI=1S/C22H24BrFN4O2/c1-28-7-5-14(6-8-28)12-...,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,3081361.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.24,,,,5.5,37.00 C
1,2,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,5327236.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.25,,,,5.5,37.00 C
2,3,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,5327235.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.41,,,,5.5,37.00 C
3,4,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,5327234.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.8,,,,5.5,37.00 C
4,5,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,3009319.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.99,,,,5.5,37.00 C


In [38]:
df['Temp'] = df['Temp'].str.rstrip('C')
df.count()

ID                 1733850
InChI              1733282
SMILES             1733850
PubChem_ID         1718479
UniProt_ID         1538086
Organism           1238470
Target Sequence    1733850
Kd                   74761
IC50               1080811
Ki                  417859
EC50                164210
kon                    654
koff                   524
pH                  204919
Temp                191364
dtype: int64

In [36]:
df.to_pickle("./df.pkl")

In [3]:
df = pd.read_pickle("./df.pkl")

In [37]:
idx_str = ['IC50']
df_want = df
convert_to_log = 1

# have at least uniprot or pubchem ID
df_want = df_want[df_want.PubChem_ID.notnull() | df_want.UniProt_ID.notnull()]
df_want = df_want[df_want.InChI.notnull()]
df_want = df_want[df_want['Temp'].notnull()]
df_want = df_want[df_want['pH'].notnull()]
df_want = df_want[df_want['IC50'].notnull()]

for label in idx_str:
#    print(df_want.size)
    df_want[label] = df_want[label].str.replace('>', '')
    df_want[label] = df_want[label].str.replace('<', '')
    #df_want[label] = df_want[label].astype(float)
#    df_want = df_want[df_want[label] <= 10000000.0]

y = df_want[idx_str]
for label in idx_str:
    if convert_to_log:
            print('Default set to logspace (nM -> p) for easier regression')
            y[label] = convert_y_unit(df_want[label].values, 'nM', 'p') 
    else:
            y[label] = df_want[label].values

y['pH'] = df_want['pH']
X_drugs = df_want.SMILES.values
X_targets = df_want['Target Sequence'].values
y = y.apply(pd.to_numeric, errors='coerce')

Default set to logspace (nM -> p) for easier regression


TypeError: can't multiply sequence by non-int of type 'float'

In [15]:
df_want

Unnamed: 0,ID,InChI,SMILES,PubChem_ID,UniProt_ID,Organism,Target Sequence,Kd,IC50,Ki,EC50,kon,koff,pH,Temp
180,181,InChI=1/C31H51N5O5/c1-20(2)28(32-22(5)37)30(40...,CC(C)[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccccc1)[C@@...,65023.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,,8.5,,,,,6.0,37.00
181,182,InChI=1/C33H55N5O7/c1-7-44-32(42)35-28(22(3)4)...,CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)[...,461984.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,,177,,,,,6.0,37.00
183,184,InChI=1/C35H59N5O9/c1-24(2)30(37-34(44)48-19-1...,COCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1...,461988.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,,164,,,,,6.0,37.00
184,185,InChI=1/C39H67N5O11/c1-28(2)34(41-38(48)54-23-...,COCCOCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccc...,461990.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,,67,,,,,6.0,37.00
185,186,InChI=1/C38H51N7O7/c1-24(2)34(43-38(51)52-3)37...,COC(=O)N[C@@H](C(C)C)C(=O)NN(C[C@H](O)[C@H](Cc...,461985.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,,27,,,,,6.0,37.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062791,50319347,InChI=1S/C26H33N3O2/c1-27(18-15-21-11-5-4-6-12...,CN(CCCCCCn1c(O)c2Cc3ccccc3Cn2c1=O)CCc1ccccc1,91936612.0,Q99720,Homo sapiens,MQWAVGRRWAWAALLLAVAAVLTQVVWLWLGTQSFVFQREEIAQLA...,,29.5,,,,,7.5,22.00
1751877,51029634,"InChI=1S/C38H65NO12/c1-15-26-38(10,45-14)31(41...",CCC1OC(=O)C(C)C(O[C@H]2C[C@@](C)(OC)[C@@H](O)C...,44276817.0,,,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...,,8.9,,,,,2.5,25.00
1751879,51029639,"InChI=1S/C37H67NO13/c1-14-25-37(10,45)30(41)20...",CCC1OC(=O)C(C)C(O[C@H]2C[C@@](C)(OC)[C@@H](O)C...,10897911.0,,,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...,,71,,,,,2.5,25.00
1751880,51029642,"InChI=1/C37H65NO12/c1-14-25-37(10,43)30(40)20(...",CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,83954.0,,,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...,,224,,,,,2.5,25.00


In [5]:
X_org = df_want['Organism']
y

Unnamed: 0,IC50,Temp,pH,SMILES,Target Sequence
180,8.5,37.0,6.0,CC(C)[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccccc1)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
181,177.0,37.0,6.0,CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)[...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
183,164.0,37.0,6.0,COCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
184,67.0,37.0,6.0,COCCOCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccc...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
185,27.0,37.0,6.0,COC(=O)N[C@@H](C(C)C)C(=O)NN(C[C@H](O)[C@H](Cc...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
...,...,...,...,...,...
1062791,29.5,22.0,7.5,CN(CCCCCCn1c(O)c2Cc3ccccc3Cn2c1=O)CCc1ccccc1,MQWAVGRRWAWAALLLAVAAVLTQVVWLWLGTQSFVFQREEIAQLA...
1751877,8.9,25.0,2.5,CCC1OC(=O)C(C)C(O[C@H]2C[C@@](C)(OC)[C@@H](O)C...,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...
1751879,71.0,25.0,2.5,CCC1OC(=O)C(C)C(O[C@H]2C[C@@](C)(OC)[C@@H](O)C...,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...
1751880,224.0,25.0,2.5,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...


In [11]:
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer

imputer = SimpleImputer() #TODO: better imputer
y_i = imputer.fit_transform(y)

In [6]:
y_i = pd.DataFrame(data=y,columns=idx_str+['pH'])
y_i

Unnamed: 0,IC50,Temp,pH
180,8.5,37.0,6.0
181,177.0,37.0,6.0
183,164.0,37.0,6.0
184,67.0,37.0,6.0
185,27.0,37.0,6.0
...,...,...,...
1062791,29.5,22.0,7.5
1751877,8.9,25.0,2.5
1751879,71.0,25.0,2.5
1751880,224.0,25.0,2.5


In [6]:
df_data = y
df_data['SMILES'] = X_drugs
df_data['Target Sequence'] = X_targets
df_data['Organism'] = X_org

print('in total: ' + str(len(df_data)) + ' drug-target pairs')
df_data

in total: 93309 drug-target pairs


Unnamed: 0,IC50,Temp,pH,SMILES,Target Sequence,Organism
180,8.5,37.0,6.0,CC(C)[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccccc1)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1
181,177.0,37.0,6.0,CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)[...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1
183,164.0,37.0,6.0,COCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1
184,67.0,37.0,6.0,COCCOCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccc...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1
185,27.0,37.0,6.0,COC(=O)N[C@@H](C(C)C)C(=O)NN(C[C@H](O)[C@H](Cc...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1
...,...,...,...,...,...,...
1062791,29.5,22.0,7.5,CN(CCCCCCn1c(O)c2Cc3ccccc3Cn2c1=O)CCc1ccccc1,MQWAVGRRWAWAALLLAVAAVLTQVVWLWLGTQSFVFQREEIAQLA...,Homo sapiens
1751877,8.9,25.0,2.5,CCC1OC(=O)C(C)C(O[C@H]2C[C@@](C)(OC)[C@@H](O)C...,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...,
1751879,71.0,25.0,2.5,CCC1OC(=O)C(C)C(O[C@H]2C[C@@](C)(OC)[C@@H](O)C...,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...,
1751880,224.0,25.0,2.5,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...,MGSPWNGSDGPEDAREPPWAALPPCDERRCSPFPLGTLVPVTAVCL...,


In [20]:
df_backup = df_data
df_data = df_data.head(1)

In [9]:
df_backup.IC50

0          1.264663e+08
1          1.264663e+08
2          1.264663e+08
3          1.264663e+08
4          1.264663e+08
               ...     
1730861    1.264663e+08
1730862    1.264663e+08
1730863    1.264663e+08
1730864    1.264663e+08
1730865    1.264663e+08
Name: IC50, Length: 1730866, dtype: float64

In [67]:
import math
def magnitude(x):
    if x > 0:
        return int(math.floor(math.log10(x)))
    else:
        return 0

magic50 = df_data['IC50'].apply(magnitude)

In [20]:
df_foo = df_data
df_foo['magic50'] = magic50
chaindf = df_foo.groupby(by=['magic50']).count()
chaindf

Unnamed: 0_level_0,IC50,Temp,pH,SMILES,Target Sequence,Organism
magic50,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-4,1,1,1,1,1,0
-3,113,113,113,113,113,15
-2,262,262,262,262,262,148
-1,4632,4632,4632,4632,4632,2123
0,17711,17719,17719,17719,17719,10094
1,23355,23355,23355,23355,23355,13384
2,20336,20336,20336,20336,20336,12008
3,12899,12899,12899,12899,12899,7538
4,9646,9646,9646,9646,9646,6161
5,3972,3972,3972,3972,3972,2079


In [7]:
del df_want
del df

In [21]:
import time

drug_func_list= [smiles2morgan,drug2emb_encoder,calcPubChemFingerAll,smiles2daylight]
#TODO: add calcPubChemFingerAll back in when it's not broken
#TODO: smiles2rdkit2d takes forever and can be added later
#TODO: smiles2mpnnfeature doesn't take super long (around 40 min on desktop) but can be added later
#TODO: same wrt smiles2daylight
column_name = 'SMILES'
start = time.time()

for func in drug_func_list:
    save_column_name = func.__name__
    unique = pd.Series(df_data[column_name].unique()).apply(func)
    unique_dict = dict(zip(df_data[column_name].unique(), unique))
    df_data[save_column_name] = [unique_dict[i] for i in df_data[column_name]]
    end = time.time()
    print(end - start)

0.04999661445617676
0.05501055717468262
0.10199880599975586
0.11101531982421875


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [22]:
prot_func_list = [CalculateConjointTriad, protein2emb_encoder,target2quasi]
#TODO: run CalculateAADipeptideComposition and _GetPseudoAAC when time permits
#TODO: GetQuasiSequenceOrder is broken
#prot_func_list = [GetQuasiSequenceOrder]
column_name = 'Target Sequence'
start = time.time()

for func in prot_func_list:
    save_column_name = func.__name__
    AA = pd.Series(df_data[column_name].unique()).apply(func)
    AA_dict = dict(zip(df_data[column_name].unique(), AA))
    df_data[save_column_name] = [AA_dict[i] for i in df_data[column_name]]
    end = time.time()
    print(end - start)

0.002000570297241211
0.0070002079010009766
0.016000032424926758


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [12]:
df_data.to_pickle("./df_data_small.pkl")

In [12]:
df_data = pd.read_pickle("./df_data_small3.pkl")

In [3]:
import tables

ImportError: DLL load failed: The specified module could not be found.

In [23]:
import sys
np.set_printoptions(threshold=sys.maxsize)

df_data.to_csv('df_data_ph.csv.gz'
         , sep='|'
         , header=True
         , index=False
         , chunksize=5000
         , compression='gzip'
         , encoding='utf-8')

In [6]:
df_data = pd.read_csv('df_data_ph.csv.gz'
         , sep='|'
         , compression='gzip'
         , encoding='utf-8')

In [72]:
df_data.head()

Unnamed: 0,IC50,Temp,pH,SMILES,Target Sequence,Organism,smiles2morgan,drug2emb_encoder,calcPubChemFingerAll,smiles2daylight,CalculateConjointTriad,protein2emb_encoder,target2quasi
180,8.5,37.0,6.0,CC(C)[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccccc1)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([2266, 117, 72, 124, 339, 295, 186, 277, 1880...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...","[0.000344, 0.000459, 0.000459, 0.000459, 0.000..."
181,177.0,37.0,6.0,CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)[...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([240, 339, 1416, 295, 186, 277, 1880, 1436, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...","[0.000344, 0.000459, 0.000459, 0.000459, 0.000..."
183,164.0,37.0,6.0,COCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([2227, 339, 1416, 295, 186, 277, 1880, 1436, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...","[0.000344, 0.000459, 0.000459, 0.000459, 0.000..."
184,67.0,37.0,6.0,COCCOCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccc...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([867, 486, 339, 1416, 295, 186, 277, 1880, 14...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...","[0.000344, 0.000459, 0.000459, 0.000459, 0.000..."
185,27.0,37.0,6.0,COC(=O)N[C@@H](C(C)C)C(=O)NN(C[C@H](O)[C@H](Cc...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 5.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([92, 2346, 199, 179, 1397, 206, 763, 124, 81,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...","[0.000344, 0.000459, 0.000459, 0.000459, 0.000..."


In [23]:
#turn categorical variables into numerical dummy variables for modeling

cat_list = pd.get_dummies(df_data['Organism'], prefix='var')
df_data=df_data.join(cat_list)

In [None]:
df_data1

In [24]:
discard=['SMILES','Target Sequence','Organism']
#discard=['SMILES','Target Sequence','Organism','trans_drug','trans_protein','pH_raw','Kd_raw','IC50_raw','Ki_raw','EC50_raw','Temp_raw']
df_vars=df_data.columns.values.tolist()
to_keep=[i for i in df_vars if i not in discard]
df_final=df_data[to_keep]
df_final.head()

Unnamed: 0,IC50,Temp,pH,smiles2morgan,drug2emb_encoder,calcPubChemFingerAll,smiles2daylight,CalculateConjointTriad,protein2emb_encoder,target2quasi,var_Human immunodeficiency virus 1
180,8.5,37.0,6.0,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([2266, 117, 72, 124, 339, 295, 186, 277, 1880...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...","[0.000344, 0.000459, 0.000459, 0.000459, 0.000...",1


In [25]:
#X = df_final.drop(["IC50","magic50","Kd","Ki","EC50","smiles2mpnnfeature"], axis=1)
X = df_final.drop(["IC50"], axis=1)
#X2 = X

In [53]:
X["smiles2morgan"].iloc[1]

['0.',
 '6.',
 '1.',
 '0.',
 '3.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '1.',
 '0.',
 '0.',
 '0.',
 '6.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '2.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '3.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '1.',
 '5.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '2.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '3.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '4.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',
 '0.',

In [30]:
import collections

In [31]:
def flattener(x):
    if isinstance(x, collections.Iterable):
        return [a for i in x for a in flattener(i)]
    else:
        return [x]

smiles2morgan : <class 'str'>
drug2emb_encoder : <class 'str'>
calcPubChemFingerAll : <class 'str'>
smiles2daylight : <class 'str'>
CalculateConjointTriad : <class 'str'>
protein2emb_encoder : <class 'str'>
target2quasi : <class 'str'>

In [62]:
from array import array

X['drug2emb_encoder'].apply(eval)

NameError: name 'array' is not defined

In [54]:
from ast import literal_eval

for n in range(len(X)):
    #X['CalculateConjointTriad'][n] = X['CalculateConjointTriad'][n].str.strip('[]').str.split()
    #X['smiles2morgan'][n] = X['smiles2morgan'][n].str.strip('[.]').str.split()
    X['drug2emb_encoder'][n] = literal_eval(X['drug2emb_encoder'][n])
    X['calcPubChemFingerAll'][n] = X['calcPubChemFingerAll'][n].str.strip('[]').str.split()
    X['smiles2daylight'][n] = X['smiles2daylight'][n].str.strip('[.]').str.split()
    X['protein2emb_encoder'][n] = literal_eval(X['protein2emb_encoder'][n])
    X['target2quasi'][n] = X['target2quasi'][n].str.strip('[]').str.split()
X.head()

ValueError: malformed node or string: <_ast.Call object at 0x000002342F7CA388>

In [27]:
typelist = []
for i in list(X):
    print(i, ':', type(X[i].iloc[0]))#, ':', X[i].iloc[1])
    if isinstance(X[i].iloc[0],np.ndarray):
        print(len(X[i].iloc[0]))
        typelist.extend([i]*len(X[i].iloc[0]))
    elif isinstance(X[i].iloc[0],tuple):
        for n in X[i].iloc[0]:
            if isinstance(n,np.ndarray):
                print(len(n))
                typelist.extend([i]*len(n))
    elif i[:4] == 'var_':
        typelist.append('organism')
    else:
        typelist.append(i)

Temp : <class 'numpy.float64'>
pH : <class 'numpy.float64'>
smiles2morgan : <class 'numpy.ndarray'>
1024
drug2emb_encoder : <class 'tuple'>
50
50
calcPubChemFingerAll : <class 'numpy.ndarray'>
881
smiles2daylight : <class 'numpy.ndarray'>
2048
CalculateConjointTriad : <class 'numpy.ndarray'>
343
protein2emb_encoder : <class 'tuple'>
545
545
target2quasi : <class 'numpy.ndarray'>
100
var_Human immunodeficiency virus 1 : <class 'numpy.uint8'>


In [81]:
Z_df = np.empty(shape=[len(X),len(flattener(X.iloc[0]))])
for n in range(len(X)):
    Z_df[n] = flattener(X.iloc[n])

In [84]:
np.savetxt("Z_df.csv.gz", Z_df, delimiter=",")

In [8]:
Z_df=pd.read_csv('Z_df.csv.gz', sep=',',header=None)

In [3]:
vecx = X.iloc[0:100].apply(flattener,axis=1)

NameError: name 'X' is not defined

In [23]:
vec = X.apply(flattener,axis=1)

In [16]:
len(Z_df)

93309

In [12]:
model = DTI.model_pretrained(path_dir = './model-9-24')

In [5]:
from DeepPurpose import utils, dataset, CompoundPred
from DeepPurpose import DTI as models

data_path = './data//BindingDB_All.tsv'
X_drugs, X_targets, y = dataset.process_BindingDB(path = data_path, df = None, y = 'IC50', binary = False, convert_to_log = True, threshold = 0)
drug_encoding, target_encoding = 'Morgan', 'Conjoint_triad'
train_all, val, test = utils.data_process(X_drugs, X_targets, y, 
                                drug_encoding, target_encoding, 
                                split_method='random',frac=[1,0,0])

Loading Dataset from path...


b'Skipping line 772572: expected 193 fields, saw 205\nSkipping line 772598: expected 193 fields, saw 205\n'
b'Skipping line 805291: expected 193 fields, saw 205\n'
b'Skipping line 827961: expected 193 fields, saw 265\n'
b'Skipping line 1231688: expected 193 fields, saw 241\n'
b'Skipping line 1345591: expected 193 fields, saw 241\nSkipping line 1345592: expected 193 fields, saw 241\nSkipping line 1345593: expected 193 fields, saw 241\nSkipping line 1345594: expected 193 fields, saw 241\nSkipping line 1345595: expected 193 fields, saw 241\nSkipping line 1345596: expected 193 fields, saw 241\nSkipping line 1345597: expected 193 fields, saw 241\nSkipping line 1345598: expected 193 fields, saw 241\nSkipping line 1345599: expected 193 fields, saw 241\n'
b'Skipping line 1358864: expected 193 fields, saw 205\n'
b'Skipping line 1378087: expected 193 fields, saw 241\nSkipping line 1378088: expected 193 fields, saw 241\nSkipping line 1378089: expected 193 fields, saw 241\nSkipping line 1378090: e

Beginning Processing...
There are 1073803 drug target pairs.
Default set to logspace (nM -> p) for easier regression
Drug Target Interaction Prediction Mode...
in total: 1073803 drug-target pairs
encoding drug...
unique drugs: 549205
rdkit not found this smiles for morgan: CC(C)(C)OC(=O)N1CC(=O)N(C(=O)C1)c1ccc(cc1)N1CC(COC(=O)[N]2=CC=C(Cl)S2)OC1=O convert to all 0 features
rdkit not found this smiles for morgan: CSc1ccc(cc1)C1=C(C=C[N]([O-])=C1)[C@@H]1CCC(F)(F)C[C@H]1C(=O)NCC#N convert to all 0 features
rdkit not found this smiles for morgan: O=C1NC(=O)c2c1c1c3ccccc3n3[Ru](C#[O])[n+]4cccc2c4c13 convert to all 0 features
rdkit not found this smiles for morgan: CN1C(=O)c2c(C1=O)c1cc(F)c[n+]3[Ru](C#[O])n4c5ccc(O)cc5c2c4c13 convert to all 0 features
rdkit not found this smiles for morgan: CCC=[C]1=CC=C(C=C1)N1CC(C1)Oc1ccc(cc1)[C@H](C)NC(=O)c1ccncc1 convert to all 0 features
rdkit not found this smiles for morgan: NOOSc1ccc(CC[N]23CC4=CC=CC=[N]4[Re+]2[N]2=C(C3)C=CC=C2)cc1 convert to all 0 f

rdkit not found this smiles for morgan: CC(C)n1c(Nc2ccccc2)nc2cnc(Nc3ccc(cc3)C(=O)N[N]3=CCN(C)CC3)nc12 convert to all 0 features
rdkit not found this smiles for morgan: CC(C)n1c(Nc2cccc(NC(C)=O)c2)nc2cnc(Nc3ccc(cc3F)C(=O)N[N]3=CCN(C)CC3)nc12 convert to all 0 features
rdkit not found this smiles for morgan: CC[C@@H](N[C@@H](C)CC(N)=O)c1ccc(Cl)c(C(=O)C2=CC=[N]([O-])C=C2)c1F convert to all 0 features
rdkit not found this smiles for morgan: CC1=N[N](C)=C(C)C1N[S+]([O-])(=O)c1c(Cl)cc(cc1Cl)-c1cccc2CNCCc12 convert to all 0 features
rdkit not found this smiles for morgan: C[N]1=C(CC=N1)Nc1cc(ncc1C(=O)NC[C@@H](F)C(C)(C)O)-n1ccc2cc(cnc12)C#N convert to all 0 features
rdkit not found this smiles for morgan: COc1ccc(cn1)-c1cc(NC(=O)C2=C3N=CC=C[N]3=NC2)n(n1)-c1ccc(C)cc1 convert to all 0 features
rdkit not found this smiles for morgan: Cc1ccc(cc1)-n1nc(cc1NC(=O)C1=C2N=CC=C[N]2=NC1)C1=CCN(CC1)S(C)(=O)=O convert to all 0 features
rdkit not found this smiles for morgan: CC(C)n1ncc2c(cc(C)nc12)C(=O)NCc

rdkit not found this smiles for morgan: CCc1ccc(Cl)cc1-c1nc(cc1C#N)-c1cc(N)ncn1 convert to all 0 features
rdkit not found this smiles for morgan: Cc1ccc(Cl)cc1-c1cc(nc1C(N)=O)-c1ncnc2[nH]ccc12 convert to all 0 features
rdkit not found this smiles for morgan: CCc1ccc(Cl)cc1-c1cc(nc1C(N)=O)-c1ncnc2[nH]ccc12 convert to all 0 features
rdkit not found this smiles for morgan: CCc1ccc(cc1-c1cc(nc1C(N)=O)-c1ncnc2[nH]ccc12)C(F)(F)F convert to all 0 features
rdkit not found this smiles for morgan: CNc1cc(ncn1)-c1cc(C(N)=O)c(n1)-c1cc(Cl)ccc1C convert to all 0 features
rdkit not found this smiles for morgan: Cc1ccc(Cl)cc1-c1nc(cc1C(N)=O)-c1ncnc2[nH]ccc12 convert to all 0 features
rdkit not found this smiles for morgan: Cc1ccc(Cl)cc1-c1nc(cc1C(N)=O)-c1nc[nH]c2nccc12 convert to all 0 features
rdkit not found this smiles for morgan: CCc1ccc(Cl)cc1-c1nc(cc1C(N)=O)-c1cc(N)ncn1 convert to all 0 features
rdkit not found this smiles for morgan: CCc1ccc(Cl)cc1-c1nc(cc1C(N)=O)-c1ncnc2[nH]ccc12 convert to al

rdkit not found this smiles for morgan: Cc1[nH]nn(C)c1-c1cnc2c(c1)n(C(C1CCOCC1)c1ccc(Cl)cc1)c1cc(ccc21)C(C)(C)O convert to all 0 features
rdkit not found this smiles for morgan: Cc1[nH]nn(C)c1-c1cnc2c(c1)n(C(C1CCOCC1)c1cccc(Cl)c1)c1cc(ccc21)C(C)(C)O convert to all 0 features
rdkit not found this smiles for morgan: Cc1[nH]nn(C)c1-c1cnc2c(c1)n(C(C1CCOCC1)c1ccccc1Cl)c1cc(ccc21)C(C)(C)O convert to all 0 features
rdkit not found this smiles for morgan: Cc1[nH]nn(C)c1-c1cnc2c(c1)n([C@@H](C1CCOCC1)c1ccccc1)c1cc(ccc21)C(C)(O)C1CC1 convert to all 0 features
rdkit not found this smiles for morgan: Cc1[nH]nn(C)c1-c1cnc2c(c1)n([C@@H](C1CCOCC1)c1ccccc1)c1c(F)c(ccc21)[C@](C)(O)C1CC1 convert to all 0 features
rdkit not found this smiles for morgan: Cc1[nH]nn(C)c1-c1cnc2c(c1)n([C@@H](C1CCOCC1)c1ccccc1)c1c(F)c(ccc21)[C@@](C)(O)C1CC1 convert to all 0 features
rdkit not found this smiles for morgan: Cc1[nH]nn(C)c1-c1cnc2c(c1)n([C@@H](C1CCOCC1)c1ccccc1F)c1c(F)c(ccc21)C(C)(O)C1CC1 convert to all 0 features

rdkit not found this smiles for morgan: CC(C)(O)c1cc(O[C@H]2CC[C@H](CC2)N2C[N](CC#N)(C2)n2cc(cn2)-c2ncnc3[nH]ccc23)nc(c1)C(F)(F)F convert to all 0 features
rdkit not found this smiles for morgan: COc1cc(OC)cc(c1)C(\O)=C1\CC=c2ncc(cc2=[N]1CCO)-c1cnn(C)c1 convert to all 0 features
rdkit not found this smiles for morgan: CCOC(=O)c1cn(-c2ccc(N3CCNC3=O)c(F)c2)c(=O)n(Cc2cccc(c2C)N([O-])=O)c1=O convert to all 0 features
rdkit not found this smiles for morgan: Cc1c(Cn2c(=O)c(cn(-c3ccc(N4CCNC4=O)c(F)c3)c2=O)C(O)=O)cccc1N([O-])=O convert to all 0 features
rdkit not found this smiles for morgan: Cc1c(Cn2c(=O)c(cn(-c3ccc(cc3)N3CCOC3=O)c2=O)C(O)=O)cccc1N([O-])=O convert to all 0 features
rdkit not found this smiles for morgan: CC(Nc1nc(C)nc(N)c1C#N)C1=Nc2cccc(Cl)c2C(=O)[N@@]1(c1ccccc1)c1cc(F)cc(F)c1 convert to all 0 features
rdkit not found this smiles for morgan: CC(Nc1nc(N)nc(Cl)c1C#N)C1=Nc2cccc(Cl)c2C(=O)[N@@]1(c1ccccc1)c1cc(F)cc(F)c1 convert to all 0 features
rdkit not found this smiles for mor

rdkit not found this smiles for morgan: C[N]1=CC=[N](C=C1)c1ccc(cc1)-c1nc2c(ncnc2[nH]1)-c1ccc(OC2CCOCC2)c(c1)C#N convert to all 0 features
rdkit not found this smiles for morgan: Oc1ccc(cc1)[B]1234[B]567[B]89%10[B]%11%12%13[B]585[B]%118%11[B]%12%12%14[B]9%139[B]16%10[C]2%129[C]38%14[B]475%11 convert to all 0 features
rdkit not found this smiles for morgan: Oc1ccc(cc1)[B]1234[B]567[B]89%10[B]%11%12%13[B]%14%15%16[B]%11%11%17[B]8%128[B]159[C]2%118[B]3%14%17[B]46%15[C]7%10%13%16 convert to all 0 features
rdkit not found this smiles for morgan: Oc1ccc(cc1)[B]1234[B]567[B]89%10[B]55%11[B]88%12[B]%13%14%15[B]11([B]269[C]%108%131)[B]3%141[B]475[C]%11%12%151 convert to all 0 features
rdkit not found this smiles for morgan: Oc1ccc(cc1)[C]1234[B]567[B]89%10[B]55%11[B]%12%13%14[B]%15%16%17[B]88([B]169[B]2%158[B]3%12%16[B]475%13)[C]%10%11%14%17 convert to all 0 features
rdkit not found this smiles for morgan: Oc1ccc(cc1)[B]1234[B]567[B]89%10[B]55%11[B]161[B]556[B]211[B]323[B]478[B]922[C]%10%115[C]

In [5]:
idxlist = df_want.index
train_true = train_all.iloc[idxlist[idxlist<len(train_all)]]

NameError: name 'train_all' is not defined

In [9]:
train_true

Unnamed: 0,SMILES,Target Sequence,Label,drug_encoding,target_encoding
180,CCNC(=O)[C@@H](NC(=O)c1ccccc1Oc1ccccc1)[C@@H]1...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,6.795609,"[0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ..."
181,CCNC(=O)[C@@H](NC(=O)c1ccccc1Nc1ccccc1)[C@@H]1...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,5.318750,"[0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ..."
183,CC1(C)S[C@@H](N[C@H]1C(=O)NCCNC(=O)[C@@H]1N[C@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,9.187087,"[0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ..."
184,CC1(C)S[C@@H](N[C@H]1C(=O)NCCNC(=O)[C@@H]1N[C@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,8.119186,"[0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ..."
185,CC1(C)S[C@@H](N[C@H]1C(=O)NCCNC(=O)[C@@H]1N[C@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,7.850781,"[0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ..."
...,...,...,...,...,...
746326,ONC(=O)c1cc(CCCCC(=O)Nc2ccc(cc2)-c2ccccc2)on1,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,5.130176,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4, 1, 4, 3, 1, 4, 0, 3, 4, 7, 7, 1, 3, 1, 2, ..."
746327,ONC(=O)c1cc(CCCCC(=O)NC2CCCCC2)on1,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,4.522877,"[0.0, 0.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","[4, 1, 4, 3, 1, 4, 0, 3, 4, 7, 7, 1, 3, 1, 2, ..."
1062788,Cc1nc(Oc2ccc(cc2)C(O)=O)ccc1CN1CCC(CC1)N1[C@@H...,MNPTDIADTTLDESIYSNYYLYESIPKPCTKEGIKAFGELFLPPLY...,6.247107,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3, 6, 0, 0, 1, 1, 0, 8, 9, 7, 3, 1, 0, 2, 0, ..."
1062790,CNC(=O)c1ccc(CN2CCC(CC2)N2[C@@H](CN(C3CCCCC3)C...,MDYQVSSPIYDINYYTSEPCQKINVKQIAARLLPPLYSLVFIFGFV...,7.793174,"[0.0, 0.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","[4, 3, 1, 3, 2, 1, 0, 5, 11, 4, 2, 1, 1, 1, 1,..."


In [9]:
ye = test_ypred[idxlist[idxlist<1100000]]

In [10]:
len(ye)

93305

In [22]:
type(train_all.iloc[0].to_frame())

pandas.core.frame.DataFrame

In [24]:
y_pred = model.predict(train_all)

predicting...


In [60]:
y_pred 

NameError: name 'y_pred' is not defined

In [26]:
np.save("y_pred.csv.gz",y_pred)

In [6]:
test_ypred = np.load("y_pred.csv.gz.npy")

In [64]:
idxlist

Int64Index([    180,     181,     183,     184,     185,     186,     187,
                188,     189,     190,
            ...
             746325,  746326,  746327, 1062788, 1062790, 1062791, 1751877,
            1751879, 1751880, 1751881],
           dtype='int64', length=93309)

In [11]:
Z_df.drop(Z_df.tail(4).index,inplace=True)
len(Z_df)

93305

In [12]:
Z_df[len(Z_df.columns)] = ye
Z_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5659,5660,5661,5662,5663,5664,5665,5666,5667,5668
0,37.0,6.0,0.0,6.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.230999
1,37.0,6.0,0.0,6.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.519184
2,37.0,6.0,0.0,6.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.871435
3,37.0,6.0,0.0,6.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.848017
4,37.0,6.0,0.0,5.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.532343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93300,25.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.515060
93301,25.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.219722
93302,22.0,7.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.606981
93303,22.0,7.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.978816


In [None]:
from IPython.display import clear_output

y_pred = train_all
y_pred['pred'] = 0
for i in train_true.index:
    print(i)
    clear_output(wait=True)
    try:
        y_pred['pred'].iloc[i:i+1] = model.predict(train_all.loc[i:i+1])
    except:
        y_pred['pred'].iloc[i] = 0

In [13]:
foo2 = Z_df.to_numpy(dtype = object)
#foo2 = np.vstack(Z_df)
foo2

KeyboardInterrupt: 

In [14]:
# tag each index with equal probabilities to True & False
# note: p = m1 / ( m1 + m2 ) can make more sense depending
# on desired distribution
tag = np.random.binomial(n=1, p=.8, size=len(Z_df)) == 1

# assign True indices to idx1 and False indices to index 2
idx = np.array( range( len(Z_df) ) )
idx1, idx2 = idx[ tag ], idx[ np.logical_not( tag ) ]

# sample from idx1 and idx2
i1, i2 = np.random.choice( idx1, size=10000 ), np.random.choice( idx2, size=10000 )

In [21]:
df_data.head()

Unnamed: 0,IC50,Temp,pH,SMILES,Target Sequence,Organism,smiles2morgan,drug2emb_encoder,calcPubChemFingerAll,smiles2daylight,...,var_Sus scrofa,var_Torpedo marmorata,var_Trichomonas vaginalis G3,var_Trypanosoma brucei,var_Trypanosoma brucei brucei,var_Trypanosoma cruzi,var_Vibrio proteolyticus,var_Xenopus laevis,var_Yersinia pestis,var_[Bacteroides] pectinophilus ATCC 43243
180,8.5,37.0,6.0,CC(C)[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccccc1)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([2266, 117, 72, 124, 339, 295, 186, 277, 1880...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ...",...,0,0,0,0,0,0,0,0,0,0
181,177.0,37.0,6.0,CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)[...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([240, 339, 1416, 295, 186, 277, 1880, 1436, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ...",...,0,0,0,0,0,0,0,0,0,0
183,164.0,37.0,6.0,COCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([2227, 339, 1416, 295, 186, 277, 1880, 1436, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",...,0,0,0,0,0,0,0,0,0,0
184,67.0,37.0,6.0,COCCOCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccc...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 6.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([867, 486, 339, 1416, 295, 186, 277, 1880, 14...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",...,0,0,0,0,0,0,0,0,0,0
185,27.0,37.0,6.0,COC(=O)N[C@@H](C(C)C)C(=O)NN(C[C@H](O)[C@H](Cc...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 5.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, ...","([92, 2346, 199, 179, 1397, 206, 763, 124, 81,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...","[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",...,0,0,0,0,0,0,0,0,0,0


In [61]:
Z_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5659,5660,5661,5662,5663,5664,5665,5666,5667,5668
0,37.0,6.0,0.0,6.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.230999
1,37.0,6.0,0.0,6.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.519184
2,37.0,6.0,0.0,6.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.871435
3,37.0,6.0,0.0,6.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.848017
4,37.0,6.0,0.0,5.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.532343


In [62]:
ye

array([7.23099852, 7.51918411, 7.87143517, ..., 6.60698128, 6.97881603,
       7.66758728])

In [15]:
X_train = Z_df.iloc[i1].to_numpy(dtype = object)
X_test = Z_df.iloc[i2].to_numpy(dtype = object)

In [45]:
y_train.astype(np.float32)

array([ 718.  , 2290.  ,   16.  , ...,    2.78, 1691.  ,   47.  ],
      dtype=float32)

In [42]:
pic50_train = []
for i in range(len(y_train)):
    print(i)
    pic50_train[i] = -np.log10(y_train[i]*1e-9 + 1e-10)

0


TypeError: can't multiply sequence by non-int of type 'float'

In [47]:
pic50_train = -np.log10(y_train.astype(np.float32)*1e-9 + 1e-10)
pic50_test = -np.log10(y_test.astype(np.float32)*1e-9 + 1e-10)

In [18]:
y_train =df_want['IC50'].iloc[i1].to_numpy(dtype = object)
y_test = df_want['IC50'].iloc[i2].to_numpy(dtype = object)

In [None]:
from sklearn.model_selection import train_test_split
y = df_final["IC50"]
X_train, X_test, y_train, y_test = train_test_split(foo2, y, test_size=0.2)

In [19]:
test_list = pd.isnull(y_train)
res = [i for i, val in enumerate(test_list) if val] 
y_train = np.delete(y_train,res,0)
X_train = np.delete(X_train,res,0)

In [71]:
a_train = np.hstack((X_train, np.tile(X_train[:, [-1]], 5000)))
a_test = np.hstack((X_test, np.tile(X_test[:, [-1]], 5000)))

In [70]:
X_train.shape

(9997, 5669)

In [24]:
test_list = pd.isnull(y_test)
res = [i for i, val in enumerate(test_list) if val] 
y_test = np.delete(y_test,res,0)
X_test = np.delete(X_test,res,0)

In [34]:
y_train

array([' 718', ' 2290', ' 16', ..., ' 2.78', ' 1691', ' 47.0'],
      dtype=object)

In [72]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_scaled = scaler.fit_transform(a_train)
test_scaled = scaler.transform(a_test)

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
tree_model = DecisionTreeRegressor()

In [27]:
test_scaled

array([[-0.04306109, -4.10300125, -0.11596648, ..., -0.010002  ,
        -0.010002  ,  1.14333799],
       [-0.04306109,  0.14151057, -0.11596648, ..., -0.010002  ,
        -0.010002  ,  0.28769498],
       [-0.0958354 , -0.44393934, -0.11596648, ..., -0.010002  ,
        -0.010002  , -1.32471298],
       ...,
       [-0.04306109,  0.72696047, -0.11596648, ..., -0.010002  ,
        -0.010002  ,  0.23850083],
       [-0.04306109,  0.14151057, -0.11596648, ..., -0.010002  ,
        -0.010002  ,  0.42260208],
       [-0.04306109,  0.28787304, -0.11596648, ..., -0.010002  ,
        -0.010002  , -0.21357186]])

In [73]:
tree_model.fit(train_scaled, pic50_train)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import average_precision_score
tree_mse = mean_squared_error(pic50_train, tree_model.predict(train_scaled))
tree_mae = mean_absolute_error(pic50_train, tree_model.predict(train_scaled))
from math import sqrt
print("Decision Tree training mse = ",tree_mse," & mae = ",tree_mae," & rmse = ", sqrt(tree_mse))
tree_test_mse = mean_squared_error(pic50_test, tree_model.predict(test_scaled))
tree_test_mae = mean_absolute_error(pic50_test, tree_model.predict(test_scaled))
print("Decision Tree test mse = ",tree_test_mse," & mae = ",tree_test_mae," & rmse = ", sqrt(tree_test_mse))

Decision Tree training mse =  1.266659074371282e-05  & mae =  5.671934353133088e-05  & rmse =  0.0035590154177402517
Decision Tree test mse =  1.525582855264291  & mae =  0.8445709963798523  & rmse =  1.2351448721766571


In [74]:
a = pearsonr(pic50_train.astype(np.float32), tree_model.predict(train_scaled.astype(np.float32)))
b = pearsonr(pic50_test.astype(np.float32), tree_model.predict(test_scaled.astype(np.float32)))
print("Decision Tree train r = ",a)
print("Decision Tree test r = ",b)

Decision Tree train r =  (0.9999971336722445, 0.0)
Decision Tree test r =  (0.6501324410314997, 0.0)


In [29]:
for i in range(len(importances)-len(typelist)):
        typelist.append('organism')

NameError: name 'importances' is not defined

In [75]:
importances = pd.DataFrame({'importance':np.round(tree_model.feature_importances_,3)})
out = importances.sort_values('importance',ascending=False)
importances["Type"] = typelist
type_importance = importances.groupby(by=['Type']).sum()
type_importance.sort_values('importance',ascending=False)

In [28]:
X.head()

Unnamed: 0,Kd,Ki,EC50,Temp,pH,smiles2morgan,drug2emb_encoder,CalculateConjointTriad,protein2emb_encoder,var_Abelson murine leukemia virus,...,var_Influenza B virus,var_Influenza B virus (B/Memphis/3/93),var_Influenza B virus (B/Victoria/517/2005),var_Influenza B virus (strain B/Lee/1940),var_Influenza B virus (strain B/Memphis/3/1989),var_Klebsiella pneumoniae,var_Mus musculus,var_Oryctolagus cuniculus,var_Pseudomonas aeruginosa,var_Rattus norvegicus
0,872669.450093,0.24,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([515, 343, 982, 52, 93, 210, 614, 1244, 690, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,...,0,0,0,0,0,0,0,0,0,0
1,872669.450093,0.25,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([1138, 186, 144, 265, 199, 188, 381, 1734, 13...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,...,0,0,0,0,0,0,0,0,0,0
2,872669.450093,0.41,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([1138, 186, 144, 265, 261, 158, 322, 65, 188,...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,...,0,0,0,0,0,0,0,0,0,0
3,872669.450093,0.8,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([700, 409, 1769, 833, 144, 265, 261, 158, 258...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,...,0,0,0,0,0,0,0,0,0,0
4,872669.450093,0.99,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([700, 223, 1769, 833, 144, 265, 261, 158, 258...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
X_train

array([[37.0, 7.4, 0.0, ..., 0.0, 0.0, 6.013496398925781],
       [25.0, 5.5, 0.0, ..., 0.0, 0.0, 7.93550443649292],
       [25.0, 7.4, 0.0, ..., 0.0, 0.0, 3.347683906555176],
       ...,
       [22.0, 7.5, 0.0, ..., 0.0, 0.0, 7.682567596435547],
       [25.0, 7.2, 0.0, ..., 0.0, 0.0, 6.808877944946289],
       [25.0, 7.4, 0.0, ..., 0.0, 0.0, 6.159140586853027]], dtype=object)

In [54]:
pic50_train

array([6.143815 , 5.640146 , 7.7931743, ..., 8.540607 , 5.7718306,
       7.326979 ], dtype=float32)

In [65]:
importances

Unnamed: 0,importance
0,0.001
1,0.002
2,0.000
3,0.000
4,0.000
...,...
5664,0.000
5665,0.000
5666,0.000
5667,0.000


In [76]:
out.head(20)

Unnamed: 0,importance
1037,0.119
4485,0.04
1034,0.025
5505,0.019
5531,0.019
4513,0.017
4459,0.014
4524,0.013
4069,0.013
1744,0.013


In [45]:
data_top = out.head(20)  
typearray = np.array(typelist)
typearray[list(data_top.index.values)]

array(['protein2emb_encoder', 'smiles2daylight', 'smiles2morgan',
       'drug2emb_encoder', 'drug2emb_encoder', 'protein2emb_encoder',
       'organism', 'calcPubChemFingerAll', 'smiles2morgan',
       'smiles2daylight', 'smiles2daylight', 'smiles2daylight',
       'smiles2daylight', 'smiles2daylight', 'smiles2daylight',
       'smiles2daylight', 'smiles2daylight', 'smiles2daylight',
       'smiles2daylight', 'smiles2daylight'], dtype='<U22')

In [77]:
rf_model = RandomForestRegressor(n_estimators=10)
rf_model.fit(train_scaled, pic50_train)

RandomForestRegressor(n_estimators=10)

In [78]:
a = pearsonr(pic50_train, rf_model.predict(train_scaled))
b = pearsonr(pic50_test, rf_model.predict(test_scaled))
print("Random Forest train r = ",a)
print("Random Forest test r = ",b)

Random Forest train r =  (0.9717775404066947, 0.0)
Random Forest test r =  (0.7979784444661424, 0.0)


In [17]:
filename = 'rf_model.sav'
#pickle.dump(rf_model, open(filename, 'wb'))

In [None]:
def pred_ints(model, X, percentile=95):
    err_down = []
    err_up = []
    for x in range(len(X)):
        preds = []
        for pred in model.estimators_:
            preds.append(pred.predict(X[x])[0])
        err_down.append(np.percentile(preds, (100 - percentile) / 2. ))
        err_up.append(np.percentile(preds, 100 - (100 - percentile) / 2.))
    return err_down, err_up

err_down, err_up = pred_ints(rf, X_test, percentile=95)

In [None]:
from matplotlib import pyplot as plt

plt.scatter(mpg_y_test, mpg_y_hat)
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()

# Calculate the variance
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train,
                                            mpg_X_test)

# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()

In [None]:
inbag = forestci.calc_inbag(X_train.shape[0], forest)
y_err = forestci.random_forest_error(forest, inbag, X_train, X_test)

fig, axes = plt.subplots(figsize=(8,8))
axes.errorbar(forest.predict(X_test), y_test , yerr=np.sqrt(y_err), linestyle="",
              marker="o", markersize=5)

axes.plot([0,50], [0,50], color="r", linestyle="--")

In [None]:
param_grid = {
    "max_depth": np.arange(1, 15),
}

gs = model_selection.GridSearchCV(tree.DecisionTreeRegressor(), param_grid=param_grid, cv=5, n_jobs=-1)

gs.fit(X_train, pic50_train)

In [18]:
loaded_model = pickle.load(open(filename, 'rb'))

In [80]:
importances = pd.DataFrame({'importance':np.round(rf_model.feature_importances_,3)})
out = importances.sort_values('importance',ascending=False)
out.head(20)
#importances["Type"] = typelist
#type_importance = importances.groupby(by=['Type']).sum()
#type_importance.sort_values('importance',ascending=False)

Unnamed: 0,importance
1037,0.099
4485,0.042
4513,0.013
5581,0.012
1086,0.012
1088,0.011
4524,0.009
1034,0.008
5555,0.008
5531,0.007


In [84]:
importances = pd.DataFrame({'importance':np.round(rf_model.feature_importances_,3)})
out = importances.sort_values('importance',ascending=False)
out.head(20)

Unnamed: 0,importance
1037,0.099
4485,0.042
4513,0.013
5581,0.012
1086,0.012
1088,0.011
4524,0.009
1034,0.008
5555,0.008
5531,0.007


In [None]:
rf_mse = mean_squared_error(y_train, rf_model.predict(train_scaled))
rf_mae = mean_absolute_error(y_train, rf_model.predict(train_scaled))

print("Random Forest training mse = ",rf_mse," & mae = ",rf_mae," & rmse = ", sqrt(rf_mse))
rf_test_mse = mean_squared_error(y_test, rf_model.predict(test_scaled))
rf_test_mae = mean_absolute_error(y_test, rf_model.predict(test_scaled))
print("Random Forest test mse = ",rf_test_mse," & mae = ",rf_test_mae," & rmse = ", sqrt(rf_test_mse))

In [None]:
train

In [None]:
lr = config['LR']
decay = config['decay']
BATCH_SIZE = config['batch_size']
train_epoch = config['train_epoch']
loss_history = []
verbose = True

model = model.to(device)

# support multiple GPUs
if torch.cuda.device_count() > 1:
    if verbose:
        print("Let's use " + str(torch.cuda.device_count()) + " GPUs!")
    model = nn.DataParallel(model, dim = 0)
elif torch.cuda.device_count() == 1:
    if verbose:
        print("Let's use " + str(torch.cuda.device_count()) + " GPU!")
else:
    if verbose:
        print("Let's use CPU/s!")
# Future TODO: support multiple optimizers with parameters
opt = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = decay)
if verbose:
    print('--- Data Preparation ---')

params = {'batch_size': BATCH_SIZE,
        'shuffle': True,
        'num_workers': config['num_workers'],
        'drop_last': False}

params['collate_fn'] = DTI.mpnn_collate_func

In [None]:
class data_process_loader_o(data.Dataset):

    def __init__(self, list_IDs, df, **config):
        'Initialization'
        self.list_IDs = list_IDs
        self.df = df
        self.config = config

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

    def __getitem__(self, index):
        'Generates one sample of data'
        index = self.list_IDs[index]
        v = self.df.iloc[index]
#        v_d = self.df.iloc[index]['drug_encoding']        
#        v_d = drug_2_embed(v_d)
#        v_p = self.df.iloc[index]['target_encoding']
#        v_p = protein_2_embed(v_p)
        #y = self.labels[index]
        return v

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc_protein = OneHotEncoder().fit(np.array(amino_char).reshape(-1, 1))
enc_drug = OneHotEncoder().fit(np.array(smiles_char).reshape(-1, 1))

def protein_2_embed(x):
	return enc_protein.transform(np.array(x).reshape(-1,1)).toarray().T
def drug_2_embed(x):
	return enc_drug.transform(np.array(x).reshape(-1,1)).toarray().T    

In [None]:
from torch.utils.data.dataset import Dataset

class MyCustomDataset(Dataset):
    def __init__(self, ...):
        # stuff
        
    def __getitem__(self, index):
        # stuff
        return (img, label)

    def __len__(self):
        return count

In [None]:
training_generator = torch.utils.data.DataLoader(train, **params)
validation_generator = torch.utils.data.DataLoader(val, **params)