In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from DeepPurpose.pybiomed_helper import _GetPseudoAAC, CalculateAADipeptideComposition, \
calcPubChemFingerAll, CalculateConjointTriad, GetQuasiSequenceOrder
import torch
from torch.utils import data
from torch.autograd import Variable
try:
	from descriptastorus.descriptors import rdDescriptors, rdNormalizedDescriptors
except:
	raise ImportError("Please install pip install git+https://github.com/bp-kelley/descriptastorus.")
from DeepPurpose.chemutils import get_mol, atom_features, bond_features, MAX_NB, ATOM_FDIM, BOND_FDIM
from subword_nmt.apply_bpe import BPE
import codecs
import pickle
import wget
from zipfile import ZipFile 
import os
import sys

from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import SequentialSampler
from torch import nn 

from tqdm import tqdm
import matplotlib.pyplot as plt
from time import time
from sklearn.metrics import mean_squared_error, roc_auc_score, average_precision_score, f1_score, log_loss
from lifelines.utils import concordance_index
from scipy.stats import pearsonr
import pickle 
torch.manual_seed(2)
np.random.seed(3)
import copy
from prettytable import PrettyTable

import os

from DeepPurpose.utils import *
from DeepPurpose.model_helper import Encoder_MultipleLayers, Embeddings        
from DeepPurpose.encoders import *
from DeepPurpose import DTI

In [13]:
data_path = './data//BindingDB_All.tsv'
df = pd.read_csv(data_path, sep = '\t', error_bad_lines=False)
df = df[df['Number of Protein Chains in Target (>1 implies a multichain complex)'] == 1.0]
df = df[df['Ligand SMILES'].notnull()]

b'Skipping line 772572: expected 193 fields, saw 205\nSkipping line 772598: expected 193 fields, saw 205\n'
b'Skipping line 805291: expected 193 fields, saw 205\n'
b'Skipping line 827961: expected 193 fields, saw 265\n'
b'Skipping line 1231688: expected 193 fields, saw 241\n'
b'Skipping line 1345591: expected 193 fields, saw 241\nSkipping line 1345592: expected 193 fields, saw 241\nSkipping line 1345593: expected 193 fields, saw 241\nSkipping line 1345594: expected 193 fields, saw 241\nSkipping line 1345595: expected 193 fields, saw 241\nSkipping line 1345596: expected 193 fields, saw 241\nSkipping line 1345597: expected 193 fields, saw 241\nSkipping line 1345598: expected 193 fields, saw 241\nSkipping line 1345599: expected 193 fields, saw 241\n'
b'Skipping line 1358864: expected 193 fields, saw 205\n'
b'Skipping line 1378087: expected 193 fields, saw 241\nSkipping line 1378088: expected 193 fields, saw 241\nSkipping line 1378089: expected 193 fields, saw 241\nSkipping line 1378090: e

In [20]:
df = df[['BindingDB Reactant_set_id', 'Ligand InChI', 'Ligand SMILES',\
                  'PubChem CID', 'UniProt (SwissProt) Primary ID of Target Chain',\
                  'Target Source Organism According to Curator or DataSource',\
                  'BindingDB Target Chain  Sequence', 'Kd (nM)', 'IC50 (nM)', 'Ki (nM)',\
                  'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)','pH','Temp (C)']]
df.rename(columns={'BindingDB Reactant_set_id':'ID',
                        'Ligand SMILES':'SMILES',
                        'Ligand InChI':'InChI',
                        'PubChem CID':'PubChem_ID',
                        'UniProt (SwissProt) Primary ID of Target Chain':'UniProt_ID',
                        'BindingDB Target Chain  Sequence': 'Target Sequence',
                        'Target Source Organism According to Curator or DataSource': 'Organism',
                        'Kd (nM)':'Kd',
                        'IC50 (nM)':'IC50',
                        'Ki (nM)':'Ki',
                        'EC50 (nM)':'EC50',
                        'kon (M-1-s-1)':'kon',
                        'koff (s-1)':'koff',
                        'Temp (C)':'Temp'}, 
                        inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [21]:
df.head()

Unnamed: 0,ID,InChI,SMILES,PubChem_ID,UniProt_ID,Organism,Target Sequence,Kd,IC50,Ki,EC50,kon,koff,pH,Temp
0,1,InChI=1S/C22H24BrFN4O2/c1-28-7-5-14(6-8-28)12-...,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,3081361.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.24,,,,5.5,37.00 C
1,2,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,5327236.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.25,,,,5.5,37.00 C
2,3,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,5327235.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.41,,,,5.5,37.00 C
3,4,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,5327234.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.8,,,,5.5,37.00 C
4,5,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,3009319.0,,Human immunodeficiency virus 1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,,0.99,,,,5.5,37.00 C


In [38]:
df['Temp'] = df['Temp'].str.rstrip('C')
df.count()

ID                 1733850
InChI              1733282
SMILES             1733850
PubChem_ID         1718479
UniProt_ID         1538086
Organism           1238470
Target Sequence    1733850
Kd                   74761
IC50               1080811
Ki                  417859
EC50                164210
kon                    654
koff                   524
pH                  204919
Temp                191364
dtype: int64

In [36]:
df.to_pickle("./df.pkl")

In [35]:
df = pd.read_pickle("./df.pkl")

In [36]:
idx_str = ['Kd', 'IC50', 'Ki','EC50','Temp']
df_want = df
convert_to_log = 0

# have at least uniprot or pubchem ID
df_want = df_want[df_want.PubChem_ID.notnull() | df_want.UniProt_ID.notnull()]
df_want = df_want[df_want.InChI.notnull()]

for label in idx_str:
#    df_want = df_want[df_want[label].notnull()]
#    print(df_want.size)
    df_want[label] = df_want[label].str.replace('>', '')
    df_want[label] = df_want[label].str.replace('<', '')
    #df_want[label] = df_want[label].astype(float)
#    df_want = df_want[df_want[label] <= 10000000.0]

y = df_want[idx_str]
for label in idx_str:
    if convert_to_log:
            print('Default set to logspace (nM -> p) for easier regression')
            y[label] = convert_y_unit(df_want[label].values, 'nM', 'p') 
    else:
            y[label] = df_want[label].values

y['pH'] = df_want['pH']
X_drugs = df_want.SMILES.values
X_targets = df_want['Target Sequence'].values
y = y.apply(pd.to_numeric, errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [37]:
X_org = df_want['Organism']
y

Unnamed: 0,Kd,IC50,Ki,EC50,Temp,pH
0,,,0.24,,37.0,5.5
1,,,0.25,,37.0,5.5
2,,,0.41,,37.0,5.5
3,,,0.80,,37.0,5.5
4,,,0.99,,37.0,5.5
...,...,...,...,...,...,...
1813522,,,,152.0,,
1813523,,,,601.0,,
1813524,,,,12.0,,
1813525,,,,402.0,,


In [38]:
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer

imputer = SimpleImputer() #TODO: better imputer
y_i = imputer.fit_transform(y)

In [39]:
y_i = pd.DataFrame(data=y_i,columns=idx_str+['pH'])
y_i

Unnamed: 0,Kd,IC50,Ki,EC50,Temp,pH
0,872669.450093,1.264663e+08,2.400000e-01,362878.629538,37.000000,5.500000
1,872669.450093,1.264663e+08,2.500000e-01,362878.629538,37.000000,5.500000
2,872669.450093,1.264663e+08,4.100000e-01,362878.629538,37.000000,5.500000
3,872669.450093,1.264663e+08,8.000000e-01,362878.629538,37.000000,5.500000
4,872669.450093,1.264663e+08,9.900000e-01,362878.629538,37.000000,5.500000
...,...,...,...,...,...,...
1730861,872669.450093,1.264663e+08,1.941028e+06,152.000000,27.936324,7.242816
1730862,872669.450093,1.264663e+08,1.941028e+06,601.000000,27.936324,7.242816
1730863,872669.450093,1.264663e+08,1.941028e+06,12.000000,27.936324,7.242816
1730864,872669.450093,1.264663e+08,1.941028e+06,402.000000,27.936324,7.242816


In [40]:
df_data = y_i
df_data['SMILES'] = X_drugs
df_data['Target Sequence'] = X_targets
df_data['Organism'] = X_org

print('in total: ' + str(len(df_data)) + ' drug-target pairs')
df_data

in total: 1730866 drug-target pairs


Unnamed: 0,Kd,IC50,Ki,EC50,Temp,pH,SMILES,Target Sequence,Organism
0,872669.450093,1.264663e+08,2.400000e-01,362878.629538,37.000000,5.500000,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1
1,872669.450093,1.264663e+08,2.500000e-01,362878.629538,37.000000,5.500000,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1
2,872669.450093,1.264663e+08,4.100000e-01,362878.629538,37.000000,5.500000,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1
3,872669.450093,1.264663e+08,8.000000e-01,362878.629538,37.000000,5.500000,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1
4,872669.450093,1.264663e+08,9.900000e-01,362878.629538,37.000000,5.500000,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1
...,...,...,...,...,...,...,...,...,...
1730861,872669.450093,1.264663e+08,1.941028e+06,152.000000,27.936324,7.242816,Oc1ccc(Br)cc1Cn1c(nc2ccc(cc12)[N+]([O-])=O)-c1...,MWRCGGRQGLCVLRRLSGGHAHHRAWRWNSNRACERALQYKLGDKI...,Mus musculus
1730862,872669.450093,1.264663e+08,1.941028e+06,601.000000,27.936324,7.242816,Oc1ccc(Br)cc1CN1C(N(Cc2cc(Br)ccc2O)c2cc(ccc12)...,MWRCGGRQGLCVLRRLSGGHAHHRAWRWNSNRACERALQYKLGDKI...,Mus musculus
1730863,872669.450093,1.264663e+08,1.941028e+06,12.000000,27.936324,7.242816,Oc1ccc(Br)cc1Cn1c(nc2ccc(cc12)[N+]([O-])=O)-c1...,MWRCGGRQGLCVLRRLSGGHAHHRAWRWNSNRACERALQYKLGDKI...,Mus musculus
1730864,872669.450093,1.264663e+08,1.941028e+06,402.000000,27.936324,7.242816,Oc1ccc(Br)cc1CN1C(N(Cc2cc(Br)ccc2O)c2cc(ccc12)...,MWRCGGRQGLCVLRRLSGGHAHHRAWRWNSNRACERALQYKLGDKI...,Mus musculus


In [8]:
df_backup = df_data
df_data = df_data.head(10000)

In [9]:
df_backup.IC50

0          1.264663e+08
1          1.264663e+08
2          1.264663e+08
3          1.264663e+08
4          1.264663e+08
               ...     
1730861    1.264663e+08
1730862    1.264663e+08
1730863    1.264663e+08
1730864    1.264663e+08
1730865    1.264663e+08
Name: IC50, Length: 1730866, dtype: float64

In [42]:
import math
def magnitude(x):
    if x > 0:
        return int(math.floor(math.log10(x)))
    else:
        return 0

magic50 = df_data['IC50'].apply(magnitude)

In [43]:
df_foo = df_data
df_foo['magic50'] = magic50
chaindf = df_foo.groupby(by=['magic50']).count()
chaindf

Unnamed: 0_level_0,Kd,IC50,Ki,EC50,Temp,pH,SMILES,Target Sequence,Organism
magic50,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-8,1,1,1,1,1,1,1,1,1
-7,2,2,2,2,2,2,2,2,1
-6,26,26,26,26,26,26,26,26,18
-5,87,87,87,87,87,87,87,87,74
-4,66,66,66,66,66,66,66,66,57
-3,675,675,675,675,675,675,675,675,348
-2,3505,3505,3505,3505,3505,3505,3505,3505,2282
-1,30870,30870,30870,30870,30870,30870,30870,30870,19965
0,129086,129086,129086,129086,129086,129086,129086,129086,80546
1,219899,219899,219899,219899,219899,219899,219899,219899,139537


In [3]:
df_data = df_big

In [None]:
import time

drug_func_list= [smiles2morgan,trans_drug,drug2emb_encoder]
#drug_func_list= [calcPubChemFingerAll,smiles2mpnnfeature,smiles2daylight]
#TODO: add calcPubChemFingerAll back in when it's not broken
#TODO: smiles2rdkit2d takes forever and can be added later
#TODO: smiles2mpnnfeature doesn't take super long (around 40 min on desktop) but can be added later
#TODO: same wrt smiles2daylight
column_name = 'SMILES'
start = time.time()

for func in drug_func_list:
    save_column_name = func.__name__
    unique = pd.Series(df_data[column_name].unique()).apply(func)
    unique_dict = dict(zip(df_data[column_name].unique(), unique))
    df_data[save_column_name] = [unique_dict[i] for i in df_data[column_name]]
    end = time.time()
    print(end - start)

rdkit not found this smiles for morgan: CC(C)(C)OC(=O)N1CC(=O)N(C(=O)C1)c1ccc(cc1)N1CC(COC(=O)[N]2=CC=C(Cl)S2)OC1=O convert to all 0 features
rdkit not found this smiles for morgan: CSc1ccc(cc1)C1=C(C=C[N]([O-])=C1)[C@@H]1CCC(F)(F)C[C@H]1C(=O)NCC#N convert to all 0 features
rdkit not found this smiles for morgan: O=C1NC(=O)c2c1c1c3ccccc3n3[Ru](C#[O])[n+]4cccc2c4c13 convert to all 0 features
rdkit not found this smiles for morgan: CN1C(=O)c2c(C1=O)c1cc(F)c[n+]3[Ru](C#[O])n4c5ccc(O)cc5c2c4c13 convert to all 0 features
rdkit not found this smiles for morgan: CC(=O)NCCc1cc2ccccc2n1 convert to all 0 features
rdkit not found this smiles for morgan: [H][N]1(CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1ccccc1)C(N)=O)C(=O)CNC(=O)[C@@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@@H]1CCCN1)[C@@H](C)O)C(C)C convert to all 0 features
rdkit not found this smiles for morgan: NC(=N)c1ccc(CNC(=O)[C@H](CCC2CCNCC2)NC(=O)[C@@H](CCCC2=CC=[N](O)C=C2)NS(=O)(=O)Cc2ccccc2)cc1 convert to all 0 

rdkit not found this smiles for morgan: CC(C)n1c(Nc2ccccc2)nc2cnc(Nc3ccc(cc3)C(=O)N[N]3=CCN(C)CC3)nc12 convert to all 0 features
rdkit not found this smiles for morgan: CC(C)n1c(Nc2cccc(NC(C)=O)c2)nc2cnc(Nc3ccc(cc3F)C(=O)N[N]3=CCN(C)CC3)nc12 convert to all 0 features
rdkit not found this smiles for morgan: COc1ccc2C[C@@H]3[C@]45CC[C@](OC)([C@@H]6Oc1c2[C@]46CC[N]3(C)CC1CC1)[C@@H](COCc1ccc(Cl)c(Cl)c1)C5 convert to all 0 features
rdkit not found this smiles for morgan: COc1ccc2C[C@@H]3[C@]45CC[C@](OC)([C@@H]6Oc1c2[C@]46CC[N]3(C)CC1CC1)[C@@H](COCc1cc2OCOc2cc1Cl)C5 convert to all 0 features
rdkit not found this smiles for morgan: COc1ccc2C[C@@H]3[C@]45CC[C@](OC)([C@@H]6Oc1c2[C@]46CC[N]3(C)CC1CC1)[C@@H](COCc1ccc(C)cc1)C5 convert to all 0 features
rdkit not found this smiles for morgan: CO[C@]12CC[C@@]3(C[C@@H]1COCc1ccc(F)cc1)[C@H]1Cc4ccc(O)c5O[C@@H]2[C@]3(CC[N]1(C)CC1CC1)c45 convert to all 0 features
rdkit not found this smiles for morgan: CO[C@]12CC[C@@]3(C[C@@H]1COCc1cc4ccccc4s1)[C@H]1Cc4c

rdkit not found this smiles for morgan: Cc1cc2ccccc2c1-c1ccc(cc1)C(=O)NO convert to all 0 features
rdkit not found this smiles for morgan: Cc1cc(-c2ccc(cc2)C(=O)NO)c2ccccc12 convert to all 0 features
rdkit not found this smiles for morgan: ONC(=O)c1ccc(cc1)-c1cc(Cc2ccccc2)c2ccccc12 convert to all 0 features
rdkit not found this smiles for morgan: CN(C)CCc1cc(-c2ccc(cc2)C(=O)NO)c2ccccc12 convert to all 0 features
rdkit not found this smiles for morgan: CN1CCN(CCc2cc(-c3ccc(cc3)C(=O)NO)c3ccccc23)CC1 convert to all 0 features
rdkit not found this smiles for morgan: CC(C)(C)OC(=O)N(C1CC1)C1=NC(=C[N]2=C(\C=C3/NC(=O)NC3=O)C=NC12)c1cccc(OC(F)(F)F)c1 convert to all 0 features
rdkit not found this smiles for morgan: O=C1NC(=O)\C(N1)=C\C1=[N]2C=C(N=C(NC3CC3)C2N=C1)C#Cc1ccccc1 convert to all 0 features
rdkit not found this smiles for morgan: O=C1NC(=O)\C(N1)=C\C1=[N]2C=C(N=C(NC3CC3)C2N=C1)c1cccc(CN2CCOCC2)c1 convert to all 0 features
rdkit not found this smiles for morgan: FC(F)(F)Oc1cccc(c1)C1=C

rdkit not found this smiles for morgan: CC(C)c1nn(cc1CN1CCn2[nH]nnc2C1c1ccc(F)cc1)-c1ccccc1 convert to all 0 features
rdkit not found this smiles for morgan: Clc1cccc(Cl)c1NC(=O)N1CCN(CC1)c1ccc(cc1)-c1ncnn1 convert to all 0 features
rdkit not found this smiles for morgan: Cc1ccc(cc1)-c1nc([nH]o1)-c1ccc(cc1)N1CCN(CC1)C(=O)Nc1ccccc1Cl convert to all 0 features
rdkit not found this smiles for morgan: Cc1ccc(o1)-c1cnc(CCC2=NN3C(N2)C=CC=C3C)n1 convert to all 0 features
rdkit not found this smiles for morgan: CC1=CC=CC2NC(CCc3ncc(n3)-c3ccsc3)=NN12 convert to all 0 features
rdkit not found this smiles for morgan: CC1=CC=CC2NC(CCc3ncc(n3)-c3cccnc3)=NN12 convert to all 0 features
rdkit not found this smiles for morgan: Cc1cscc1-c1cnc(CCC2=NN3C(N2)C=CC=C3C)n1 convert to all 0 features
rdkit not found this smiles for morgan: CC1=CC=CN2N=C(CCc3ncc(n3)-c3cccnc3)NC12 convert to all 0 features
rdkit not found this smiles for morgan: COC1=CC=CN2N=C(CCc3ncc(n3)-c3cncs3)NC12 convert to all 0 features
rd

rdkit not found this smiles for morgan: Cc1n[nH]n(C)c1-c1cnc2c(c1)n([C@@H](C1CCOCC1)c1ccccc1F)c1c(F)c(ccc21)C(C)(O)C1CC1 convert to all 0 features
rdkit not found this smiles for morgan: Cc1n[nH]n(C)c1-c1cnc2c(c1)n(C(C1CCOCC1)c1ccccc1F)c1c(F)c(ccc21)C(C)(C)O convert to all 0 features
rdkit not found this smiles for morgan: Cc1n[nH]n(C)c1-c1cnc2c(c1)n([C@H](C1CCOCC1)c1cc(F)cc(F)c1)c1cc(ccc21)C(C)(C)O convert to all 0 features
rdkit not found this smiles for morgan: Cc1n[nH]n(C)c1-c1cnc2c(c1)n([C@@H](C1CCOCC1)c1cc(F)cc(F)c1)c1cc(ccc21)C(C)(C)O convert to all 0 features
rdkit not found this smiles for morgan: Cc1n[nH]n(C)c1-c1cnc2c(c1)n([C@@H](C1CCOCC1)c1cccc(F)c1)c1cc(ccc21)C(C)(C)O convert to all 0 features
rdkit not found this smiles for morgan: Cc1n[nH]n(C)c1-c1cnc2c(c1)n([C@H](C1CCOCC1)c1cccc(F)c1)c1cc(ccc21)C(C)(C)O convert to all 0 features
rdkit not found this smiles for morgan: Cc1n[nH]n(C)c1-c1cnc2c(c1)n([C@H](C1CCOCC1)c1c(F)cc(F)cc1F)c1cc(ccc21)C(C)(C)O convert to all 0 feature

rdkit not found this smiles for morgan: Cc1cc(N)nc(C)c1CNC(=O)c1ccc2[C@H]3O[C@@H](c2c1)c1cc(ccc31)C1=CC=CC=[N]1O convert to all 0 features
rdkit not found this smiles for morgan: Cc1cc(N)nc(C)c1CNC(=O)c1ccc2[C@H]3O[C@@H](c2c1)c1cc(ccc31)C1=[N](O)C=C(F)C=C1 convert to all 0 features
rdkit not found this smiles for morgan: CC(C)c1cc(N)nc(C)c1CNC(=O)c1ccc2[C@H]3O[C@@H](c2c1)c1cc(ccc31)C1=C(F)C=[N](O)C=C1 convert to all 0 features
rdkit not found this smiles for morgan: Cc1nc(N)cc(C2CC2)c1CNC(=O)c1ccc2[C@H]3O[C@@H](c2c1)c1cc(ccc31)C1=C(F)C=[N](O)C=C1 convert to all 0 features
rdkit not found this smiles for morgan: CC(C)c1cc(N)nc(C)c1CNC(=O)c1ccc2[C@H]3O[C@@H](c2c1)c1cc(ccc31)C1=C(F)C=CC=[N]1O convert to all 0 features
rdkit not found this smiles for morgan: CC(=O)N[C@@H](CC1=CNC=[N]1C)C(=O)Nc1cccc(n1)-c1ccc(Oc2ccc(F)cc2)cc1 convert to all 0 features
rdkit not found this smiles for morgan: CC(C)[N]1(O)CCC(CC1)c1ccc(Nc2nc(cnc2C(N)=O)N2CCC[C@H](C2)NC(=O)N(C)C)cc1 convert to all 0 features
rd

rdkit not found this smiles for morgan: COc1cc2ncnc(Oc3cccc(NC(=O)NC4=CC(=[N](N4)c4ccccc4)C(F)(F)F)c3)c2cc1OC convert to all 0 features
rdkit not found this smiles for morgan: COc1cc2ncnc(Sc3cccc(NC(=O)NC4=CC(=[N](N4)c4ccccc4)C(F)(F)F)c3)c2cc1OC convert to all 0 features
rdkit not found this smiles for morgan: COc1cc2ncnc(Sc3cccc(NC(=O)NC4=CC(=[N](C)N4)C(F)(F)F)c3)c2cc1OC convert to all 0 features
rdkit not found this smiles for morgan: CC(C)(O)c1cc(O[C@H]2CC[C@H](CC2)N2C[N](CC#N)(C2)n2cc(cn2)-c2ncnc3[nH]ccc23)nc(c1)C(F)(F)F convert to all 0 features
rdkit not found this smiles for morgan: COc1cc(OC)cc(c1)C(\O)=C1\CC=c2ncc(cc2=[N]1CCO)-c1cnn(C)c1 convert to all 0 features
rdkit not found this smiles for morgan: COC1CCN(CC1)c1cccc2C(N(CCc12)C(=O)C1=NC=C[N](=C1)c1cccc(Cl)c1F)C(=O)Nc1ccc(cc1)C(O)=O convert to all 0 features
rdkit not found this smiles for morgan: CC1=C(C(C)=[N](C)C=N1)c1ccc(Oc2nccc3N(CCc23)C2CCCCO2)cc1C convert to all 0 features
rdkit not found this smiles for morgan: CCO

rdkit not found this smiles for morgan: NC(CC#N)[N]1=NN=NC1 convert to all 0 features
rdkit not found this smiles for morgan: N[C@@H](CC#C)[N]1=NN=NC1 convert to all 0 features
rdkit not found this smiles for morgan: Nc1ccc(cn1)C1=[C](c2nn[nH]n2)=[C](=C(C=C1)S(=O)C1CCNCC1)S(N)(=O)=O convert to all 0 features
rdkit not found this smiles for morgan: C[N]1=CC(Nc2nccc(n2)-c2ccc3C(CCCCc3c2)NC(=O)c2nnc(s2)C(C)(C)C)=CN1 convert to all 0 features
rdkit not found this smiles for morgan: C[N]1(CCOc2ccc(CC(=O)NS(C)(=O)=O)cc2)CC(CN[C@@H]2C[C@H]2c2ccccc2)C1 convert to all 0 features
rdkit not found this smiles for morgan: C[N]1(CCOc2ccc(cc2)C(=O)NS(C)(=O)=O)CC(CN[C@@H]2C[C@H]2c2ccccc2)C1 convert to all 0 features
rdkit not found this smiles for morgan: CC(C)[C@@H](NC(=O)c1cc(ccc1F)C(F)(F)F)C(=O)N1CCC2(CC1)N(C(=O)N(C)C2=O)C1=CC=[N](O)C=C1 convert to all 0 features
rdkit not found this smiles for morgan: C[C@@H](C#[O])n1c2ccc(cc2c(=O)n(C)c1=O)S(=O)(=O)NC1(C)CC1 convert to all 0 features
rdkit not fou

In [None]:
prot_func_list = [CalculateConjointTriad, trans_protein, protein2emb_encoder]
#TODO: run CalculateAADipeptideComposition and _GetPseudoAAC when time permits
#TODO: GetQuasiSequenceOrder is broken
#prot_func_list = [GetQuasiSequenceOrder]
column_name = 'Target Sequence'
start = time.time()

for func in prot_func_list:
    save_column_name = func.__name__
    AA = pd.Series(df_data[column_name].unique()).apply(func)
    AA_dict = dict(zip(df_data[column_name].unique(), AA))
    df_data[save_column_name] = [AA_dict[i] for i in df_data[column_name]]
    end = time.time()
    print(end - start)

In [12]:
df_data.to_pickle("./df_data_small.pkl")

In [2]:
df_data = pd.read_pickle("./df_data_small.pkl")

In [3]:
df_data.head()

Unnamed: 0,Kd,IC50,Ki,EC50,Temp,pH,SMILES,Target Sequence,Organism,smiles2morgan,trans_drug,drug2emb_encoder,CalculateConjointTriad,trans_protein,protein2emb_encoder
0,872669.450093,126466300.0,0.24,362878.629538,37.0,5.5,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[C, O, c, 1, c, c, 2, c, (, N, c, 3, c, c, c, ...","([515, 343, 982, 52, 93, 210, 614, 1244, 690, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","[P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, I, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21..."
1,872669.450093,126466300.0,0.25,362878.629538,37.0,5.5,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[O, [, C, ?, ?, H, ], 1, [, C, ?, ?, H, ], (, ...","([1138, 186, 144, 265, 199, 188, 381, 1734, 13...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","[P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, I, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21..."
2,872669.450093,126466300.0,0.41,362878.629538,37.0,5.5,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[O, [, C, ?, ?, H, ], 1, [, C, ?, ?, H, ], (, ...","([1138, 186, 144, 265, 261, 158, 322, 65, 188,...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","[P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, I, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21..."
3,872669.450093,126466300.0,0.8,362878.629538,37.0,5.5,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[O, C, C, C, C, C, C, N, 1, [, C, ?, H, ], (, ...","([700, 409, 1769, 833, 144, 265, 261, 158, 258...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","[P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, I, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21..."
4,872669.450093,126466300.0,0.99,362878.629538,37.0,5.5,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[O, C, C, C, C, C, N, 1, [, C, ?, H, ], (, C, ...","([700, 223, 1769, 833, 144, 265, 261, 158, 258...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","[P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, I, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21..."


In [19]:
df_data.to_csv('df_data1.csv.gz'
         , sep='|'
         , header=True
         , index=False
         , chunksize=10000
         , compression='gzip'
         , encoding='utf-8')

In [2]:
df_big = pd.read_csv('df_data1.csv.gz'
         , sep='|'
         , compression='gzip'
         , encoding='utf-8')

In [7]:
df_big.head()

Unnamed: 0,Kd,IC50,Ki,EC50,Temp,SMILES,Target Sequence,smiles2morgan,smiles2daylight,trans_drug,drug2emb_encoder,CalculateConjointTriad,trans_protein,protein2emb_encoder
0,872669.450093,126466300.0,0.24,362878.629538,37.0,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,[0. 0. 0. ... 0. 0. 0.],[1. 0. 0. ... 0. 0. 1.],"['C', 'O', 'c', '1', 'c', 'c', '2', 'c', '(', ...","(array([ 515, 343, 982, 52, 93, 210, 6...",[0 3 0 1 0 1 0 3 2 1 0 0 1 0 1 1 0 0 0 0 0 0 2...,"['P', 'Q', 'I', 'T', 'L', 'W', 'Q', 'R', 'P', ...","(array([ 14, 212, 35, 2864, 47, 69, ..."
1,872669.450093,126466300.0,0.25,362878.629538,37.0,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,[0. 0. 0. ... 0. 0. 0.],[1. 0. 0. ... 0. 1. 1.],"['O', '[', 'C', '?', '?', 'H', ']', '1', '[', ...","(array([1138, 186, 144, 265, 199, 188, 3...",[0 3 0 1 0 1 0 3 2 1 0 0 1 0 1 1 0 0 0 0 0 0 2...,"['P', 'Q', 'I', 'T', 'L', 'W', 'Q', 'R', 'P', ...","(array([ 14, 212, 35, 2864, 47, 69, ..."
2,872669.450093,126466300.0,0.41,362878.629538,37.0,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,[0. 0. 0. ... 0. 0. 0.],[1. 0. 0. ... 0. 1. 1.],"['O', '[', 'C', '?', '?', 'H', ']', '1', '[', ...","(array([1138, 186, 144, 265, 261, 158, 3...",[0 3 0 1 0 1 0 3 2 1 0 0 1 0 1 1 0 0 0 0 0 0 2...,"['P', 'Q', 'I', 'T', 'L', 'W', 'Q', 'R', 'P', ...","(array([ 14, 212, 35, 2864, 47, 69, ..."
3,872669.450093,126466300.0,0.8,362878.629538,37.0,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 1. 1.],"['O', 'C', 'C', 'C', 'C', 'C', 'C', 'N', '1', ...","(array([ 700, 409, 1769, 833, 144, 265, 2...",[0 3 0 1 0 1 0 3 2 1 0 0 1 0 1 1 0 0 0 0 0 0 2...,"['P', 'Q', 'I', 'T', 'L', 'W', 'Q', 'R', 'P', ...","(array([ 14, 212, 35, 2864, 47, 69, ..."
4,872669.450093,126466300.0,0.99,362878.629538,37.0,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 1. 1.],"['O', 'C', 'C', 'C', 'C', 'C', 'N', '1', '[', ...","(array([ 700, 223, 1769, 833, 144, 265, 2...",[0 3 0 1 0 1 0 3 2 1 0 0 1 0 1 1 0 0 0 0 0 0 2...,"['P', 'Q', 'I', 'T', 'L', 'W', 'Q', 'R', 'P', ...","(array([ 14, 212, 35, 2864, 47, 69, ..."


In [4]:
#turn categorical variables into numerical dummy variables for modeling

cat_list = pd.get_dummies(df_data['Organism'], prefix='var')
df_data1=df_data.join(cat_list)

In [5]:
df_data1.head()

Unnamed: 0,Kd,IC50,Ki,EC50,Temp,pH,SMILES,Target Sequence,Organism,smiles2morgan,...,var_Influenza B virus,var_Influenza B virus (B/Memphis/3/93),var_Influenza B virus (B/Victoria/517/2005),var_Influenza B virus (strain B/Lee/1940),var_Influenza B virus (strain B/Memphis/3/1989),var_Klebsiella pneumoniae,var_Mus musculus,var_Oryctolagus cuniculus,var_Pseudomonas aeruginosa,var_Rattus norvegicus
0,872669.450093,126466300.0,0.24,362878.629538,37.0,5.5,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,0,0,0,0,0,0,0,0,0,0
1,872669.450093,126466300.0,0.25,362878.629538,37.0,5.5,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,0,0,0,0,0,0,0,0,0,0
2,872669.450093,126466300.0,0.41,362878.629538,37.0,5.5,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,0,0,0,0,0,0,0,0,0,0
3,872669.450093,126466300.0,0.8,362878.629538,37.0,5.5,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,0,0,0,0,0,0,0,0,0,0
4,872669.450093,126466300.0,0.99,362878.629538,37.0,5.5,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,Human immunodeficiency virus 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",...,0,0,0,0,0,0,0,0,0,0


In [6]:
discard=['SMILES','Target Sequence','Organism','trans_drug','trans_protein']
df_vars=df_data1.columns.values.tolist()
to_keep=[i for i in df_vars if i not in discard]
df_final=df_data1[to_keep]
df_final.head()

Unnamed: 0,Kd,IC50,Ki,EC50,Temp,pH,smiles2morgan,drug2emb_encoder,CalculateConjointTriad,protein2emb_encoder,...,var_Influenza B virus,var_Influenza B virus (B/Memphis/3/93),var_Influenza B virus (B/Victoria/517/2005),var_Influenza B virus (strain B/Lee/1940),var_Influenza B virus (strain B/Memphis/3/1989),var_Klebsiella pneumoniae,var_Mus musculus,var_Oryctolagus cuniculus,var_Pseudomonas aeruginosa,var_Rattus norvegicus
0,872669.450093,126466300.0,0.24,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([515, 343, 982, 52, 93, 210, 614, 1244, 690, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",...,0,0,0,0,0,0,0,0,0,0
1,872669.450093,126466300.0,0.25,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([1138, 186, 144, 265, 199, 188, 381, 1734, 13...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",...,0,0,0,0,0,0,0,0,0,0
2,872669.450093,126466300.0,0.41,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([1138, 186, 144, 265, 261, 158, 322, 65, 188,...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",...,0,0,0,0,0,0,0,0,0,0
3,872669.450093,126466300.0,0.8,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([700, 409, 1769, 833, 144, 265, 261, 158, 258...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",...,0,0,0,0,0,0,0,0,0,0
4,872669.450093,126466300.0,0.99,362878.629538,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([700, 223, 1769, 833, 144, 265, 261, 158, 258...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",...,0,0,0,0,0,0,0,0,0,0


In [7]:
len(df_final.iloc[12,9][1])

545

In [8]:
newV = df_final.iloc[1,:].values.flatten()

In [9]:
import collections


def flattener(x):
    if isinstance(x, collections.Iterable):
        return [a for i in x for a in flattener(i)]
    else:
        return [x]
            
h = flattener(df_final.iloc[1,:].values)
h

  """


[872669.450092565,
 126466272.14992414,
 0.25,
 362878.62953817454,
 37.0,
 5.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 4.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 6.0,
 2.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 4.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 2.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 

In [10]:
backup2 = df_data
df_data = df_final

In [26]:
X = df_final.drop(["IC50","Ki","Kd","EC50"], axis=1)
vec = X.apply(flattener,axis=1)

In [27]:
foo = vec.to_numpy(dtype = object)
foo2 = np.vstack(foo)

In [28]:
from sklearn.model_selection import train_test_split
y = df_final["IC50"]
X_train, X_test, y_train, y_test = train_test_split(foo2, y, test_size=0.2, random_state=123)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
tree_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()

In [29]:
tree_model.fit(train_scaled, y_train)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import average_precision_score
tree_mse = mean_squared_error(y_train, tree_model.predict(train_scaled))
tree_mae = mean_absolute_error(y_train, tree_model.predict(train_scaled))
from math import sqrt
print("Decision Tree training mse = ",tree_mse," & mae = ",tree_mae," & rmse = ", sqrt(tree_mse))
tree_test_mse = mean_squared_error(y_test, tree_model.predict(test_scaled))
tree_test_mae = mean_absolute_error(y_test, tree_model.predict(test_scaled))
print("Decision Tree test mse = ",tree_test_mse," & mae = ",tree_test_mae," & rmse = ", sqrt(tree_test_mse))
a = pearsonr(y_train, tree_model.predict(train_scaled))
b = pearsonr(y_test, tree_model.predict(test_scaled))
print("Decision Tree train r = ",a)
print("Decision Tree test r = ",b)

Decision Tree training mse =  999608099129.5325  & mae =  15854.621414610214  & rmse =  999804.0303627169
Decision Tree test mse =  304075586552767.8  & mae =  2440120.158040598  & rmse =  17437763.232501116
Decision Tree train r =  (0.9997760107027993, 0.0)
Decision Tree test r =  (0.9382436638050471, 0.0)


In [30]:
importances = pd.DataFrame({'importance':np.round(tree_model.feature_importances_,3)})
out = importances.sort_values('importance',ascending=False)

In [31]:
X.head()

Unnamed: 0,Temp,pH,smiles2morgan,drug2emb_encoder,CalculateConjointTriad,protein2emb_encoder,var_Abelson murine leukemia virus,var_Avian sarcoma virus,var_Bos taurus,var_Gallus gallus,...,var_Influenza B virus,var_Influenza B virus (B/Memphis/3/93),var_Influenza B virus (B/Victoria/517/2005),var_Influenza B virus (strain B/Lee/1940),var_Influenza B virus (strain B/Memphis/3/1989),var_Klebsiella pneumoniae,var_Mus musculus,var_Oryctolagus cuniculus,var_Pseudomonas aeruginosa,var_Rattus norvegicus
0,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([515, 343, 982, 52, 93, 210, 614, 1244, 690, ...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([1138, 186, 144, 265, 199, 188, 381, 1734, 13...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([1138, 186, 144, 265, 261, 158, 322, 65, 188,...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([700, 409, 1769, 833, 144, 265, 261, 158, 258...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,37.0,5.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","([700, 223, 1769, 833, 144, 265, 261, 158, 258...","[0, 3, 0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 1, 0, 1, ...","([14, 212, 35, 2864, 47, 69, 86, 497, 3636, 21...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
importances

Unnamed: 0,importance
0,0.004
1,0.038
2,0.000
3,0.000
4,0.000
...,...
2577,0.000
2578,0.000
2579,0.000
2580,0.000


In [34]:
out.head(20)

Unnamed: 0,importance
1477,0.273
1550,0.121
394,0.1
1639,0.062
38,0.043
1,0.038
185,0.032
1292,0.024
17,0.017
709,0.014


In [87]:
rf_model.fit(train_scaled, y_train)

KeyboardInterrupt: 

In [None]:
rf_mse = mean_squared_error(y_train, rf_model.predict(train_scaled))
rf_mae = mean_absolute_error(y_train, rf_model.predict(train_scaled))

print("Random Forest training mse = ",rf_mse," & mae = ",rf_mae," & rmse = ", sqrt(rf_mse))
rf_test_mse = mean_squared_error(y_test, rf_model.predict(test_scaled))
rf_test_mae = mean_absolute_error(y_test, rf_model.predict(test_scaled))
print("Random Forest test mse = ",rf_test_mse," & mae = ",rf_test_mae," & rmse = ", sqrt(rf_test_mse))

In [26]:
# dti split

split_method = 'cold_drug'
random_seed = 1
frac = [0.7, 0.2, 0.1]

print('splitting dataset...')

#TODO: what is HTS

if split_method == 'random': 
    train, val, test = create_fold(df_data, random_seed, frac)
elif split_method == 'cold_drug':
    train, val, test = create_fold_setting_cold_drug(df_data, random_seed, frac)
elif split_method == 'HTS':
    train, val, test = create_fold_setting_cold_drug(df_data, random_seed, frac)
    val = pd.concat([val[val.Label == 1].drop_duplicates(subset = 'SMILES'), val[val.Label == 0]])
    test = pd.concat([test[test.Label == 1].drop_duplicates(subset = 'SMILES'), test[test.Label == 0]])        
elif split_method == 'cold_protein':
    train, val, test = create_fold_setting_cold_protein(df_data, random_seed, frac)
elif split_method == 'repurposing_VS':
    train = df_data
    val = df_data
    test = df_data
elif split_method == 'no_split':
    print('do not do train/test split on the data for already splitted data')
else:
    raise AttributeError("Please select one of the three split method: random, cold_drug, cold_target!")
    
print('Done.')

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)
    

splitting dataset...


KeyError: 'SMILES'

In [None]:
train

In [None]:

result_folder = "./result/"
input_dim_drug = 1024
input_dim_protein = 8420
hidden_dim_drug = 256
hidden_dim_protein = 256
cls_hidden_dims = [1024, 1024, 512]
mlp_hidden_dims_drug = [1024, 256, 64]
mlp_hidden_dims_target = [1024, 256, 64]
batch_size = 256
train_epoch = 10
test_every_X_epoch = 20
LR = 1e-4
decay = 0
transformer_emb_size_drug = 128
transformer_intermediate_size_drug = 512
transformer_num_attention_heads_drug = 8
transformer_n_layer_drug = 8
transformer_emb_size_target = 64
transformer_intermediate_size_target = 256
transformer_num_attention_heads_target = 4
transformer_n_layer_target = 2
transformer_dropout_rate = 0.1
transformer_attention_probs_dropout = 0.1
transformer_hidden_dropout_rate = 0.1
mpnn_hidden_size = 50
mpnn_depth = 3
cnn_drug_filters = [32,64,96]
cnn_drug_kernels = [4,6,8]
cnn_target_filters = [32,64,96]
cnn_target_kernels = [4,8,12]
rnn_Use_GRU_LSTM_drug = 'GRU'
rnn_drug_hid_dim = 64
rnn_drug_n_layers = 2
rnn_drug_bidirectional = True
rnn_Use_GRU_LSTM_target = 'GRU'
rnn_target_hid_dim = 64
rnn_target_n_layers = 2
rnn_target_bidirectional = True
num_workers = 0 

base_config = {'input_dim_drug': input_dim_drug,
                'input_dim_protein': input_dim_protein,
                'hidden_dim_drug': hidden_dim_drug, # hidden dim of drug
                'hidden_dim_protein': hidden_dim_protein, # hidden dim of protein
                'cls_hidden_dims' : cls_hidden_dims, # decoder classifier dim 1
                'batch_size': batch_size,
                'train_epoch': train_epoch,
                'test_every_X_epoch': test_every_X_epoch, 
                'LR': LR,
                'result_folder': result_folder,
                'binary': False,
                'num_workers' : num_workers,
                'result_folder' : "./result/",
                'input_dim_drug' : 1024,
                'input_dim_protein': 8420,
                'hidden_dim_drug': 256,
                'hidden_dim_protein': 256,
                'cls_hidden_dims': [1024, 1024, 512],
                'mlp_hidden_dims_drug': [1024, 256, 64],
                'mlp_hidden_dims_target': [1024, 256, 64],
                'batch_size': 256,
                'train_epoch': 10,
                'test_every_X_epoch': 20,
                'LR': 1e-4,
                'decay': 0,
                'transformer_emb_size_drug': 128,
                'transformer_intermediate_size_drug': 512,
                'transformer_num_attention_heads_drug': 8,
                'transformer_n_layer_drug': 8,
                'transformer_emb_size_target': 64,
                'transformer_intermediate_size_target': 256,
                'transformer_num_attention_heads_target': 4,
                'transformer_n_layer_target': 2,
                'transformer_dropout_rate': 0.1,
                'transformer_attention_probs_dropout': 0.1,
                'transformer_hidden_dropout_rate': 0.1,
                'mpnn_hidden_size': 50,
                'mpnn_depth': 3,
                'cnn_drug_filters': [32,64,96],
                'cnn_drug_kernels': [4,6,8],
                'cnn_target_filters': [32,64,96],
                'cnn_target_kernels': [4,8,12],
                'rnn_Use_GRU_LSTM_drug': 'GRU',
                'rnn_drug_hid_dim': 64,
                'rnn_drug_n_layers': 2,
                'rnn_drug_bidirectional' : True,
                'rnn_Use_GRU_LSTM_target' : 'GRU',
                'rnn_target_hid_dim' : 64,
                'rnn_target_n_layers' : 2,
                'rnn_target_bidirectional' : True,
                'num_workers' : 0 
}
base_config['result_folder']

In [None]:
if not os.path.exists(base_config['result_folder']):
    os.makedirs(base_config['result_folder'])

base_config['mlp_hidden_dims_drug'] = mlp_hidden_dims_drug # MLP classifier dim 1				
base_config['input_dim_drug'] = 881 #could be 2048 or 200 or 2586
base_config['cnn_drug_filters'] = cnn_drug_filters
base_config['cnn_drug_kernels'] = cnn_drug_kernels
base_config['rnn_Use_GRU_LSTM_drug'] = rnn_Use_GRU_LSTM_drug
base_config['rnn_drug_hid_dim'] = rnn_drug_hid_dim
base_config['rnn_drug_n_layers'] = rnn_drug_n_layers
base_config['rnn_drug_bidirectional'] = rnn_drug_bidirectional 
base_config['transformer_emb_size_drug'] = transformer_emb_size_drug
base_config['transformer_num_attention_heads_drug'] = transformer_num_attention_heads_drug
base_config['transformer_intermediate_size_drug'] = transformer_intermediate_size_drug
base_config['transformer_n_layer_drug'] = transformer_n_layer_drug
base_config['transformer_dropout_rate'] = transformer_dropout_rate
base_config['transformer_attention_probs_dropout'] = transformer_attention_probs_dropout
base_config['transformer_hidden_dropout_rate'] = transformer_hidden_dropout_rate
base_config['hidden_dim_drug'] = transformer_emb_size_drug #could also be hidden_dim_drug
base_config['batch_size'] = batch_size 
base_config['mpnn_hidden_size'] = mpnn_hidden_size
base_config['mpnn_depth'] = mpnn_depth

base_config['mlp_hidden_dims_target'] = mlp_hidden_dims_target # MLP classifier dim 1				
base_config['input_dim_protein'] = 30 #could be 343 or 100 or 4114
base_config['cnn_target_filters'] = cnn_target_filters
base_config['cnn_target_kernels'] = cnn_target_kernels
base_config['rnn_Use_GRU_LSTM_target'] = rnn_Use_GRU_LSTM_target
base_config['rnn_target_hid_dim'] = rnn_target_hid_dim
base_config['rnn_target_n_layers'] = rnn_target_n_layers
base_config['rnn_target_bidirectional'] = rnn_target_bidirectional 
base_config['cnn_target_filters'] = cnn_target_filters
base_config['cnn_target_kernels'] = cnn_target_kernels
base_config['transformer_emb_size_target'] = transformer_emb_size_target
base_config['transformer_num_attention_heads_target'] = transformer_num_attention_heads_target
base_config['transformer_intermediate_size_target'] = transformer_intermediate_size_target
base_config['transformer_n_layer_target'] = transformer_n_layer_target	
base_config['transformer_dropout_rate'] = transformer_dropout_rate
base_config['transformer_attention_probs_dropout'] = transformer_attention_probs_dropout
base_config['transformer_hidden_dropout_rate'] = transformer_hidden_dropout_rate
base_config['hidden_dim_protein'] = transformer_emb_size_target

config = base_config

In [None]:
from collections import namedtuple
model_drug_tuple = namedtuple("model_drug_tuple", "MLP CNN CNN_RNN transformer MPNN")
model_protein_tuple = namedtuple("model_protein_tuple", "MLP CNN CNN_RNN transformer")

model_drug_MLP = MLP(config['input_dim_drug'], config['hidden_dim_drug'], config['mlp_hidden_dims_drug'])
model_drug_CNN = CNN('drug', **config)
model_drug_CNN_RNN = CNN_RNN('drug', **config)
model_drug_transformer = transformer('drug', **config)
model_drug_MPNN = MPNN(config['hidden_dim_drug'], config['mpnn_depth'])

model_drug = model_drug_tuple(model_drug_MLP, model_drug_CNN, model_drug_CNN_RNN, model_drug_transformer, model_drug_MPNN)

model_protein_MLP = MLP(config['input_dim_protein'], config['hidden_dim_protein'], config['mlp_hidden_dims_target'])
model_protein_CNN = CNN('protein', **config)
model_protein_CNN_RNN = CNN_RNN('protein', **config)
model_protein_transformer = transformer('protein', **config)

model_protein = model_protein_tuple(model_protein_MLP, model_protein_CNN, model_protein_CNN_RNN, model_protein_transformer)

model_feature_tuple = namedtuple("model_feature_tuple","model_drug model_protein model_df")
model_features = model_feature_tuple(model_drug, model_protein, train)

In [None]:
class Classifier_o(nn.Sequential):
    def __init__(self, model_struct, **config):
        super(Classifier_o, self).__init__()
        self.input_dim_drug = config['hidden_dim_drug']
        self.input_dim_protein = config['hidden_dim_protein']

        self.model_struct = model_struct

        self.dropout = nn.Dropout(0.1)

        self.hidden_dims = config['cls_hidden_dims']
        layer_size = len(self.hidden_dims) + 1
        dims = [self.input_dim_drug + self.input_dim_protein] + self.hidden_dims + [1]

        self.predictor = nn.ModuleList([nn.Linear(dims[i], dims[i+1]) for i in range(layer_size)])

    def forward(self, v_D, v_P):
        # each encoding
        v_D = self.model_struct.model_drug(v_D)
        v_P = self.model_struct.model_protein(v_P)
        # concatenate and classify
        v_f = torch.cat((v_D, v_P), 1)
        for i, l in enumerate(self.predictor):
            if i==(len(self.predictor)-1):
                v_f = l(v_f)
            else:
                v_f = F.relu(self.dropout(l(v_f)))
        return v_f

In [None]:
model = Classifier_o(model_features, **config)

In [None]:
input_dim_drug = config['hidden_dim_drug']
input_dim_protein = config['hidden_dim_protein']
model_drug = model_drug
model_protein = model_protein
dropout = nn.Dropout(0.1)
hidden_dims = config['cls_hidden_dims']
layer_size = len(hidden_dims) + 1
dims = [input_dim_drug + input_dim_protein] + hidden_dims + [1]
predictor = nn.ModuleList([nn.Linear(dims[i], dims[i+1]) for i in range(layer_size)])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
result_folder = config['result_folder']
       
binary = False

In [None]:
lr = config['LR']
decay = config['decay']
BATCH_SIZE = config['batch_size']
train_epoch = config['train_epoch']
loss_history = []
verbose = True

model = model.to(device)

# support multiple GPUs
if torch.cuda.device_count() > 1:
    if verbose:
        print("Let's use " + str(torch.cuda.device_count()) + " GPUs!")
    model = nn.DataParallel(model, dim = 0)
elif torch.cuda.device_count() == 1:
    if verbose:
        print("Let's use " + str(torch.cuda.device_count()) + " GPU!")
else:
    if verbose:
        print("Let's use CPU/s!")
# Future TODO: support multiple optimizers with parameters
opt = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = decay)
if verbose:
    print('--- Data Preparation ---')

params = {'batch_size': BATCH_SIZE,
        'shuffle': True,
        'num_workers': config['num_workers'],
        'drop_last': False}

params['collate_fn'] = DTI.mpnn_collate_func

In [None]:
class data_process_loader_o(data.Dataset):

    def __init__(self, list_IDs, df, **config):
        'Initialization'
        self.list_IDs = list_IDs
        self.df = df
        self.config = config

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

    def __getitem__(self, index):
        'Generates one sample of data'
        index = self.list_IDs[index]
        v = self.df.iloc[index]
#        v_d = self.df.iloc[index]['drug_encoding']        
#        v_d = drug_2_embed(v_d)
#        v_p = self.df.iloc[index]['target_encoding']
#        v_p = protein_2_embed(v_p)
        #y = self.labels[index]
        return v

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc_protein = OneHotEncoder().fit(np.array(amino_char).reshape(-1, 1))
enc_drug = OneHotEncoder().fit(np.array(smiles_char).reshape(-1, 1))

def protein_2_embed(x):
	return enc_protein.transform(np.array(x).reshape(-1,1)).toarray().T
def drug_2_embed(x):
	return enc_drug.transform(np.array(x).reshape(-1,1)).toarray().T    

In [None]:
from torch.utils.data.dataset import Dataset

class MyCustomDataset(Dataset):
    def __init__(self, ...):
        # stuff
        
    def __getitem__(self, index):
        # stuff
        return (img, label)

    def __len__(self):
        return count

In [None]:
training_generator = torch.utils.data.DataLoader(train, **params)
validation_generator = torch.utils.data.DataLoader(val, **params)

In [None]:
if test is not None:
    info = data_process_loader_o(test.index.values,  test, **config)
    params_test = {'batch_size': BATCH_SIZE,
            'shuffle': False,
            'num_workers': config['num_workers'],
            'drop_last': False,
            'sampler':SequentialSampler(info)}
    params_test['collate_fn'] = DTI.mpnn_collate_func
    testing_generator = data.DataLoader(data_process_loader_o(test.index.values, test, **config), **params_test)

# early stopping
if binary:
    max_auc = 0
else:
    max_MSE = 10000
model_max = copy.deepcopy(model)

valid_metric_record = []
valid_metric_header = ["# epoch"] 
if binary:
    valid_metric_header.extend(["AUROC", "AUPRC", "F1"])
else:
    valid_metric_header.extend(["MSE", "Pearson Correlation", "with p-value", "Concordance Index"])
table = PrettyTable(valid_metric_header)
float2str = lambda x:'%0.4f'%x
if verbose:
    print('--- Go for Training ---')
t_start = time() 
for epo in range(train_epoch):
    for v in enumerate(training_generator):
        if self.target_encoding == 'Transformer':
            v_p = v_p
        else:
            v_p = v_p.float().to(self.device) 
        if self.drug_encoding == "MPNN" or self.drug_encoding == 'Transformer':
            v_d = v_d
        else:
            v_d = v_d.float().to(self.device)                
            #score = self.model(v_d, v_p.float().to(self.device))

        score = self.model(v_d, v_p)
        label = Variable(torch.from_numpy(np.array(label)).float()).to(self.device)

        if self.binary:
            loss_fct = torch.nn.BCELoss()
            m = torch.nn.Sigmoid()
            n = torch.squeeze(m(score), 1)
            loss = loss_fct(n, label)
        else:
            loss_fct = torch.nn.MSELoss()
            n = torch.squeeze(score, 1)
            loss = loss_fct(n, label)
        loss_history.append(loss.item())

        opt.zero_grad()
        loss.backward()
        opt.step()

        if verbose:
            if (i % 100 == 0):
                t_now = time()
                print('Training at Epoch ' + str(epo + 1) + ' iteration ' + str(i) + \
                    ' with loss ' + str(loss.cpu().detach().numpy())[:7] +\
                    ". Total time " + str(int(t_now - t_start)/3600)[:7] + " hours") 
                ### record total run time

    ##### validate, select the best model up to now 
    with torch.set_grad_enabled(False):
        if self.binary:  
            ## binary: ROC-AUC, PR-AUC, F1, cross-entropy loss
            auc, auprc, f1, loss, logits = self.test_(validation_generator, self.model)
            lst = ["epoch " + str(epo)] + list(map(float2str,[auc, auprc, f1]))
            valid_metric_record.append(lst)
            if auc > max_auc:
                model_max = copy.deepcopy(self.model)
                max_auc = auc   
            if verbose:
                print('Validation at Epoch '+ str(epo + 1) + ' , AUROC: ' + str(auc)[:7] + \
                  ' , AUPRC: ' + str(auprc)[:7] + ' , F1: '+str(f1)[:7] + ' , Cross-entropy Loss: ' + \
                  str(loss)[:7])
        else:  
            ### regression: MSE, Pearson Correlation, with p-value, Concordance Index  
            mse, r2, p_val, CI, logits = self.test_(validation_generator, self.model)
            lst = ["epoch " + str(epo)] + list(map(float2str,[mse, r2, p_val, CI]))
            valid_metric_record.append(lst)
            if mse < max_MSE:
                model_max = copy.deepcopy(self.model)
                max_MSE = mse
            if verbose:
                print('Validation at Epoch '+ str(epo + 1) + ' , MSE: ' + str(mse)[:7] + ' , Pearson Correlation: '\
                 + str(r2)[:7] + ' with p-value: ' + str(p_val)[:7] +' , Concordance Index: '+str(CI)[:7])
    table.add_row(lst)


# load early stopped model
self.model = model_max

#### after training 
prettytable_file = os.path.join(self.result_folder, "valid_markdowntable.txt")
with open(prettytable_file, 'w') as fp:
    fp.write(table.get_string())

if test is not None:
    if verbose:
        print('--- Go for Testing ---')
    if self.binary:
        auc, auprc, f1, loss, logits = self.test_(testing_generator, model_max, test = True)
        test_table = PrettyTable(["AUROC", "AUPRC", "F1"])
        test_table.add_row(list(map(float2str, [auc, auprc, f1])))
        if verbose:
            print('Validation at Epoch '+ str(epo + 1) + ' , AUROC: ' + str(auc)[:7] + \
              ' , AUPRC: ' + str(auprc)[:7] + ' , F1: '+str(f1)[:7] + ' , Cross-entropy Loss: ' + \
              str(loss)[:7])				
    else:
        mse, r2, p_val, CI, logits = self.test_(testing_generator, model_max)
        test_table = PrettyTable(["MSE", "Pearson Correlation", "with p-value", "Concordance Index"])
        test_table.add_row(list(map(float2str, [mse, r2, p_val, CI])))
        if verbose:
            print('Testing MSE: ' + str(mse) + ' , Pearson Correlation: ' + str(r2) 
              + ' with p-value: ' + str(p_val) +' , Concordance Index: '+str(CI))
    np.save(os.path.join(self.result_folder, str(self.drug_encoding) + '_' + str(self.target_encoding) 
             + '_logits.npy'), np.array(logits))                

    ######### learning record ###########

    ### 1. test results
    prettytable_file = os.path.join(self.result_folder, "test_markdowntable.txt")
    with open(prettytable_file, 'w') as fp:
        fp.write(test_table.get_string())

### 2. learning curve 
fontsize = 16
iter_num = list(range(1,len(loss_history)+1))
plt.figure(3)
plt.plot(iter_num, loss_history, "bo-")
plt.xlabel("iteration", fontsize = fontsize)
plt.ylabel("loss value", fontsize = fontsize)
pkl_file = os.path.join(self.result_folder, "loss_curve_iter.pkl")
with open(pkl_file, 'wb') as pck:
    pickle.dump(loss_history, pck)

fig_file = os.path.join(self.result_folder, "loss_curve.png")
plt.savefig(fig_file)
if verbose:
    print('--- Training Finished ---')