In [2]:
import pandas as pd
import tensorflow as tf
from keras import optimizers
from keras.layers import Input, Dense, Dropout, Lambda, Concatenate
from keras.models import Model, Sequential
from keras.utils import multi_gpu_model
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras import backend as K
import numpy as np
from sklearn.linear_model import LinearRegression
preprocessed_info_df=pd.read_csv('preprocessed_info.csv')
preprocessed_info_df=preprocessed_info_df.drop(['Unnamed: 0_x'], axis=1)
preprocessed_info_df=preprocessed_info_df.drop(['Unnamed: 0_y'], axis=1)
preprocessed_info_df=preprocessed_info_df.drop(['Unnamed: 0'], axis=1)
preprocessed_info_df=preprocessed_info_df.drop_duplicates(subset=['cell_line','smiles'])
preprocessed_info_df=preprocessed_info_df.reset_index(drop=True)
drug_set1=preprocessed_info_df['smiles'].value_counts()[np.where(preprocessed_info_df['smiles'].value_counts()>=50)[0]].index
train_dat=preprocessed_info_df.groupby('smiles').sample(frac=0.8)
test_dat=preprocessed_info_df.drop(train_dat.index)
val_dat=test_dat.groupby('smiles').sample(frac=0.5)

In [6]:
preprocessed_info_df['smiles'].value_counts()

COc1cc2c(Oc3ccc(NC(=O)C4(C(=O)Nc5ccc(F)cc5)CC4)cc3F)ccnc2cc1OCCCN1CCOCC1    643
C1CN(CCN1)c1ccc(cc1)-c1cnc2c(cnn2c1)-c1ccnc2ccccc12                         636
O=C(CCCCCCC(=O)Nc1ccccc1)NO                                                 604
O=C(/C=C/c1ccc(CN(CCO)CCc2c[nH]c3ccccc23)cc1)NO                             590
Cc1[nH]c2ccccc2c1CCNCc1ccc(/C=C/C(=O)NO)cc1                                 587
                                                                           ... 
COc1nc(C)cnc1NS(=O)(=O)c1cccnc1-c1ccc(-c2nnco2)cc1                            1
CC(=O)[C@@H](C#N)C(=O)Nc1cc(Br)ccc1Br                                         1
O=NN(CCCl)C(=O)NCCCl                                                          1
C[C@]1(c2nc3cccc(C(N)=O)c3[nH]2)CCCN1                                         1
CC(O)(CS(=O)(=O)c1ccc(F)cc1)C(=O)Nc1ccc(C#N)c(C(F)(F)F)c1                     1
Name: smiles, Length: 237, dtype: int64

In [33]:
drug_latent_n=56
cell_line_latent_n=50
train_x=np.array(train_dat.iloc[:,4:(4+drug_latent_n+cell_line_latent_n)])
test_x=np.array(test_dat.iloc[:,4:(4+drug_latent_n+cell_line_latent_n)])
val_x=np.array(val_dat.iloc[:,4:(4+drug_latent_n+cell_line_latent_n)])
train_y=np.array(train_dat.iloc[:,2])
test_y=np.array(test_dat.iloc[:,2])
val_y=np.array(val_dat.iloc[:,2])

In [34]:
reg = LinearRegression().fit(train_x, train_y)
test_pred_y=reg.predict(test_x)

print(np.corrcoef(test_pred_y,test_y))


[[1.         0.73355263]
 [0.73355263 1.        ]]


In [35]:
input1=Input(shape=(drug_latent_n+cell_line_latent_n,))
x= Dense(2048,activation='relu')(input1)
x = Dropout(0.2)(x)
x = Dense(2048,activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(2048,activation='sigmoid')(x)
x = Dropout(0.2)(x)
IC50_val = Dense(1,activation='linear')(x)
model = Model(inputs = input1, outputs = IC50_val)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss',
                                  factor = 0.2,
                                  patience = 6,
                                  min_lr = 0.0001)
early_stop=tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min',restore_best_weights=True,patience=10)        
sgd = optimizers.SGD(lr=0.1, decay=0, momentum=0.9, nesterov=True)
rms = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
ada = optimizers.Adagrad(lr=0.01, epsilon=None, decay=0.0)
adaD = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-8, decay=0.0)
adam = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0.0)
model.compile(optimizer="adam", loss='mean_squared_error')
      
his = model.fit(train_x, 
                train_y, epochs=10,
                batch_size=128,
                shuffle=True, 
                callbacks=[reduce_lr,early_stop],validation_data=(val_x,val_y),
                verbose=2)
model.save_weights('best1_test.hdf5')


# predicted_y=model.predict([OH,x2])
train_pred_y=model.predict(train_x)
test_pred_y=model.predict(test_x)
print(np.corrcoef(train_y,train_pred_y.T))
print(np.corrcoef(test_y,test_pred_y.T))



Train on 29615 samples, validate on 3697 samples
Epoch 1/10
 - 28s - loss: 8.9422 - val_loss: 4.4778
Epoch 2/10
 - 28s - loss: 4.1620 - val_loss: 3.3838
Epoch 3/10
 - 28s - loss: 3.0265 - val_loss: 2.7370
Epoch 4/10
 - 28s - loss: 2.6626 - val_loss: 2.1271
Epoch 5/10
 - 28s - loss: 2.4890 - val_loss: 2.4582
Epoch 6/10
 - 28s - loss: 2.3795 - val_loss: 2.0932
Epoch 7/10
 - 28s - loss: 2.3044 - val_loss: 2.0193
Epoch 8/10
 - 28s - loss: 2.2088 - val_loss: 1.9678
Epoch 9/10
 - 28s - loss: 2.1649 - val_loss: 2.0017
Epoch 10/10
 - 28s - loss: 2.1510 - val_loss: 1.9677
[[1.         0.92668173]
 [0.92668173 1.        ]]
[[1.         0.92415155]
 [0.92415155 1.        ]]


In [36]:
train_dat['predicted_IC50']=train_pred_y
test_dat['predicted_IC50']=test_pred_y
train_dat.to_csv('train_dat_test.csv')
test_dat.to_csv('test_dat_test.csv')

In [66]:
len(test_drug_y)

6413