In [None]:
import os
import random

import numpy as np
import pandas as pd
from matplotlib import pylab as plt
from rdkit import DataStructs
from rdkit.Chem import AllChem as Chem
from sklearn.preprocessing import StandardScaler

import keras
import keras.backend as K
from keras import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, Dense, Flatten, BatchNormalization, Dropout, Activation
from keras.models import load_model
from keras.optimizers import Adam

In [None]:
# CSV file holding the data used to train the intensity predictor.
PREDICTOR_DATASET = './dataset/predictor_dataset.csv'

# Fraction of the samples held out for validation.
VAL_SPLIT = 0.2

# Fingerprint generation settings: Morgan radius and bit-vector length.
RADIUS, NBITS = 3, 2048

In [None]:
# Read the predictor dataset into a DataFrame. Later cells read its
# 'sequences' and 'intensity' columns.
df = pd.read_csv(PREDICTOR_DATASET)

In [None]:
class Fingerprint_Generation:
    """Per-symbol Morgan fingerprint lookup used to featurize sequences.

    Every symbol in ``smiles`` is converted once, at construction time, into a
    fixed-length Morgan bit-vector fingerprint stored in ``self.lookupfps``.
    The blank symbol ``' '`` is additionally mapped to an all-zero vector.

    Parameters
    ----------
    smiles : dict
        Maps a single-character symbol to the SMILES string of its monomer.
    radius : int, optional
        Morgan fingerprint radius (defaults to ``RADIUS``).
    nbits : int, optional
        Length of every fingerprint bit vector (defaults to ``NBITS``).

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """

    def __init__(self, smiles, radius=RADIUS, nbits=NBITS):
        self.nbits = nbits
        self.lookupfps = {}

        # Use the mapping that was passed in, not a module-level global, so the
        # class does not depend on which notebook cells have already run.
        for key, value in smiles.items():
            mol = Chem.MolFromSmiles(value)
            if mol is None:
                # MolFromSmiles signals a parse failure by returning None.
                raise ValueError(
                    'Could not parse SMILES for symbol %r: %r' % (key, value))
            self.lookupfps[key] = np.array(
                Chem.GetMorganFingerprintAsBitVect(mol, radius, nbits))

        # Blank symbol: an all-zero fingerprint sized from ``nbits`` rather
        # than from one particular table entry.
        self.lookupfps[' '] = np.zeros((nbits,))

    def seq(self, seq):
        """Encode ``seq`` as a matrix with one fingerprint row per character.

        Parameters
        ----------
        seq : str
            Sequence whose characters are all keys of ``self.lookupfps``.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(seq), nbits)``.

        Raises
        ------
        KeyError
            If ``seq`` contains a symbol that has no fingerprint.
        """
        if len(seq) == 0:
            # Keep the result 2-D so it can still be concatenated with padding.
            return np.zeros((0, self.nbits))
        return np.asarray([self.lookupfps[symbol] for symbol in seq])

In [None]:
# Symbol -> SMILES table handed to Fingerprint_Generation below.
# Every key is a single character; Fingerprint_Generation.seq() looks up each
# character of a sequence in the fingerprint table built from these entries.
# NOTE(review): the letter keys look like standard one-letter amino-acid codes
# and the digit/punctuation keys like non-canonical building blocks -- confirm
# against the dataset documentation.
lookupsmiles = {
         '2': 'NC(CSC1=C(F)C(F)=C(C(F)=C1F)C1=C(F)C(F)=C(SCC(N)C(N)=O)C(F)=C1F)C(N)=O',
         '3': 'CC(=O)CC1=CN(CCCCC(N)C(N)=O)N=N1',
         'A': 'N[C@@H](C)C(O)=O',
         'B': 'C(CN)C(=O)O',
         'X': 'C(CCC(=O)O)CCN',
         'R': 'N[C@@H](CCCNC(N)=N)C(O)=O', 
         'N': 'N[C@@H](CC(N)=O)C(O)=O', 
         'D': 'N[C@@H](CC(O)=O)C(O)=O', 
         'C': 'N[C@H](C(O)=O)CS', 
         'E': 'N[C@@H](CCC(O)=O)C(O)=O', 
         'Q': 'N[C@@H](CCC(N)=O)C(O)=O', 
         'G': 'NCC(O)=O', 
         'H': 'N[C@@H](CC1=CNC=N1)C(O)=O', 
         'I': 'N[C@@H]([C@@H](C)CC)C(O)=O', 
         'L': 'N[C@@H](CC(C)C)C(O)=O', 
         'K': 'N[C@@H](CCCCN)C(O)=O', 
         'M': 'N[C@@H](CCSC)C(O)=O', 
         'F': 'N[C@@H](CC1=CC=CC=C1)C(O)=O', 
         'P': 'O=C(O)[C@H]1NCCC1', 
         'S': 'N[C@@H](CO)C(O)=O', 
         'T': 'N[C@@H]([C@H](O)C)C(O)=O', 
         'W': 'N[C@@H](CC1=CNC2=C1C=CC=C2)C(O)=O', 
         'Y': 'N[C@@H](CC1=CC=C(O)C=C1)C(O)=O', 
         'V': 'N[C@@H](C(C)C)C(O)=O',
         '@': 'N[C@@H](CSC1=C(C(F)=C(C(F)=C1F)C2=C(C(F)=C(C(F)=C2F)SC[C@@H](C(O)=O)N)F)F)C(O)=O',
         '#': 'N[C@H](C(O)=O)CSC1=CC(SC[C@@H](N)C(O)=O)=CC(SC[C@H](N)C(O)=O)=C1'
}

# Build the fingerprint lookup once; `fp.seq(...)` is used by the
# featurization cells to encode each dataset sequence.
fp = Fingerprint_Generation(lookupsmiles)

#### Featurization

In [None]:
# Empty, column-only frames that the featurization loop fills row by row:
# X_df holds the inputs (sequence and its encoding), Y_df holds the target.
X_df, Y_df = (
    pd.DataFrame(columns=column_names)
    for column_names in (['sequence', 'feature'], ['intensity'])
)

In [None]:
# Number of rows every encoded sequence is padded to in the next cell.
features_max = 108

# Copy each dataset row into the working frames: the raw sequence, its
# fingerprint encoding, and the measured intensity.
for row in range(df.shape[0]):
    sequence = df.at[row, 'sequences']

    X_df.at[row, 'sequence'] = sequence
    X_df.at[row, 'feature'] = fp.seq(sequence)

    Y_df.at[row, 'intensity'] = df.at[row, 'intensity']

In [None]:
# Zero-pad every encoded sequence along the row axis so that all of them end
# up with exactly `features_max` rows of width NBITS.
for row in range(X_df.shape[0]):
    encoded = X_df.at[row, 'feature']
    missing_rows = features_max - len(encoded)
    zero_block = np.zeros((missing_rows, NBITS))
    X_df.at[row, 'feature'] = np.concatenate((encoded, zero_block), axis=0)

In [None]:
# Collect the padded per-sequence matrices into a single integer tensor of
# shape (n_samples, features_max, NBITS). Every slot is overwritten below, so
# the buffer does not need to be initialised.
nnX = np.empty((X_df.shape[0], features_max, NBITS), dtype=int)
for row, padded in enumerate(X_df['feature']):
    nnX[row] = padded

In [None]:
# A few intensities are missing in the spreadsheet; replace them with 0.
# This has to happen BEFORE any statistics are derived, so that the stored
# mean/std describe exactly the values the scaler is fitted on.
Y_df = Y_df.fillna(0)

# Standardise the target to zero mean and unit variance.
scaler = StandardScaler()
Y_df[['intensity']] = scaler.fit_transform(Y_df[['intensity']])

# Keep the scaler's own parameters for undoing the scaling later:
#     raw = scaled * std_Intensity + mean_Intensity
# Taking them from the fitted scaler (rather than from pandas' .mean()/.std(),
# which skip NaNs and use the sample standard deviation) guarantees that this
# formula is the exact inverse of the transform applied above.
dict_data = {}
dict_data['mean_Intensity'] = float(scaler.mean_[0])
dict_data['std_Intensity'] = float(scaler.scale_[0])

# Plain 1-D array of the scaled targets, used as the training labels.
Y_Intensity = np.asarray(Y_df['intensity'].values.tolist())

In [None]:
# Shuffle inputs and targets with one shared, fixed-seed permutation so the
# pairing between samples and labels is preserved.
# NOTE: this reorders nnX / Y_Intensity themselves; re-running the cell
# shuffles the already-shuffled arrays again.
indices = np.random.RandomState(seed=108).permutation(np.arange(nnX.shape[0]))

nnX, Y_Intensity = nnX[indices], Y_Intensity[indices]

In [None]:
# Reproduce the hold-out set that `model.fit(..., validation_split=VAL_SPLIT)`
# uses: Keras takes the trailing samples of the arrays it is given, starting
# at int(n_samples * (1 - validation_split)).
# nnX / Y_Intensity are already shuffled at this point, so they must NOT be
# indexed with `indices` again -- doing so would pick a different set of
# samples, most of which the model was trained on.
val_start = int(nnX.shape[0] * (1. - VAL_SPLIT))

nnX_valid = nnX[val_start:]
Y_Intensity_valid = Y_Intensity[val_start:]

In [None]:
# Drop the references to the intermediate frames; their contents have been
# copied into nnX / Y_Intensity and are no longer needed.
X_df = Y_df = None

#### Deep Learning Model

In [None]:
# DIM is used both as the Conv1D filter count and as the hidden Dense width;
# SIZE is the Conv1D kernel size.
DIM = 256
SIZE = 2

# Three Conv1D layers, each followed by dropout (only the second one has a
# ReLU after it), then a softplus Dense layer and a single linear output for
# the scaled intensity.
model = Sequential([
    Conv1D(DIM, SIZE, input_shape=(features_max, NBITS)),
    Dropout(0.1),
    Conv1D(DIM, SIZE),
    Dropout(0.1),
    Activation('relu'),
    Conv1D(DIM, SIZE),
    Dropout(0.1),
    Flatten(),
    Dense(DIM),
    Activation('softplus'),
    Dropout(0.1),
    Dense(1, activation='linear'),
])

# Regression objective: mean squared error, optimised with Adam.
optimizer = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(optimizer=optimizer, loss='mse')

In [None]:
# Checkpoint file name template; Keras fills in the epoch number and losses.
filepath = './model/predictor/epoch-{epoch:02d}-loss-{loss:.4f}-val_loss-{val_loss:.4f}-.hdf5'

# Make sure the checkpoint directory exists; otherwise writing the first
# checkpoint fails on a fresh checkout.
checkpoint_dir = os.path.dirname(filepath)
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Save a checkpoint only when the validation loss improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
model.summary()

# The last VAL_SPLIT fraction of the (already shuffled) data is held out by
# Keras for validation.
model.fit(nnX, Y_Intensity, batch_size=25, epochs=2, validation_split=VAL_SPLIT, 
          callbacks=callbacks_list, 
          verbose=True)

In [None]:
def _to_raw_intensity(scaled):
    """Map scaled intensities back using the statistics stored in dict_data."""
    return (scaled * dict_data['std_Intensity']) + dict_data['mean_Intensity']


# Experimental vs. predicted intensity on the validation samples.
# (An identity line, e.g. over np.linspace(0, 20, 100), can be overlaid as a
# visual reference if desired.)
plt.scatter(_to_raw_intensity(Y_Intensity_valid),
            _to_raw_intensity(model.predict(nnX_valid)))

plt.xlabel('Experimental Intensity', fontdict={'size':16})
plt.ylabel('Predicted Intensity', fontdict={'size':16})
plt.tick_params(labelsize=14)
plt.show()