In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation, Dropout, Conv1D, MaxPooling1D, Flatten
import keras.backend as K
import keras
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils import to_categorical
from keras import regularizers
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from rdkit.Chem import AllChem
from rdkit import Chem

In [None]:
# Pretrained Word2Vec model used by the mol2vec featurization further down.
# NOTE(review): gensim's loader presumably unpickles this file — only load models
# from a trusted source.
WORD2VEC_MODEL_PATH = '../model_300dim.pkl'
wmodel = word2vec.Word2Vec.load(WORD2VEC_MODEL_PATH)

In [None]:
# Load the cleaned dataset, derive the binary label and compute the molecular features.
df = pd.read_csv("cleaned_pc.csv")
# Remove wrongly labeled molecules (identified by their row labels).
df = df.drop(index=[113, 396, 241, 256])
# Drop every column that contains at least one missing value.
df = df.dropna(axis=1)
# Binary target: 0 when T < 1.5, otherwise 1.
df['active'] = (df['T'] >= 1.5).astype(int)
# .copy() turns the column subset into an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df = df[['SMILES', 'AATS4s', 'TopoPSA', 'GATS8s', 'active']].copy()

# Parse every SMILES string once and reuse the molecule for both featurizations.
mols = df['SMILES'].map(Chem.MolFromSmiles)
# MolFromSmiles returns None for strings it cannot parse; fail early with a readable
# message instead of an opaque error from the fingerprint call.
unparsed = df.loc[mols.isna(), 'SMILES']
if not unparsed.empty:
    raise ValueError(
        f"RDKit could not parse {len(unparsed)} SMILES string(s), e.g. {unparsed.head().tolist()}"
    )

# 1024-bit Morgan fingerprint (radius 2, chirality-aware).
df["vec"] = mols.apply(
    lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, useChirality=True, radius=2, nBits=1024)
)
# mol2vec "sentence" of substructure identifiers up to radius 1.
df['sentence'] = mols.apply(lambda mol: MolSentence(mol2alt_sentence(mol, 1)))
# One mol2vec vector per molecule; tokens missing from the model map to 'UNK'.
df['mol2vec'] = list(sentences2vec(df['sentence'], wmodel, unseen='UNK'))

In [None]:
# Weight matrix of the pretrained Word2Vec model, used to initialise the Embedding layer.
# `wv.syn0` is a deprecated alias that gensim 4.0 removed; prefer `wv.vectors` and
# fall back to `syn0` only for gensim versions that predate the rename.
word_vectors = wmodel.wv
pretrained_weights = word_vectors.vectors if hasattr(word_vectors, "vectors") else word_vectors.syn0
vocab_size, embedding_size = pretrained_weights.shape
# Backward-compatible alias for the original (misspelled) variable name.
emdedding_size = embedding_size

In [None]:
# Target: the binary `active` label; features: the mol2vec vectors computed from the
# molecule sentences (tokens unknown to the model are mapped to 'UNK').
y = df['active']
mol_sentences = df['sentence']
X = sentences2vec(mol_sentences, wmodel, unseen='UNK')

In [None]:
# Hold out 20% of the samples for testing; stratify on y so both splits keep the
# class ratio, and fix the seed so the split is reproducible.
split_options = dict(test_size=0.2, random_state=21, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Random oversampling with a fixed seed, applied to the training split only; the
# test split is left untouched.
ros = RandomOverSampler(random_state=0)
resampled = ros.fit_resample(X_train, y_train)
X_train, y_train = resampled

In [None]:
# One-hot encode the labels into two columns.
# The ndim guards keep this cell idempotent: the names are reassigned in place, so
# without them a second run would feed the already encoded (n, 2) arrays back into
# to_categorical and silently produce (n, 2, 2) targets.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=2)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes=2)

In [None]:
# Conv1D + LSTM classifier on top of a frozen Embedding layer that is initialised
# with the pretrained Word2Vec weights; the head is a two-unit softmax.
# NOTE(review): an Embedding layer looks up integer token indices, while X appears to
# be the float output of sentences2vec — confirm this input is what is intended.
model_glove = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=300,
        input_length=300,
        weights=[pretrained_weights],
        trainable=False,
    ),
    Conv1D(filters=500, kernel_size=10, activation='relu'),
    MaxPooling1D(pool_size=10),
    Conv1D(filters=250, kernel_size=10, activation='relu'),
    Dropout(rate=0.2),
    Conv1D(filters=200, kernel_size=10, activation='relu'),
    Dropout(rate=0.2),
    LSTM(units=200),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),
])

adam_optimizer = keras.optimizers.Adam(learning_rate=1e-4)
model_glove.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

In [None]:
# Checkpoint callback: writes weights only (not the full model) to ./model, and only
# when the monitored validation loss reaches a new minimum.
checkpoint_options = {
    "filepath": "./model",
    "monitor": 'val_loss',
    "mode": 'min',
    "save_best_only": True,
    "save_weights_only": True,
}
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Train for 20 epochs on the oversampled training data, checkpointing through the
# callback defined above.
# NOTE(review): the test split doubles as validation data, so the checkpoint is chosen
# on the same samples later used for evaluation — confirm this is intended.
model_glove.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    validation_data=(X_test, y_test),
    callbacks=[model_checkpoint_callback],
)