In [2]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
from sklearn.utils import resample

from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
df = pd.read_csv('/content/drive/MyDrive/Thesis_Implementation/Dataset/toxicity.csv')

In [10]:
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df['SMILES'])
smiles_sequences = tokenizer.texts_to_sequences(df['SMILES'])
smiles_sequences = pad_sequences(smiles_sequences)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(smiles_sequences, df['Toxicity'], test_size=0.2)

smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [11]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=smiles_sequences.shape[1]))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

In [12]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc21d295670>

In [14]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', test_accuracy)

Test accuracy: 0.880917489528656


In [15]:
model.save('/content/drive/MyDrive/Thesis_Implementation/Model/toxicity.h5')

In [16]:
model = load_model('/content/drive/MyDrive/Thesis_Implementation/Model/toxicity.h5')

In [17]:
def predict_toxicity(smiles):
    sequence = tokenizer.texts_to_sequences([smiles])
    padded_sequence = pad_sequences(sequence, maxlen=smiles_sequences.shape[1], padding='post')
    prediction = model.predict(padded_sequence)
    return prediction[0][0]

In [20]:
smiles = 'OC(=O)CC(O)(CC(O)=O)C(O)=O.CC\C(=C(/C1=CC=CC=C1)C2=CC=C(OCCN(C)C)C=C2)C3=CC=CC=C3'
prediction = predict_toxicity(smiles)
print(f'The toxicity is: {prediction:.5f}')

The toxicity is: 0.30934
