# Molecule classification (hasRing problem) with an RNN model

In [30]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM
from keras.layers import Merge, Dropout
from keras.optimizers import SGD
from collections import Counter
import numpy as np

### prepare data

In [31]:
import getpass
import os
from Teemo.utils import fileops
from Teemo.examples.mol_rnn.load_data import gen_bond_atom_target_sequences
from Teemo.algorithm.utils import matrixops

# Sequence-length argument handed to gen_bond_atom_target_sequences below.
# NOTE(review): presumably the padded length of each sequence -- confirm in load_data.
max_seq_length = 100

# Input files live under the current user's home directory.
data_dir = '/home/'+getpass.getuser()+'/git_test/test_data/examples/has_ring/alogps/'
smiles_file = data_dir + 'alogps.smi'
hasRing_file = data_dir + 'alogps.hasring'

# Verify each input file on its own. Writing `assert A, B` treats B as the
# assertion *message*, so a missing hasRing_file would go unnoticed until
# the read below fails.
assert os.path.exists(smiles_file), 'missing SMILES file: {0}'.format(smiles_file)
assert os.path.exists(hasRing_file), 'missing hasRing file: {0}'.format(hasRing_file)

# Use only the first num_mols rows of each file; the first field of every
# row is the SMILES string / the integer hasRing label respectively.
num_mols = 100
smiles_data = [x[0] for x in fileops.read_file(smiles_file)][:num_mols]
hasRing_data = [int(x[0]) for x in fileops.read_file(hasRing_file)][:num_mols]
print ('hasRing_data count: {0}'.format(Counter(hasRing_data)))

# Build the bond / atom input sequences and their targets, then one-hot
# encode the targets over the 2 classes.
bond_seq, atom_seq, target_seq, sample_atom_count = gen_bond_atom_target_sequences(smiles_data, hasRing_data, max_seq_length)
target_seq = matrixops.one_hot_transformer(target_seq, 2)
# Class balance after sequence generation (argmax undoes the one-hot encoding).
print ('target_seq count: {0}'.format(Counter(np.argmax(target_seq, axis=1))))

hasRing_data count: Counter({1: 53, 0: 47})
max_atoms_count: 18, max_seq_length: 30
bond_sequences.shape: (747, 100),
atom_sequences.shape: (747, 100),
target_sequences.shape: (747, 1),
sample_atom_count: (747,)
target_seq count: Counter({1: 449, 0: 298})


### build model 1

In [32]:
# Hyper-parameters for model 1: embedding widths, vocabulary sizes for the
# atom / bond id sequences, LSTM width and number of output classes.
atom_vec_dim = 150
bond_vec_dim = 150
atom_type_size = 100
bond_type_size = 24
lstm_hidden_dim = 100
output_dim = 2

# Two input branches, one embedding each: branch_1 embeds bond ids and
# branch_2 embeds atom ids. Both are built with mask_zero=True.
branch_1 = Sequential()
branch_1.add(Embedding(input_dim=bond_type_size, output_dim=bond_vec_dim, mask_zero=True))
branch_2 = Sequential()
branch_2.add(Embedding(input_dim=atom_type_size, output_dim=atom_vec_dim, mask_zero=True))

# Concatenate the two embedded sequences, then stack
# LSTM -> Dropout(0.5) -> softmax Dense on top of the merged input.
merged = Merge([branch_1, branch_2], mode='concat')
model = Sequential()
for layer in (
    merged,
    LSTM(output_dim=lstm_hidden_dim),
    Dropout(0.5),
    Dense(output_dim=output_dim, activation='softmax'),
):
    model.add(layer)
print (model.input_shape, model.output_shape)

([(None, None), (None, None)], (None, 2))


In [33]:
# Plain SGD without momentum on categorical cross-entropy, tracking accuracy.
sgd = SGD(lr=0.01, momentum=0.0)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
# Each training sample is weighted by the reciprocal of its
# sample_atom_count entry.
inv_atom_count = 1./(sample_atom_count)
model.fit([bond_seq, atom_seq], target_seq, nb_epoch=10, sample_weight=inv_atom_count)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9c92d06190>

In [34]:
from Teemo.algorithm.utils.evaluations import classification_evaluate
from Teemo.algorithm.utils.report_funcs import classification_report
# Predict on the same inputs the model was fitted on, score the predictions
# against target_seq, and print the formatted report.
y_pred = model.predict([bond_seq, atom_seq])
evaluation = classification_evaluate(y_pred, target_seq)
res = classification_report(evaluation)
print (res)

            precision   recall      f_measure   support     
class 0     0.0         0.0         0.0         298         
class 1     0.6011      1.0         0.7508      449         
avg/total   0.3613      0.6011      0.4513      747         
matrix
            Pred        
True        class 0     class 1     
class 0     0           298         
class 1     0           449         



### build model 2

In [None]:
# Hyper-parameters for model 2: a single-input network with one embedding.
# Note that lstm_hidden_dim, output_dim and model are rebound here.
voca_size = 2300
emb_dim = 200
lstm_hidden_dim = 200
output_dim = 2

# Embedding -> LSTM (last output only) -> Dense -> softmax.
## mask_zeros: refer to https://github.com/fchollet/keras/blob/master/keras/layers/embeddings.py
model = Sequential()
for layer in (
    Embedding(input_dim=voca_size, output_dim=emb_dim, mask_zero=True),
    LSTM(output_dim=lstm_hidden_dim, return_sequences=False),
    Dense(output_dim=output_dim),
    Activation('softmax'),
):
    model.add(layer)

# Plain SGD without momentum on categorical cross-entropy, tracking accuracy.
sgd = SGD(lr=0.002, momentum=0.0)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])