In [1]:
import os, argparse
import itertools
from build_nnfps import main as build_nnfps_main
import numpy as np
import pandas as pds
import pickle, math
from preprocessing import data_preparation, _char_set, get_property, canonocalize
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import SSVAE
import torch

# Molecule fingerprint regeneration for transfer learning, if necessary

In [2]:
# Training data files to use; pick whichever combination you like.
# 'data/paper_clean_DC.csv' is skipped for now because it has many wrong and
# unphysical labels because of NLP.
csv_files = ['./data/paper_MP_IE_EA.csv',
             './data/paper_MP_clean_canonize_cut.csv',
             './data/paper_ZINC_310k.csv',
             './data/paper_clean_viscosity.csv',
             './data/paper_pubchem_fluorocarbon.csv']

# Pickle that the fingerprint builder reads from and writes to.
out_pkl = 'data/smiles2nn.pkl'

# Set to True to rebuild the fingerprint pickle from scratch.
# The existing pickle is deleted ONLY when it is about to be rebuilt; deleting
# it while the build calls were disabled left the notebook without the file
# that the pre-training option points to.
regenerate_fps = False

if regenerate_fps:
    # Start from a clean slate so stale entries do not leak into the new file.
    if os.path.isfile(out_pkl):
        os.remove(out_pkl)

    for file_idx, csv_file in enumerate(csv_files):
        # The first file gets an empty input vocabulary path; every later file
        # is given the pickle written so far as its input (presumably so the
        # builder extends it — confirm against build_nnfps).
        args = argparse.Namespace(
            csv_file=csv_file,
            input_vocab_file='' if file_idx == 0 else out_pkl,
            output_vocab_file=out_pkl,
            fp_check=False,
        )
        build_nnfps_main(args)

# Prepare Combination of Dirty Data for Training SSVAE Model

In [3]:
# Are we running the pre-training version? If yes, set pretrain to True.
pretrain = False
pretrain_pkl = 'data/smiles2nn.pkl'

# Training data files for the generative model: reuse the list defined in the
# fingerprint cell when it exists, otherwise fall back to the default set.
if 'csv_files' in locals():
    data_uri = csv_files
else:
    data_uri = ['./data/paper_MP_IE_EA.csv',
                './data/paper_MP_clean_canonize_cut.csv',
                './data/paper_ZINC_310k.csv',
                './data/paper_clean_viscosity.csv',
                './data/paper_pubchem_fluorocarbon.csv']

# Count the distinct molecules across all training files. Each SMILES string
# is passed through canonocalize() first so that duplicates collapse.
unique_smiles = set()
for csv_in in data_uri:
    unique_smiles.update(
        canonocalize(smi) for smi in pds.read_csv(csv_in)['SMILES']
    )

# Training / test split sizes (90% / 10% of the distinct molecules) and the
# validation fraction handed to data_preparation.
ntotal = len(unique_smiles)
ntrn = math.floor(0.9 * ntotal)
ntst = ntotal - ntrn
frac_val = 0.05
del unique_smiles  # only the count is needed from here on

# Data preparation; the pre-training pickle is passed only when requested.
prep_kwargs = {'frac_val': frac_val}
if pretrain:
    prep_kwargs['pretrain_uri'] = pretrain_pkl
data, scaler_Y = data_preparation(data_uri, ntrn, ntst, **prep_kwargs)

# Tag (directory prefix) of the previously dumped model to work with.
dt = '221122121322'
model_dir = 'models/'+dt
tmp_model_tag = model_dir+'/'
scaler_Y_pkl = 'preprocessed_scaler_Y.pkl'

# NOTE(review): this replaces the scaler returned by data_preparation with the
# one stored next to the dumped model — presumably so predictions are un-scaled
# with the scaler used at training time; confirm.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open(tmp_model_tag + scaler_Y_pkl, 'rb') as scaler_file:
    scaler_Y = pickle.load(scaler_file)

print('::: Data preparation completed')

::: Data preparation completed


# Load Previously Trained Model, if Needed

In [4]:
# Pre-defined hyper-parameters. The checkpoint loaded below is restored with
# load_state_dict, so these presumably have to match the values the model was
# trained with — confirm before changing them.
beta = 10000.
char_set = _char_set()
dim_z = 100
dim_h = 250
n_hidden = 3
batch_size = 100

# Checkpoint of the final model inside the tagged model directory.
save_uri = tmp_model_tag+'model_final.pth.tar'

# Instantiate the model.
model = SSVAE.TorchModel(sample_data = data, dim_z = dim_z, dim_h = dim_h,
                         n_hidden = n_hidden, batch_size = batch_size, beta = float(beta), char_set = char_set,
                         tmp_model_tag = tmp_model_tag)

# Use the first GPU when available, otherwise fall back to the CPU.
dev = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
model.initialize(torch.Tensor(data['trnY']), torch.Tensor(data['trnMask']), dev)

# map_location remaps the stored tensors onto `dev`; without it a checkpoint
# saved on a GPU cannot be loaded on a CPU-only machine.
checkpoint = torch.load(save_uri, map_location=dev)
model.load_state_dict(checkpoint['state_dict'])

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

TorchModel(
  (predictor): bi_GRU_zero(
    (GRU): GRU(43, 250, num_layers=3, batch_first=True, bidirectional=True)
    (linear_out_y): Linear(in_features=500, out_features=12, bias=True)
  )
  (encoder): bi_GRU_init(
    (GRU): GRU(86, 250, num_layers=3, batch_first=True, bidirectional=True)
    (linear_in): Linear(in_features=6, out_features=500, bias=True)
    (linear_peek): Linear(in_features=6, out_features=43, bias=True)
    (linear_out_z): Linear(in_features=500, out_features=200, bias=True)
  )
  (decoder): uni_GRU_init(
    (GRU): GRU(86, 250, num_layers=3, batch_first=True)
    (linear_in): Linear(in_features=106, out_features=250, bias=True)
    (linear_peek): Linear(in_features=106, out_features=43, bias=True)
    (linear_out_x): Linear(in_features=250, out_features=43, bias=True)
  )
  (softmax): Softmax(dim=-1)
  (sigmoid): Sigmoid()
)

# Property Prediction Performance

In [5]:
# Predict the (scaled) properties and bring the result back to NumPy.
scaled_tstY_hat = model.predict_routine(sample_data = data).detach().cpu().numpy()

# Undo the scaling one property column at a time, each with its own scaler,
# then stitch the columns back together.
unscaled_columns = []
for col in range(scaled_tstY_hat.shape[1]):
    column_block = scaled_tstY_hat[:, col:col+1]  # keep 2-D for inverse_transform
    unscaled_columns.append(scaler_Y[col].inverse_transform(column_block))
tstY_hat = np.concatenate(unscaled_columns, axis=1)

dim_y = data['dim_y']
tstY = data['tstY']
tstMask = data['tstMask']
Y_names = data['Y_names']

# Report the mean absolute error per property, using only the test rows whose
# mask entry for that property equals 1.
for j in range(dim_y):
    idx = np.flatnonzero(tstMask[:, j] == 1)
    mae_j = mean_absolute_error(tstY[idx, j], tstY_hat[idx, j])
    print('Label Name:', Y_names[j])
    print([j, mae_j])

Label Name: EA
[0, 0.1326845341430336]
Label Name: IE
[1, 0.1409745993432492]
Label Name: LogVis
[2, 0.12709771448069]
Label Name: MolWt
[3, 1.3718315230057891]
Label Name: n_F
[4, 0.012760020706849168]
Label Name: n_O
[5, 0.02006005412503683]


# Unconditional Generation

In [6]:
# Draw ten SMILES strings with sampling_unconditional() and print each one
# together with whatever get_property() reports for it.
print('Unconditional')
for draw in range(10):
    sampled_smiles = model.sampling_unconditional()
    sampled_props = get_property(sampled_smiles)
    print([draw, sampled_smiles, sampled_props])

Unconditional
[0, 'COC(=O)c1ncc(C(F)(F)F)cc1-c1ccc(F)c(F)c1F', [335.03809778400006, 3.971300000000001, 0.4735385576400655]]


RDKit ERROR: [12:36:55] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 8 26 27
[12:36:55] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 8 26 27

RDKit ERROR: 


[1, 'COc1cccc2cc(C(=O)N3CCN(c4ccccc4)CC3)c(=O)oc2c1', 'invalid']
[2, 'Cn1c(C(F)(F)Cl)nc2c(Cl)cccc2c1=O', [277.98252461199996, 2.875000000000001, 0.7521948780070166]]
[3, 'CN(Cc1cc(F)cc(F)c1)C(=O)C1CCCC1', [253.127820604, 3.113400000000002, 0.8101524203304447]]
[4, 'NC(=O)c1c(C(F)(F)F)[nH]c2c(Cl)cccc2c1=O', [290.006989772, 2.2992, 0.8452828637797497]]
[5, 'CCc1cc([N+](=O)[O-])cc(Br)c1O', [244.968755216, 2.6253000000000015, 0.6438949543829067]]
[6, 'NC(c1cc(F)ccc1Br)c1ccc(F)c(C(F)(F)F)c1F', [382.974430676, 4.933300000000001, 0.7303953204838246]]
[7, 'Cc1ccnc(N2CCN(c3cc(-c4ccc(Cl)cc4)no3)CC2)c1C#N', [379.119987876, 3.8967000000000027, 0.6870728426269869]]
[8, 'FC(F)C(F)(F)C(F)(F)C(F)(Br)C', [275.93845976800003, 3.6027000000000005, 0.5458122040660648]]
[9, 'CC(=O)c1cc(Br)c(OC(F)(F)F)cc1C=O', [309.94524081199995, 3.362800000000001, 0.6348645296825359]]


# Conditional Generation

In [7]:
print('Conditional')
# Properties to condition on, and their column indices in the model's labels.
ynames = ['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O']
yids = [Y_names.index(yname) for yname in ynames]

# Mean and standard deviation of each selected property, read from its scaler.
# These do not depend on the target values, so they are computed once here
# rather than inside the sampling loop.
ymeans = np.array([scaler_Y[yid].mean_[0] for yid in yids])
ystds = np.array([np.sqrt(scaler_Y[yid].var_[0]) for yid in yids])

# Candidate target values, one list per property in the order of `ynames`.
target_grid = [
    [0.5, 0.0],    # EA
    [7.0, 7.5],    # IE
    [0.0, -0.1],   # LogVis
    [250, 300],    # MolWt
    [4.0, 6.0],    # n_F
    [1.0, 2.0],    # n_O
]
n_samples_per_target = 5

# itertools.product varies the last property fastest, i.e. the same visiting
# order as six nested for-loops over the lists above.
for combo in itertools.product(*target_grid):
    ytargets = list(combo)
    # Standardise the targets the same way the training labels were scaled.
    ytargets_transform = ( np.array(ytargets) - ymeans ) / ystds

    print(ynames, ':', ytargets)
    for t in range(n_samples_per_target):
        smi = model.sampling_conditional(yids, ytargets_transform)
        print([t, smi, get_property(smi)])
    print('')

Conditional
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 250, 4.0, 1.0]
[0, 'Cc1ccc(OCC(F)(F)C(F)(F)Cl)cc1', [256.027805468, 3.840720000000002, 0.5874324807507978]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 250, 4.0, 1.0]
[1, 'Nc1c(F)cc(OC(F)(F)F)cc1CCl', [243.007404372, 3.045300000000001, 0.4922380929663293]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 250, 4.0, 1.0]
[2, 'CC(C)C(=O)Nc1ccc(F)c(C(F)(F)F)c1', [249.077676852, 3.439000000000002, 0.7990657028483503]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 250, 4.0, 1.0]
[3, 'CN(C)C(=O)Nc1c(F)c(F)cc(F)c1CF', [250.07292582, 2.666900000000001, 0.6348126724053753]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 250, 4.0, 1.0]
[4, 'OCc1nc(C(F)(F)F)nc2c(F)cccc12', [246.041625692, 2.2800000000000002, 0.784972977193708]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 250, 4.0, 2.0]
[0, 'CN(C)C(=O)OC1(C(F)(F)F)CCC(F)C1', [243.08824

RDKit ERROR: [12:38:17] SMILES Parse Error: extra close parentheses while parsing: Cc1n(C(F)(F)F)C(=O)OC(F)(F)F)nn1
[12:38:17] SMILES Parse Error: extra close parentheses while parsing: Cc1n(C(F)(F)F)C(=O)OC(F)(F)F)nn1
RDKit ERROR: [12:38:17] SMILES Parse Error: Failed parsing SMILES 'Cc1n(C(F)(F)F)C(=O)OC(F)(F)F)nn1' for input: 'Cc1n(C(F)(F)F)C(=O)OC(F)(F)F)nn1'
[12:38:17] SMILES Parse Error: Failed parsing SMILES 'Cc1n(C(F)(F)F)C(=O)OC(F)(F)F)nn1' for input: 'Cc1n(C(F)(F)F)C(=O)OC(F)(F)F)nn1'


[2, 'Cc1n(C(F)(F)F)C(=O)OC(F)(F)F)nn1', 'invalid']
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 250, 6.0, 2.0]
[3, 'Oc1cc(C(F)(F)F)cc(C(F)(F)F)c1O', [246.011548688, 3.1354, 0.5449595566744846]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 250, 6.0, 2.0]
[4, 'COC(=O)CCC(F)(C(F)(F)F)C(F)(F)F', [256.03342700400003, 2.772500000000001, 0.5729267498882376]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 300, 4.0, 1.0]
[0, 'Cc1noc(-c2ccc(C(F)(F)F)cc2Cl)c1C(F)F', [311.013632624, 5.259820000000001, 0.7070251517911332]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 300, 4.0, 1.0]
[1, 'Cc1cc(OC(F)(F)F)cc(F)c1I', [319.93212566, 3.6373200000000008, 0.5670448892947801]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 300, 4.0, 1.0]
[2, 'COc1cnc(C(F)(F)F)c(F)c1CBr', [286.95688879200003, 3.1430000000000016, 0.6158228204465405]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.0, 0.0, 300, 4.0, 1.0]
[3, 'Nc1

RDKit ERROR: [12:43:28] Can't kekulize mol.  Unkekulized atoms: 1 2 18
[12:43:28] Can't kekulize mol.  Unkekulized atoms: 1 2 18

RDKit ERROR: 


[1, 'Cc1cn(-c2cc(C(F)(F)F)cc(C(F)(F)F)c2)no1', 'invalid']
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, 0.0, 300, 6.0, 1.0]
[2, 'Nc1cc(OC(F)(F)F)cc(C(F)(F)F)c1CCl', [293.004210812, 3.9250000000000007, 0.5093443143929853]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, 0.0, 300, 6.0, 1.0]
[3, 'FC(F)(F)Oc1ccc(-c2ccc(C(F)(F)F)cc2)cc1', [306.047934196, 5.271000000000003, 0.6866747159445941]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, 0.0, 300, 6.0, 1.0]
[4, 'O=Cc1ccc(C(F)(F)F)c(C(F)(F)F)c1CCl', [289.99331178, 4.275500000000001, 0.45135479469340883]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, 0.0, 300, 6.0, 2.0]
[0, 'OCC(Oc1cccc(C(F)(F)F)c1)CC(F)(F)CF', [302.074148944, 3.440000000000002, 0.8157391651560885]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, 0.0, 300, 6.0, 2.0]
[1, 'O=Cc1cc(OCC(F)(F)C(F)(F)C(F)F)ccc1N', [301.05374784800006, 2.995800000000001, 0.4990692145879178]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] :

RDKit ERROR: [12:44:34] Can't kekulize mol.  Unkekulized atoms: 2 3 4 9 10
[12:44:34] Can't kekulize mol.  Unkekulized atoms: 2 3 4 9 10

RDKit ERROR: 


[2, 'NCc1cc(C(F)(F)F)nc(C(F)(F)F)o1', 'invalid']
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 250, 6.0, 1.0]
[3, 'C[C@H](N)CC(=O)NC(C(F)(F)F)C(F)(F)F', [252.06973226, 1.3331, 0.7444164054209939]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 250, 6.0, 1.0]
[4, 'Cc1cc(OC(F)(F)F)nc(C(F)(F)F)c1', [245.0275331, 3.3074200000000005, 0.7079078588518117]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 250, 6.0, 2.0]
[0, 'OCc1c(F)c(F)c(OC(F)(F)F)c(F)c1', [246.011548688, 2.4948000000000006, 0.6412837531178605]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 250, 6.0, 2.0]
[1, 'C=CCOC(=O)CC(F)(F)C(F)(F)C(F)F', [250.042848816, 2.641400000000001, 0.4112523328045644]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 250, 6.0, 2.0]
[2, 'OCc1cc(C(F)(F)F)cc(C(F)(F)F)c1O', [260.027198752, 2.922100000000001, 0.7619966870072578]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 250, 6.0, 2.0]
[3, 'Cc

RDKit ERROR: [12:45:23] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 16
[12:45:23] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 16

RDKit ERROR: 


[2, 'OCc1ccc(OCc2nnc(C(F)(F)F)n2)c(F)c1F', 'invalid']
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 300, 4.0, 2.0]
[3, 'O=S(=O)(Cc1ccc(F)cc1)c1c(F)cc(F)cc1F', [304.018113376, 3.2169000000000016, 0.6443387279782008]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 300, 4.0, 2.0]
[4, 'OC(O)(c1c(F)cccc1F)c1cc(F)c(Cl)cc1F', [306.00707002400003, 3.0821000000000005, 0.5084124290368293]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 300, 6.0, 1.0]
[0, 'OC(C(F)(F)C(F)(F)C(F)(F)F)C(Cl)(Cl)Cl', [315.905945264, 3.5504, 0.6098708811361466]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 300, 6.0, 1.0]
[1, 'OCC(c1c(F)cc(F)cc1Cl)C(F)(F)C(F)(F)F', [309.99954003199997, 3.8917, 0.8397994768265501]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 7.5, -0.1, 300, 6.0, 1.0]
[2, 'FC(F)(F)Oc1ccnc(-c2ccc(C(F)(F)F)cc2)c1', [307.043183164, 4.666000000000001, 0.7524524788537045]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.5, 

RDKit ERROR: [12:49:09] Explicit valence for atom # 11 C, 5, is greater than permitted
[12:49:09] Explicit valence for atom # 11 C, 5, is greater than permitted


[4, 'OC(O)(c1ccc(F)cc1F)C(F)(F)(F)F', 'invalid']
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.0, -0.1, 300, 4.0, 1.0]
[0, 'Cc1ccc(CC(=O)Nc2cc(F)cc(F)c2)c(F)c1F', [297.077676852, 3.7326200000000016, 0.8592905532581026]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.0, -0.1, 300, 4.0, 1.0]
[1, 'Cc1cc(C(F)(F)F)nc(Oc2cc(F)cc(Cl)c2)n1', [306.01830340400005, 4.388620000000001, 0.77190317119358]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.0, -0.1, 300, 4.0, 1.0]
[2, 'CCCc1ncc(C(F)(F)F)c(Oc2ccc(F)cc2)n1', [300.088575884, 4.379300000000002, 0.7859681173757983]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.0, -0.1, 300, 4.0, 1.0]
[3, 'CC(OCC(F)(F)C(F)F)c1ccc(Cl)cc1Cl', [304.00448317999997, 4.971400000000003, 0.697763629651831]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.0, -0.1, 300, 4.0, 1.0]
[4, 'N#Cc1ccc(OCC(F)(F)C(F)F)cc1Br', [310.95688879200003, 3.5999800000000013, 0.7959543160877326]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : 

RDKit ERROR: [12:54:06] Can't kekulize mol.  Unkekulized atoms: 2 3 8
[12:54:06] Can't kekulize mol.  Unkekulized atoms: 2 3 8

RDKit ERROR: 


[2, 'Cn1nc(C(F)(F)F)c(-c2ccc(C(F)(F)F)cc2)c1=O', 'invalid']
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.5, -0.1, 300, 6.0, 1.0]
[3, 'Cc1ccc(OCC(F)(F)C(F)(F)C(F)(F)Cl)cc1', [306.024611908, 4.476020000000002, 0.5756619727305032]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.5, -0.1, 300, 6.0, 1.0]
[4, 'Fc1cc(OC(F)(F)F)ccc1-c1cc(F)cc(F)c1', [292.03228413200003, 4.669500000000001, 0.7308402051099989]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.5, -0.1, 300, 6.0, 2.0]
[0, 'OC(Cc1ccc(OC(F)(F)F)cc1)CC(F)(F)CF', [302.074148944, 3.483500000000002, 0.8152320560510349]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.5, -0.1, 300, 6.0, 2.0]
[1, 'OCc1ccc(COCC(F)(F)C(F)(F)C(F)F)cc1', [302.074148944, 3.231200000000002, 0.7836399336441776]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O'] : [0.0, 7.5, -0.1, 300, 6.0, 2.0]
[2, 'Oc1cc(OC(F)(F)F)ccc1SCC(F)(F)F', [291.999269752, 3.9452000000000007, 0.6755165063047532]]
['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O']