In [None]:
import os, argparse
from build_nnfps import main as build_nnfps_main
import numpy as np
import pandas as pds
import pickle, math
from preprocessing import data_preparation, _char_set, get_property, canonocalize
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import SSVAE
import torch
import random

# Molecule fingerprint regeneration for transfer learning, if necessary

In [None]:
# Training data files to use; pick any combination you like.
# 'data/paper_clean_DC.csv' is left out for now: its NLP-extracted labels
# contain many wrong and unphysical values.
csv_files = ['./data/paper_MP_IE_EA.csv',
             './data/paper_MP_clean_canonize_cut.csv',
             './data/paper_ZINC_310k.csv',
             './data/paper_clean_viscosity.csv',
             './data/paper_pubchem_fluorocarbon.csv']

out_pkl = 'data/smiles2nn.pkl'

# Set to True to rebuild the fingerprint pickle from the files above.
# Deleting the old pickle and rebuilding it are gated by the same flag so
# that running this cell can never remove the pickle without replacing it.
regenerate_nnfps = False

if regenerate_nnfps:
    # Drop any stale pickle so the first file starts from an empty vocabulary.
    if os.path.isfile(out_pkl):
        os.remove(out_pkl)

    for file_idx, csv_file in enumerate(csv_files):
        args = argparse.Namespace()
        args.csv_file = csv_file
        # The first file is given no input vocabulary; every later file reads
        # the pickle written so far and writes its result to the same path.
        args.input_vocab_file = '' if file_idx == 0 else out_pkl
        args.output_vocab_file = out_pkl
        args.fp_check = False
        build_nnfps_main(args)

# Prepare Combination of Dirty Data for Training SSVAE Model

In [None]:
# Are we running the pre-training variant? If so, set pretrain to True.
pretrain = False
pretrain_pkl = 'data/smiles2nn.pkl'

# Data files used to train the generative model: reuse `csv_files` when an
# earlier cell has defined it, otherwise fall back to the default list.
if 'csv_files' in locals():
    data_uri = csv_files
else:
    data_uri = ['./data/paper_MP_IE_EA.csv',
                './data/paper_MP_clean_canonize_cut.csv',
                './data/paper_ZINC_310k.csv',
                './data/paper_clean_viscosity.csv',
                './data/paper_pubchem_fluorocarbon.csv']

# Count the distinct canonicalized SMILES across all training files.
# A set is used so the same molecule appearing in several files counts once.
unique_smiles = set()
for csv_in in data_uri:
    unique_smiles.update(canonocalize(smi) for smi in pds.read_csv(csv_in)['SMILES'])

# Define the training / test split sizes and the validation fraction.
ntotal = len(unique_smiles)
ntrn = math.floor(0.9 * ntotal)
ntst = ntotal - ntrn
frac_val = 0.05
del unique_smiles  # only the count is needed from here on

# Data preparation; `pretrain_uri` is passed only in pre-training mode so the
# function's own default applies otherwise.
prep_kwargs = {'frac_val': frac_val}
if pretrain:
    prep_kwargs['pretrain_uri'] = pretrain_pkl
data, scaler_Y = data_preparation(data_uri, ntrn, ntst, **prep_kwargs)

# Tag (directory) of the previously dumped model to work with.
dt = '221122121322'
model_dir = 'models/'+dt
tmp_model_tag = model_dir+'/'
scaler_Y_pkl = 'preprocessed_scaler_Y.pkl'

# The scaler returned by data_preparation is replaced by the one stored in the
# model directory.
# NOTE(review): presumably so that scaling matches the saved model - confirm.
# NOTE(security): pickle.load executes arbitrary code; only load trusted files.
with open(tmp_model_tag+scaler_Y_pkl, 'rb') as scaler_file:
    scaler_Y = pickle.load(scaler_file)

print('::: Data preparation completed')

# Load Previously Trained Model, if Needed

In [None]:
# Pre-defined hyper-parameters passed to the SSVAE model constructor.
beta=10000.
char_set = _char_set()
dim_z = 100
dim_h = 250
n_hidden = 3
batch_size = 100

# Checkpoint of the previously trained model inside the tagged model directory.
save_uri = tmp_model_tag+'model_final.pth.tar'

# Instantiate the model
model = SSVAE.TorchModel(sample_data = data, dim_z = dim_z, dim_h = dim_h,
                         n_hidden = n_hidden, batch_size = batch_size, beta = float(beta), char_set = char_set,
                         tmp_model_tag = tmp_model_tag)
dev = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
model.initialize(torch.Tensor(data['trnY']), torch.Tensor(data['trnMask']), dev)

# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint saved on a GPU machine can still be loaded on a CPU-only one.
checkpoint = torch.load(save_uri, map_location=dev)
model.load_state_dict( checkpoint['state_dict'] )
model.eval()

# Property Prediction Performance

In [None]:
# Predict the (scaled) properties for the test split.
scaled_tstY_hat = model.predict_routine(sample_data = data).detach().cpu().numpy()

# Undo the scaling one property column at a time; scaler_Y is indexed by the
# property's column position.
tstY_hat = np.concatenate(
    [scaler_Y[i].inverse_transform(scaled_tstY_hat[:, i:i+1])
     for i in range(scaled_tstY_hat.shape[1])],
    axis=1)

dim_y = data['dim_y']
tstY = data['tstY']
tstMask = data['tstMask']
Y_names = data['Y_names']

# Report the mean absolute error per property, using only the test rows whose
# mask entry is 1 for that property.
for j in range(dim_y):
    idx = np.where( tstMask[:,j] == 1 )[0]
    print('Label Name:', Y_names[j])
    if idx.size == 0:
        # No masked-in test rows for this property: mean_absolute_error would
        # raise on empty input and abort the report for the remaining ones.
        print([j, float('nan')])
        continue
    print([j, mean_absolute_error(tstY[idx,j], tstY_hat[idx,j])])

# Unconditional Generation

In [None]:
print('Unconditional')
# Draw ten molecules without any property conditioning and show each one
# together with the properties computed for it.
for t in range(10):
    smi = model.sampling_unconditional()
    sample_record = [t, smi, get_property(smi)]
    print(sample_record)

# Conditional Generation

In [None]:
def print_log(text, out_file='gen_log.txt'):
    """Append one line of text to the generation log.

    Args:
        text: String to write; a newline is added after it.
        out_file: Path of the log file. Defaults to 'gen_log.txt' in the
            current working directory.

    Returns:
        None.
    """
    # Mode 'a' creates the file when it does not exist yet, so no separate
    # existence check is needed; the context manager closes the handle.
    with open(out_file, 'a') as log_file:
        log_file.write(text+'\n')
    return

print('Conditional')
# Properties used for conditional generation and their column indices.
# The names are looked up in data['Y_names'] directly so this cell does not
# depend on the property-prediction cell having been run.
ynames = ['EA', 'IE', 'LogVis', 'MolWt', 'n_F', 'n_O']
yids = [data['Y_names'].index(yname) for yname in ynames]

# Candidate target values for each property, in the same order as `ynames`.
# One value per property is drawn at random for every generated molecule.
ytarget_grids = [
    [j*0.2 for j in range(11)],        # EA
    [j*0.2 + 6.0 for j in range(11)],  # IE
    [j*0.1 - 0.5 for j in range(11)],  # LogVis
    [j*10 + 150 for j in range(21)],   # MolWt
    [j*1.0 for j in range(4,10)],      # n_F
    [j*1.0 for j in range(1,4)],       # n_O
]

# Scaling statistics of the selected properties; they do not change between
# iterations, so they are computed once instead of on every pass.
ymeans = np.array([scaler_Y[yid].mean_[0] for yid in yids])
ystds = np.array([np.sqrt(scaler_Y[yid].var_[0]) for yid in yids])

n_conditional_samples = 50000
for i in range(n_conditional_samples):
    ytargets = [random.choice(grid) for grid in ytarget_grids]
    # Standardize the raw targets with the same mean/std used for the labels.
    ytargets_transform = ( np.array(ytargets) - ymeans ) / ystds

    print(ynames, ':', ytargets)
    smi = model.sampling_conditional(yids, ytargets_transform)
    props = get_property(smi)
    print([i, smi, props])
    print_log(str([i, smi, props]))