# Create Training Data Set for VAE

In [11]:
import numpy as np
import pandas as pd
import os
import sys
mypath = os.path.dirname(os.path.dirname(os.path.abspath('num_molecules.db')))
sys.path.insert(1, mypath)
from scripts.FindCombinations import findCombinations

### Import num_molecules.db

In [12]:
molecules = {}
with open(mypath+'/data/num_molecules.db', 'r') as f:
    for line in f:
        keys = line.split(':')[0]
        keys = tuple(map(int, keys.split(',')))
        values = line.split(':')[1].strip('\n')
        values = list(values.split(','))
        molecules[keys] = values

In [13]:
# Need to add CO2, CO, O2, H2, H2O
#molecules[(1, 2, 0)] = molecules[(1, 2, 0)].append('C(=O)=O')
molecules[(1, 2, 0)] = ['C(=O)=O'] # CO2
molecules[(0, 1, 2)] = ['O'] # H2O
molecules[(1, 1, 0)] = ['[C-]#[O+]'] # CO
molecules[(0, 0, 2)] = ['[HH]'] # H2
molecules[(0, 2, 0)] = ['O=O'] # O2

In [14]:
combinations = findCombinations(molecules, 'CCOCCO', seperateSmileStrings=True)
#print(combinations) # (4, 2, 10)
print()
print('Number of combinations:', len(combinations))


Number of combinations: 223


In [15]:
df = pd.read_csv(mypath+'/data/shuffled_database.csv', skiprows=1, names=['id', 'smiles', 'U', 'H', 'F'])

'''
Add data for [HH], O=O, C(=O)=O, [C-]#[O+]
TO DO: Fill out the actual value for [HH], O=O, C(=O)=O, [C-]#[O+]
'''
df = df.append({'id' : 119865, 'smiles' : '[HH]', 'U' : 0, 'H' : 0, 'F' : 0} , ignore_index=True)
df = df.append({'id' : 119866, 'smiles' : 'O=O', 'U' : 0, 'H' : 0, 'F' : 0} , ignore_index=True)
df = df.append({'id' : 119867, 'smiles' : 'C(=O)=O', 'U' : 0, 'H' : 0, 'F' : 0} , ignore_index=True)
df = df.append({'id' : 119868, 'smiles' : '[C-]#[O+]', 'U' : 0, 'H' : 0, 'F' : 0} , ignore_index=True)
df.tail()

Unnamed: 0,id,smiles,U,H,F
119862,24734,Nc1occ2c1OCC2,-437.888934,-437.88799,-437.928548
119863,119865,[HH],0.0,0.0,0.0
119864,119866,O=O,0.0,0.0,0.0
119865,119867,C(=O)=O,0.0,0.0,0.0
119866,119868,[C-]#[O+],0.0,0.0,0.0


In [16]:
# Find the delta U, F, H for each combination, put them in a list called data
data = []
for comb in combinations:
    deltaU = 0
    deltaH = 0
    deltaF = 0
    for molecule in comb:
        row = df.loc[df['smiles'] == molecule]
        try:
            deltaU += float(row['U'])
            deltaH += float(row['H'])
            deltaF += float(row['F'])
        except:
            print(molecule)
    data.append([comb, deltaU, deltaH, deltaF])
    #print(comb, ': deltaU =', deltaU, 'deltaH =', deltaH, 'deltaF =', deltaF)

In [17]:
# Rewrite the combination string into a single joint SMILE
for row in data:
    comb = row[0] # combination is placed at the 0th index
    jointSMILE = comb[0]
    for i in range(1,len(comb)):
        jointSMILE += '.' + comb[i]
    row[0] = jointSMILE

data

[['O=O.CC(C)C', -158.336603, -158.335658, -158.370016],
 ['O=O.CCCC', -158.33517, -158.334226, -158.36893999999995],
 ['C.C1COCO1', -308.745077, -308.74318800000003, -308.80089000000004],
 ['C.COCC=O', -308.74309500000004, -308.741205, -308.80171700000005],
 ['C.CC(=O)CO', -308.771146, -308.769257, -308.829904],
 ['C.OCC1CO1', -308.719553, -308.717664, -308.775825],
 ['C.O=CCCO', -308.75266400000004, -308.75077500000003, -308.81093500000003],
 ['C.OC1COC1', -308.719296, -308.717407, -308.77516900000006],
 ['C.CC(O)C=O', -308.757567, -308.755678, -308.815579],
 ['C.CCOC=O', -308.778053, -308.77616400000005, -308.83603600000004],
 ['C.OC1CC1O', -308.721914, -308.720025, -308.777531],
 ['C.COC(C)=O', -308.791113, -308.789223, -308.84981100000005],
 ['C=O.CCOC', -308.729331, -308.727443, -308.787167],
 ['C=O.CCCO', -308.736241, -308.734353, -308.794198],
 ['C=O.CC(C)O', -308.742494, -308.740606, -308.79993099999996],
 ['CO.OC1CC1', -308.70617200000004, -308.70428300000003, -308.76356499999

In [18]:
# Reformat into a database and output to a csv file
joint_SMILE_df = pd.DataFrame(data, columns=['Joint SMILE', 'U', 'H', 'F'])
joint_SMILE_df.to_csv(mypath+'/data/joint_SMILE_db.csv', sep='\t')