In [22]:
import psi4
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

# Import CSV file containing fuels

In [23]:
# Read the fuel table and discard the Fahrenheit boiling-point column
# (the Kelvin column is kept).
df = pd.read_csv('FuelList1.csv').drop(columns=['Boiling Point (F)'])
# Remove the rows at positions 70 through 101.
df = df.drop(index=df.index[70:102])
df

Unnamed: 0,ID,Fuel,Boiling Point (K)
0,1,"1,3,5-trimethylbenzene",437.872222
1,2,1-Butene,267.038889
2,3,1-Pentene,303.038889
3,4,"2,3-Dimethylpentane",363.150000
4,5,"2,5-dimethylhexane",382.261111
...,...,...,...
65,66,Methyl chloride,248.927778
66,67,Methyl decanoate,497.150000
67,68,Methyl formate,304.950000
68,69,Methyl methacrylate,374.150000


In [24]:
# Discard the first 39 rows and renumber the remaining rows from 0.
# NOTE: this cell rebinds `df` from itself, so running it a second time
# removes another 39 rows; re-run the CSV-loading cell first.
df = df.drop(index=df.index[:39]).reset_index(drop=True)
df

Unnamed: 0,ID,Fuel,Boiling Point (K)
0,40,Ethyl bromide,311.538889
1,41,Ethyl chloride,285.427778
2,42,Ethyl formate,327.372222
3,43,ethylbenzene,409.372222
4,44,ethylcyclohexane,402.538889
5,45,Ethylene,169.427778
6,46,Ethylene dichloride,356.65
7,47,Formaldehyde,254.15
8,48,Furfural,434.872222
9,49,Hexane,342.15


# Function to query the NIH database for molecules matching the input string and then perform chemical structure identifier conversion: IUPAC to SMILES

In [25]:
from urllib.request import urlopen
from urllib.parse import quote
from rdkit.Chem import AllChem

#Chemical name to SMILES:

def toSmiles(ids, timeout=30):
    """Resolve a chemical name to a SMILES string via the cactus.nci.nih.gov resolver.

    Parameters
    ----------
    ids : str
        Chemical name (or other identifier) to look up. It is URL-quoted
        before being placed in the request path.
    timeout : float, optional
        Seconds to wait for the web service before giving up (default 30).

    Returns
    -------
    str
        The decoded response body on success, or the sentinel string
        'Enter a valid IUPAC name' when the lookup fails for any reason
        (unknown name, network error, timeout, non-string input).
    """
    try:
        url = 'https://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
        # The context manager closes the HTTP response even if read/decode fails.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf8')
    except Exception:
        # Best-effort lookup: callers compare against this sentinel. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long batch of lookups can still be interrupted.
        return 'Enter a valid IUPAC name'

# Generate SMILES column and add it to existing dataframe

In [26]:
# Look up a SMILES string for every fuel name and report the names that the
# resolver could not handle, together with their position in the list.
fuel_name = df['Fuel']
smiles_list = []
for index, ids in enumerate(fuel_name):
    # Query the web service once per fuel and reuse the answer for both the
    # stored value and the failure check (a second request is slow and could
    # return something different from what was stored).
    resolved = toSmiles(ids)
    smiles_list.append(resolved)
    if resolved == 'Enter a valid IUPAC name':
        print(ids + ' failed at index ' + str(index))

Isocetane failed at index 13


In [27]:
# Hand-written SMILES for fuels the resolver could not look up, keyed by the
# name in the 'Fuel' column. Matching on the name instead of a fixed list
# position keeps the patch correct if the selected rows or their order change.
manual_smiles = {'Isocetane': 'CC(CC(C)(C)C)CC(C)(C)CC(C)(C)C'}
for position, name in enumerate(df['Fuel']):
    if name in manual_smiles:
        smiles_list[position] = manual_smiles[name]

# The column name is spelled 'SMIILES' (sic); it is left unchanged because it
# ends up in the exported table.
df['SMIILES'] = smiles_list
df

Unnamed: 0,ID,Fuel,Boiling Point (K),SMIILES
0,40,Ethyl bromide,311.538889,CCBr
1,41,Ethyl chloride,285.427778,CCCl
2,42,Ethyl formate,327.372222,CCOC=O
3,43,ethylbenzene,409.372222,CCc1ccccc1
4,44,ethylcyclohexane,402.538889,CCC1CCCCC1
5,45,Ethylene,169.427778,C=C
6,46,Ethylene dichloride,356.65,ClCCCl
7,47,Formaldehyde,254.15,C=O
8,48,Furfural,434.872222,O=Cc1occc1
9,49,Hexane,342.15,CCCCCC


# Define function that will optimize a molecule's 3D structure to obtain the structure with the lowest energy - Round 1
# **Important for descriptor calculations**

In [28]:
def optimize(m):
    """Generate an MMFF94-relaxed 3D geometry for a molecule and export it as text.

    Explicit hydrogens are added, a 3D conformer is embedded with a fixed
    random seed, and the conformer is relaxed with the Merck molecular force
    field (MMFF94).

    Parameters
    ----------
    m : RDKit molecule object

    Returns
    -------
    totAtoms : int
        Number of atoms in the molecule after hydrogens were added.
    string : str
        Text block that starts with a newline, lists one "symbol x y z" line
        per atom, and ends with "units angstrom" and "symmetry c1" lines.
    m : RDKit molecule object
        The hydrogen-added molecule carrying the 3D conformer.

    Raises
    ------
    ValueError
        If no 3D conformer could be embedded for the molecule.
    """
    m = Chem.AddHs(m)  # add hydrogens to structure

    # Convert from 2D to 3D: initialise a molecular conformation.
    conf_id = AllChem.EmbedMolecule(
        m,
        randomSeed=0xf00d,
        useExpTorsionAnglePrefs=True,
        useBasicKnowledge=True,
    )
    if conf_id < 0:
        # EmbedMolecule signals failure with -1; without a conformer there
        # are no coordinates to export.
        raise ValueError(
            "Could not embed a 3D conformer for molecule " + Chem.MolToSmiles(m)
        )

    # Apply the force field. A non-zero status means the geometry is not a
    # converged MMFF94 minimum (1: more iterations needed, -1: force field
    # could not be set up); the embedded geometry is still exported.
    status = AllChem.MMFFOptimizeMolecule(m, 'MMFF94')
    if status != 0:
        import warnings
        warnings.warn(
            "MMFF94 optimization did not converge (status {}) for molecule {}".format(
                status, Chem.MolToSmiles(m)
            )
        )

    conformer = m.GetConformer()  # fetched once, reused for every atom
    string = "\n"
    for atom in m.GetAtoms():
        pos = conformer.GetAtomPosition(atom.GetIdx())
        string += "{} {} {} {}\n".format(atom.GetSymbol(), pos.x, pos.y, pos.z)
    string += "units angstrom\n"
    string += "symmetry c1\n"

    totAtoms = m.GetNumAtoms()

    return totAtoms, string, m

# Calculate descriptors: Number of atoms, molecular weight, IC0, PJI3, SIC0, GATS1v, Wiener Index, Zagreb Index

In [29]:
from mordred import InformationContent
from mordred import GeometricalIndex
from mordred import Autocorrelation
from mordred import WienerIndex
from mordred import ZagrebIndex

# Mordred descriptor calculators (each is called on one molecule at a time).
IC0_calc = InformationContent.InformationContent(order=0)
petitjean_calc = GeometricalIndex.PetitjeanIndex3D()
SIC0_calc = InformationContent.StructuralIC(order=0)
GATS1v_calc = Autocorrelation.GATS(order=1,prop='v')
wiener_index = WienerIndex.WienerIndex()
zagreb_index1 = ZagrebIndex.ZagrebIndex(version = 1)

# One list per descriptor; entry i belongs to smiles_list[i].
IC0_list = [] # Information content index (neighborhood symmetry of 0-order)
petitjean_3D = [] #3D petitjean shape index
SIC0_list = [] #structural information content, order 0
GATS1v_list = [] #GATS autocorrelation, order 1, property 'v'
wiener_list= [] #stores wiener indices
Z1_list = [] #stores zagreb1 indices

MW_list = [] #stores molecular weight
mol_form = [] #stores molecular formulas
N_atoms = [] #stores number of atoms
xyzgeom_list = [] #stores xyz coordinates


for smiles in smiles_list:

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for text it cannot parse (for example the
        # 'Enter a valid IUPAC name' placeholder left by a failed lookup).
        # Stop here with the offending entry instead of failing later inside
        # a descriptor call with an unrelated-looking error.
        raise ValueError(
            "Cannot parse SMILES {!r} (position {} in smiles_list)".format(
                smiles, len(MW_list)
            )
        )

    # Molecular weight and formula come from the molecule as parsed.
    MW = Descriptors.MolWt(mol)
    MW_list.append(MW)
    form = CalcMolFormula(mol)
    mol_form.append(form)

    # Add hydrogens, build a 3D geometry and export it as a coordinate string.
    totN_atoms, xyzcoords, mol2 = optimize(mol)
    xyzgeom_list.append(xyzcoords)
    N_atoms.append(totN_atoms)

    # The remaining descriptors are evaluated on the hydrogen-added 3D molecule.
    ic0 = IC0_calc(mol2)
    IC0_list.append(ic0)
    petit_index = petitjean_calc(mol2)
    petitjean_3D.append(petit_index)
    sic0_index = SIC0_calc(mol2)
    SIC0_list.append(sic0_index)
    gats = GATS1v_calc(mol2)
    GATS1v_list.append(gats)
    wiener = wiener_index(mol2)
    wiener_list.append(wiener)
    Z1 = zagreb_index1(mol2)
    Z1_list.append(Z1)

# Add columns with new properties to existing dataframe

In [30]:
# Attach every computed property list to the fuel table as a new column.
# The dict preserves insertion order, so columns appear in the order listed.
descriptor_columns = {
    'Molecular Formula': mol_form,
    'N_Atoms': N_atoms,
    'Molecular Weight (g/mol)': MW_list,
    'IC0': IC0_list,
    'PJ3': petitjean_3D,
    'SIC0': SIC0_list,
    'GATS1v': GATS1v_list,
    'Wiener': wiener_list,
    'Z1': Z1_list,
}
for column_name, column_values in descriptor_columns.items():
    df[column_name] = column_values
df

Unnamed: 0,ID,Fuel,Boiling Point (K),SMIILES,Molecular Formula,N_Atoms,Molecular Weight (g/mol),IC0,PJ3,SIC0,GATS1v,Wiener,Z1
0,40,Ethyl bromide,311.538889,CCBr,C2H5Br,8,108.966,1.298795,0.43459,0.432932,1.028368,4,6.0
1,41,Ethyl chloride,285.427778,CCCl,C2H5Cl,8,64.515,1.298795,0.41181,0.432932,1.226676,4,6.0
2,42,Ethyl formate,327.372222,CCOC=O,C3H6O2,11,74.079,1.435371,0.347865,0.414915,1.520356,20,14.0
3,43,ethylbenzene,409.372222,CCc1ccccc1,C8H10,18,106.168,0.991076,0.458236,0.237672,1.0625,64,34.0
4,44,ethylcyclohexane,402.538889,CCC1CCCCC1,C8H16,24,112.216,0.918296,0.468125,0.200284,1.4375,64,34.0
5,45,Ethylene,169.427778,C=C,C2H4,6,28.054,0.918296,0.314547,0.355245,1.5,1,2.0
6,46,Ethylene dichloride,356.65,ClCCCl,C2H4Cl2,8,98.96,1.5,0.25923,0.5,0.886911,10,10.0
7,47,Formaldehyde,254.15,C=O,CH2O,4,30.026,1.5,0.399053,0.75,1.487738,1,2.0
8,48,Furfural,434.872222,O=Cc1occc1,C5H4O2,11,96.085,1.494919,0.418089,0.432128,0.910178,43,30.0
9,49,Hexane,342.15,CCCCCC,C6H14,20,86.178,0.881291,0.352204,0.203912,1.666667,35,18.0


# Import molecular geometries from Cartesian xyz coordinates

In [31]:
# Turn each exported coordinate string into a Psi4 molecule object,
# keeping the same order as xyzgeom_list.
psi4_mols = [psi4.geometry(geo) for geo in xyzgeom_list]

# Empty accumulators filled by the quantum-chemistry loop:
# energies, wavefunctions, HOMO/LUMO orbital energies and dipole moments.
E_list, wfn_list = [], []
homo_list, lumo_list = [], []
DM_list = []

# Atomic units to Debye conversion factor for dipoles.
const = psi4.constants.dipmom_au2debye

# Molecular structure optimization round 2 + calculate HOMO, LUMO, and DM

In [32]:
psi4.set_options({'reference': 'uhf'})

# Start from empty result lists every time this cell runs. Otherwise a re-run
# would append a second set of results and the lists would no longer line up
# row-for-row with the fuel table.
E_list = []
wfn_list = []
homo_list = []
lumo_list = []
DM_list = []

# Running label used in file names and progress messages; it is incremented
# before use, so the first molecule is labelled 40.
# NOTE(review): presumably meant to match the 'ID' column of the fuel table,
# which starts at 40 after the first 39 rows were dropped - confirm.
k=39
for molec in psi4_mols: #iterate over list of molecular geometries

    # Geometry optimization; returns the final energy and wavefunction.
    E, wfn= psi4.optimize("B3LYP/6-31G*", molecule = molec,return_wfn=True)

    E_list.append(E)
    wfn_list.append(wfn)

    # Alpha orbital energies, converted to a NumPy array once and indexed twice.
    orbital_energies = np.array(wfn.epsilon_a_subset("AO", "ALL"))
    n_alpha = wfn.nalpha()
    HOMO = orbital_energies[n_alpha-1] # highest occupied alpha orbital
    LUMO = orbital_energies[n_alpha]   # lowest unoccupied alpha orbital

    # Magnitude of the dipole vector, scaled by the au -> Debye factor.
    dipole_xyz = wfn.variable("SCF DIPOLE")
    dipole_debye = np.linalg.norm(dipole_xyz) *const

    homo_list.append(HOMO)
    lumo_list.append(LUMO)
    DM_list.append(dipole_debye)

    k+=1
    strID = "fuel"+str(k)+ ".xyz"
    molec.save_xyz_file(strID,1) #write final optimized geometry to XYZ file
    print("Fuel "+str(k)+" done, "+ "k: "+str(k))

Optimizer: Optimization complete!
Fuel 40 done, k: 40
Optimizer: Optimization complete!
Fuel 41 done, k: 41
Optimizer: Optimization complete!
Fuel 42 done, k: 42
Optimizer: Optimization complete!
Fuel 43 done, k: 43
Optimizer: Optimization complete!
Fuel 44 done, k: 44
Optimizer: Optimization complete!
Fuel 45 done, k: 45
Optimizer: Optimization complete!
Fuel 46 done, k: 46
Optimizer: Optimization complete!
Fuel 47 done, k: 47
Optimizer: Optimization complete!
Fuel 48 done, k: 48
Optimizer: Optimization complete!
Fuel 49 done, k: 49
Optimizer: Optimization complete!
Fuel 50 done, k: 50
Optimizer: Optimization complete!
Fuel 51 done, k: 51
Optimizer: Optimization complete!
Fuel 52 done, k: 52
Optimizer: Optimization complete!
Fuel 53 done, k: 53
Optimizer: Optimization complete!
Fuel 54 done, k: 54
Optimizer: Optimization complete!
Fuel 55 done, k: 55
Optimizer: Optimization complete!
Fuel 56 done, k: 56
Optimizer: Optimization complete!
Fuel 57 done, k: 57
Optimizer: Optimization comp

# Add HOMO, LUMO, DM lists to dataframe and save as CSV file

In [33]:
# Collect the per-molecule quantum-chemistry results into a table with one
# row per molecule and one column per property.
qm_rows = [
    (homo, lumo, dipole)
    for homo, lumo, dipole in zip(homo_list, lumo_list, DM_list)
]
df2 = pd.DataFrame(qm_rows, columns=['HOMO','LUMO','DM'])
df2

Unnamed: 0,HOMO,LUMO,DM
0,-0.273771,-0.005205,2.344247
1,-0.291561,0.027839,2.267049
2,-0.281113,0.007657,1.895545
3,-0.235742,0.005216,0.279909
4,-0.285769,0.080818,0.054273
5,-0.266618,0.01878,9e-06
6,-0.298933,0.006168,2.933239
7,-0.268489,-0.042115,2.18693
8,-0.249584,-0.061061,4.168316
9,-0.305762,0.093285,0.02568


In [34]:
# Place the quantum-chemistry columns next to the descriptor table;
# rows are matched on the index of each frame.
frames_to_join = [df, df2]
df3 = pd.concat(frames_to_join, axis='columns')
df3

Unnamed: 0,ID,Fuel,Boiling Point (K),SMIILES,Molecular Formula,N_Atoms,Molecular Weight (g/mol),IC0,PJ3,SIC0,GATS1v,Wiener,Z1,HOMO,LUMO,DM
0,40,Ethyl bromide,311.538889,CCBr,C2H5Br,8,108.966,1.298795,0.43459,0.432932,1.028368,4,6.0,-0.273771,-0.005205,2.344247
1,41,Ethyl chloride,285.427778,CCCl,C2H5Cl,8,64.515,1.298795,0.41181,0.432932,1.226676,4,6.0,-0.291561,0.027839,2.267049
2,42,Ethyl formate,327.372222,CCOC=O,C3H6O2,11,74.079,1.435371,0.347865,0.414915,1.520356,20,14.0,-0.281113,0.007657,1.895545
3,43,ethylbenzene,409.372222,CCc1ccccc1,C8H10,18,106.168,0.991076,0.458236,0.237672,1.0625,64,34.0,-0.235742,0.005216,0.279909
4,44,ethylcyclohexane,402.538889,CCC1CCCCC1,C8H16,24,112.216,0.918296,0.468125,0.200284,1.4375,64,34.0,-0.285769,0.080818,0.054273
5,45,Ethylene,169.427778,C=C,C2H4,6,28.054,0.918296,0.314547,0.355245,1.5,1,2.0,-0.266618,0.01878,9e-06
6,46,Ethylene dichloride,356.65,ClCCCl,C2H4Cl2,8,98.96,1.5,0.25923,0.5,0.886911,10,10.0,-0.298933,0.006168,2.933239
7,47,Formaldehyde,254.15,C=O,CH2O,4,30.026,1.5,0.399053,0.75,1.487738,1,2.0,-0.268489,-0.042115,2.18693
8,48,Furfural,434.872222,O=Cc1occc1,C5H4O2,11,96.085,1.494919,0.418089,0.432128,0.910178,43,30.0,-0.249584,-0.061061,4.168316
9,49,Hexane,342.15,CCCCCC,C6H14,20,86.178,0.881291,0.352204,0.203912,1.666667,35,18.0,-0.305762,0.093285,0.02568


In [35]:
# Write the combined table to disk as UTF-8 CSV, leaving out the row index.
df3.to_csv(
    'FuelDescriptors2.csv',
    index=False,
    encoding='utf-8',
)