In [1]:
import psi4
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

# Import CSV file containing fuels

In [2]:
# Read the fuel table from disk; the bare name on the last line renders it
# as the cell output.
df = pd.read_csv(filepath_or_buffer='FuelList2.csv')
df

Unnamed: 0,ID,Fuel,Boiling Point (K)
0,103,Methane,111.65
1,104,Triptane,353.95
2,105,"2,2-Dimethylpropane",282.65
3,106,"2,2-Dimethylbutane",322.85
4,107,Vinylacetylene,278.25
5,108,Methylacetylene,249.95
6,109,"1,3-Butadiene",268.75
7,110,1-Heptyne,372.85
8,111,2-Pentene,309.883
9,112,Diisobutylene,374.35


# Function to query the NCI/NIH Chemical Identifier Resolver (cactus.nci.nih.gov) for the molecule matching the input string and convert its chemical structure identifier: chemical name (e.g. IUPAC) to SMILES

In [3]:
from urllib.request import urlopen
from urllib.parse import quote
from rdkit.Chem import AllChem

#Chemical name to SMILES:

def toSmiles(ids, timeout=30):
    """Resolve a chemical name to a SMILES string via the NCI CACTUS web service.

    Parameters
    ----------
    ids : str
        Chemical name to look up. It is URL-quoted before being placed in the
        request path, so names containing commas or spaces are safe to pass.
    timeout : float, optional
        Seconds to wait for the web service before giving up (default 30).
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    str
        The SMILES text returned by the service (surrounding whitespace
        removed), or the sentinel string 'Enter a valid IUPAC name' when the
        lookup fails for any reason (unknown name, network error, non-string
        input, undecodable response).
    """
    try:
        url = 'https://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
        # The context manager guarantees the HTTP response is closed even if
        # reading or decoding fails.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf8').strip()
    except Exception:
        # Deliberately broad so that any lookup failure maps to the sentinel,
        # but unlike a bare `except:` this lets KeyboardInterrupt and
        # SystemExit propagate, so a long batch of lookups can be interrupted.
        return 'Enter a valid IUPAC name'

# Generate a SMILES string for each fuel in the dataframe and collect them in a list

In [4]:
# Resolve every fuel name to a SMILES string and report the names that the
# lookup could not resolve.
fuel_name = df['Fuel']
smiles_list = []
for index, ids in enumerate(fuel_name):
    # Query the web service exactly once per fuel and reuse the answer: a
    # second request doubles the network traffic and can disagree with the
    # value that was stored if the service is flaky.
    result = toSmiles(ids)
    smiles_list.append(result)
    if result == 'Enter a valid IUPAC name':
        print(ids + ' failed at index ' + str(index))

# Define function that will optimize a molecule's 3D structure to obtain the structure with the lowest energy - Round 1
# **Important for descriptor calculations**

In [5]:
def optimize(m):
    """Build a force-field-relaxed 3D structure for a molecule and export it as text.

    Explicit hydrogens are added, a 3D conformer is embedded with a fixed
    random seed, and that conformer is relaxed with the Merck molecular force
    field 94 (MMFF94).

    Parameters
    ----------
    m : RDKit molecule object
        The input molecule; it is not modified (a hydrogen-added copy is
        produced by Chem.AddHs and used from then on).

    Returns
    -------
    totAtoms : int
        Total number of atoms, including the added hydrogens.
    string : str
        Geometry block: a leading blank line, one "symbol x y z" line per
        atom, then the lines "units angstrom" and "symmetry c1".
    m : RDKit molecule object
        The hydrogen-added molecule carrying the relaxed conformer.

    Raises
    ------
    ValueError
        If no 3D conformer could be embedded for the molecule.
    """
    m = Chem.AddHs(m)  # add hydrogens to structure

    # Convert from 2D to 3D: initialise the molecular conformation.
    # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
    conf_id = AllChem.EmbedMolecule(
        m,
        randomSeed=0xf00d,
        useExpTorsionAnglePrefs=True,
        useBasicKnowledge=True,
    )
    if conf_id < 0:
        # Fail here with a clear message instead of an obscure conformer
        # lookup error further down.
        raise ValueError(
            "Could not embed a 3D conformer for molecule " + Chem.MolToSmiles(m)
        )

    AllChem.MMFFOptimizeMolecule(m, 'MMFF94')  # apply force field

    # Fetch the conformer once rather than once per atom.
    conformer = m.GetConformer()
    lines = [""]  # empty first entry reproduces the leading newline
    for atom in m.GetAtoms():
        pos = conformer.GetAtomPosition(atom.GetIdx())
        lines.append("{} {} {} {}".format(atom.GetSymbol(), pos.x, pos.y, pos.z))
    lines.append("units angstrom")
    lines.append("symmetry c1")
    string = "\n".join(lines) + "\n"

    totAtoms = m.GetNumAtoms()

    return totAtoms, string, m

# Calculate descriptors: Number of atoms, molecular weight, IC0, PJI3, SIC0, GATS1v, Wiener Index, Zagreb Index

In [6]:
from mordred import InformationContent
from mordred import GeometricalIndex
from mordred import Autocorrelation
from mordred import WienerIndex
from mordred import ZagrebIndex

# Mordred descriptor calculators; each one is called directly on a molecule.
IC0_calc = InformationContent.InformationContent(order=0)
petitjean_calc = GeometricalIndex.PetitjeanIndex3D()
SIC0_calc = InformationContent.StructuralIC(order=0)
GATS1v_calc = Autocorrelation.GATS(order=1, prop='v')
wiener_index = WienerIndex.WienerIndex()
zagreb_index1 = ZagrebIndex.ZagrebIndex(version=1)

# Result lists, one entry per fuel, in the same order as smiles_list.
IC0_list = []       # information content index (neighborhood symmetry of 0-order)
petitjean_3D = []   # 3D Petitjean shape index
SIC0_list = []      # structural information content, order 0
GATS1v_list = []    # GATS autocorrelation, order 1, property 'v'
wiener_list = []    # Wiener indices
Z1_list = []        # Zagreb (version 1) indices

MW_list = []        # molecular weights
mol_form = []       # molecular formulas
N_atoms = []        # number of atoms (hydrogens included)
xyzgeom_list = []   # xyz coordinate blocks of the force-field geometries

for position, smiles in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for text it cannot parse (for example
        # the failure message from the name lookup). Stop with a message that
        # identifies the entry instead of failing inside a descriptor call.
        raise ValueError(
            "Could not parse SMILES {!r} for the fuel at position {}; "
            "fix the name-to-SMILES lookup for this fuel before computing "
            "descriptors".format(smiles, position)
        )

    # Properties taken from the molecule as parsed (no explicit hydrogens).
    MW_list.append(Descriptors.MolWt(mol))
    mol_form.append(CalcMolFormula(mol))

    # Hydrogen-added, force-field-relaxed 3D structure.
    totN_atoms, xyzcoords, mol2 = optimize(mol)
    xyzgeom_list.append(xyzcoords)
    N_atoms.append(totN_atoms)

    # Descriptors are evaluated on the relaxed 3D molecule.
    IC0_list.append(IC0_calc(mol2))
    petitjean_3D.append(petitjean_calc(mol2))
    SIC0_list.append(SIC0_calc(mol2))
    GATS1v_list.append(GATS1v_calc(mol2))
    wiener_list.append(wiener_index(mol2))
    Z1_list.append(zagreb_index1(mol2))

# Add columns with new properties to existing dataframe

In [7]:
# Column label -> per-fuel values; insertion order fixes the column order.
descriptor_columns = {
    'Molecular Formula': mol_form,
    'N_Atoms': N_atoms,
    'Molecular Weight (g/mol)': MW_list,
    'IC0': IC0_list,
    'PJ3': petitjean_3D,
    'SIC0': SIC0_list,
    'GATS1v': GATS1v_list,
    'Wiener': wiener_list,
    'Z1': Z1_list,
}
# Attach each list to the existing dataframe as a new column.
for column_label, column_values in descriptor_columns.items():
    df[column_label] = column_values
df

Unnamed: 0,ID,Fuel,Boiling Point (K),Molecular Formula,N_Atoms,Molecular Weight (g/mol),IC0,PJ3,SIC0,GATS1v,Wiener,Z1
0,103,Methane,111.65,CH4,5,16.043,0.721928,0.387628,0.310918,2.5,0,0.0
1,104,Triptane,353.95,C7H16,23,100.205,0.886541,0.373455,0.195983,1.642857,42,30.0
2,105,"2,2-Dimethylpropane",282.65,C5H12,17,72.151,0.873981,0.49582,0.21382,1.7,16,20.0
3,106,"2,2-Dimethylbutane",322.85,C6H14,20,86.178,0.881291,0.37567,0.203912,1.666667,28,24.0
4,107,Vinylacetylene,278.25,C4H4,8,52.076,1.0,0.392134,0.333333,1.0,10,10.0
5,108,Methylacetylene,249.95,C3H4,7,40.065,0.985228,0.465948,0.350945,1.166667,4,6.0
6,109,"1,3-Butadiene",268.75,C4H6,10,54.092,0.970951,0.377314,0.292285,1.25,10,10.0
7,110,1-Heptyne,372.85,C7H12,19,96.173,0.949452,0.471906,0.223509,1.357143,56,22.0
8,111,2-Pentene,309.883,C5H10,15,70.135,0.918296,0.47465,0.235045,1.5,20,14.0
9,112,Diisobutylene,374.35,C8H16,24,112.216,0.918296,0.466897,0.200284,1.5,66,34.0


# Import molecular geometries from cartesian xyz coordinates

In [8]:
# One Psi4 molecule per geometry block, in the same order as xyzgeom_list.
psi4_mols = [psi4.geometry(xyz_block) for xyz_block in xyzgeom_list]

# Empty accumulators for the quantum-chemistry results.
E_list, wfn_list = [], []
homo_list, lumo_list, DM_list = [], [], []

# Atomic units to Debye conversion factor for dipoles.
const = psi4.constants.dipmom_au2debye

# Molecular structure optimization round 2 + calculate HOMO, LUMO, and DM

In [9]:
psi4.set_options({'reference': 'uhf'})

# Start from empty result lists so that re-running this cell does not append
# a second copy of every result to lists created in an earlier cell.
E_list = []
wfn_list = []
homo_list = []
lumo_list = []
DM_list = []

# Pair each molecule with its fuel ID from the dataframe rather than a
# hard-coded counter, so output file names follow the IDs in the input table.
for k, molec in zip(df['ID'], psi4_mols):

    # Geometry optimization; returns the energy and wavefunction of the molecule.
    E, wfn = psi4.optimize("B3LYP/6-31G*", molecule=molec, return_wfn=True)

    E_list.append(E)
    wfn_list.append(wfn)

    # Build the alpha orbital-energy array once and index it twice.
    orbital_energies = np.array(wfn.epsilon_a_subset("AO", "ALL"))
    n_alpha = wfn.nalpha()
    HOMO = orbital_energies[n_alpha - 1]  # last alpha orbital counted by nalpha()
    LUMO = orbital_energies[n_alpha]      # the orbital immediately after it

    dipole_xyz = wfn.variable("SCF DIPOLE")
    dipole_debye = np.linalg.norm(dipole_xyz) * const  # magnitude, converted with const

    homo_list.append(HOMO)
    lumo_list.append(LUMO)
    DM_list.append(dipole_debye)

    strID = "fuel" + str(k) + ".xyz"
    molec.save_xyz_file(strID, 1)  # write final optimized geometry to XYZ file
    print("Fuel "+str(k)+" done, "+ "k: "+str(k))

Optimizer: Optimization complete!
Fuel 103 done, k: 103
Optimizer: Optimization complete!
Fuel 104 done, k: 104
Optimizer: Optimization complete!
Fuel 105 done, k: 105
Optimizer: Optimization complete!
Fuel 106 done, k: 106
Optimizer: Optimization complete!
Fuel 107 done, k: 107
Optimizer: Optimization complete!
Fuel 108 done, k: 108
Optimizer: Optimization complete!
Fuel 109 done, k: 109
Optimizer: Optimization complete!
Fuel 110 done, k: 110
Optimizer: Optimization complete!
Fuel 111 done, k: 111
Optimizer: Optimization complete!
Fuel 112 done, k: 112
Optimizer: Optimization complete!
Fuel 113 done, k: 113
Optimizer: Optimization complete!
Fuel 114 done, k: 114
Optimizer: Optimization complete!
Fuel 115 done, k: 115
Optimizer: Optimization complete!
Fuel 116 done, k: 116
Optimizer: Optimization complete!
Fuel 117 done, k: 117
Optimizer: Optimization complete!
Fuel 118 done, k: 118
Optimizer: Optimization complete!
Fuel 119 done, k: 119
Optimizer: Optimization complete!
Fuel 120 done,

# Add HOMO, LUMO, DM lists to dataframe and save as CSV file

In [10]:
# One row per molecule, built from the three result lists in parallel.
df2 = pd.DataFrame(
    data=list(zip(homo_list, lumo_list, DM_list)),
    columns=['HOMO', 'LUMO', 'DM'],
)
df2

Unnamed: 0,HOMO,LUMO,DM
0,-0.38906,0.117915,4e-06
1,-0.296902,0.075274,0.040989
2,-0.314737,0.075818,0.000123
3,-0.303212,0.076642,0.040468
4,-0.24119,-0.025804,0.332664
5,-0.259467,0.061985,0.686964
6,-0.22901,-0.02251,5e-06
7,-0.257193,0.055737,0.635912
8,-0.235301,0.033577,0.042378
9,-0.237243,0.024634,0.419141


In [11]:
# Place the HOMO/LUMO/DM columns to the right of the existing table
# (rows are matched on the index).
df3 = pd.concat(objs=[df, df2], axis='columns')
df3

Unnamed: 0,ID,Fuel,Boiling Point (K),Molecular Formula,N_Atoms,Molecular Weight (g/mol),IC0,PJ3,SIC0,GATS1v,Wiener,Z1,HOMO,LUMO,DM
0,103,Methane,111.65,CH4,5,16.043,0.721928,0.387628,0.310918,2.5,0,0.0,-0.38906,0.117915,4e-06
1,104,Triptane,353.95,C7H16,23,100.205,0.886541,0.373455,0.195983,1.642857,42,30.0,-0.296902,0.075274,0.040989
2,105,"2,2-Dimethylpropane",282.65,C5H12,17,72.151,0.873981,0.49582,0.21382,1.7,16,20.0,-0.314737,0.075818,0.000123
3,106,"2,2-Dimethylbutane",322.85,C6H14,20,86.178,0.881291,0.37567,0.203912,1.666667,28,24.0,-0.303212,0.076642,0.040468
4,107,Vinylacetylene,278.25,C4H4,8,52.076,1.0,0.392134,0.333333,1.0,10,10.0,-0.24119,-0.025804,0.332664
5,108,Methylacetylene,249.95,C3H4,7,40.065,0.985228,0.465948,0.350945,1.166667,4,6.0,-0.259467,0.061985,0.686964
6,109,"1,3-Butadiene",268.75,C4H6,10,54.092,0.970951,0.377314,0.292285,1.25,10,10.0,-0.22901,-0.02251,5e-06
7,110,1-Heptyne,372.85,C7H12,19,96.173,0.949452,0.471906,0.223509,1.357143,56,22.0,-0.257193,0.055737,0.635912
8,111,2-Pentene,309.883,C5H10,15,70.135,0.918296,0.47465,0.235045,1.5,20,14.0,-0.235301,0.033577,0.042378
9,112,Diisobutylene,374.35,C8H16,24,112.216,0.918296,0.466897,0.200284,1.5,66,34.0,-0.237243,0.024634,0.419141


In [12]:
# Save the combined table; index=False leaves the row index out of the file.
df3.to_csv(path_or_buf='FuelDescriptors4.csv', index=False, encoding='utf-8')