In [1]:
import psi4
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

# Import CSV file containing fuels

In [2]:
df0 = pd.read_csv('FuelList1.csv')
df0.drop(['Boiling Point (F)'],axis='columns',inplace=True)
length = df0.shape[0]
df = df0[70:length]
df

Unnamed: 0,ID,Fuel,Boiling Point (K)
70,71,Methylethyl ketone,352.761111
71,72,m-xylene,412.261111
72,73,n-Butyl alcohol,390.872222
73,74,n-decane,447.261111
74,75,n-dodecane,489.372222
75,76,Neopentane,282.65
76,77,n-heptane,371.594444
77,78,n-hexadecane,560.372222
78,79,n-octane,398.761111
79,80,Nonane,424.15


# Function to query the NIH database for molecules matching the input string and then perform chemical structure identifier conversion: IUPAC to SMILES

In [3]:
from urllib.request import urlopen
from urllib.parse import quote
from rdkit.Chem import AllChem

#Chemical name to SMILES:

def toSmiles(ids):
    try:
        url = 'https://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
        ans = urlopen(url).read().decode('utf8')
        return ans
    except:
        return 'Enter a valid IUPAC name'

# Generate SMILES column and add it to existing dataframe

In [4]:
fuel_name = df['Fuel']
smiles_list = []
index = -1
for ids in fuel_name:
    index+=1
    smiles_list.append(toSmiles(ids))
    if toSmiles(ids) == 'Enter a valid IUPAC name':
        print(ids + ' failed at index ' + str(index))

# Define function that will optimize a molecule's 3D structure to obtain the structure with the lowest energy - Round 1
# **Important for desriptor calculations

In [5]:
def optimize(m):
    """ 
        Input: RDKit molecule object
        
        Optimizes molecular structure by applying the Merck molecular force field 94(MMFF94) before performing tasks:
        
        1. Calculates total number of atoms
        
        2. Coverts geometry of mol object to a string of xyz coordinates
    
    """
    m = Chem.AddHs(m)#add hydrogens to structure
    #Convert from 2D to 3D
    AllChem.EmbedMolecule(m,randomSeed=0xf00d,useExpTorsionAnglePrefs=True,useBasicKnowledge=True)#intialize molecular conformation
    AllChem.MMFFOptimizeMolecule(m,'MMFF94')#apply force field
    atoms = m.GetAtoms()
    string = "\n"
    for i, atom in enumerate(atoms):
        pos = m.GetConformer().GetAtomPosition(atom.GetIdx())
        string += "{} {} {} {}\n".format(atom.GetSymbol(), pos.x, pos.y, pos.z)
    string += "units angstrom\n"
    string += "symmetry c1\n"
    #-------------------------------------------
    totAtoms = m.GetNumAtoms()
    #-----------------------------------------------
    
    
    return totAtoms, string, m

# Calculate descriptors: Number of atoms, molecular weight, IC0, PJI3, SIC0, GATS1v, Wiener Index, Zagreb Index

In [6]:
from mordred import InformationContent
from mordred import GeometricalIndex
from mordred import Autocorrelation
from mordred import WienerIndex
from mordred import ZagrebIndex

IC0_calc = InformationContent.InformationContent(order=0)
petitjean_calc = GeometricalIndex.PetitjeanIndex3D()
SIC0_calc = InformationContent.StructuralIC(order=0)
GATS1v_calc = Autocorrelation.GATS(order=1,prop='v')
wiener_index = WienerIndex.WienerIndex()
zagreb_index1 = ZagrebIndex.ZagrebIndex(version = 1) 



IC0_list = [] # Information content index (neighborhood symmetry of 0-order)
petitjean_3D = [] #3D petitjean shape index
SIC0_list = []
GATS1v_list = []
wiener_list= [] #stores wiener indices
Z1_list = [] #stores zagreb1 indices



MW_list = [] #stores molecular weight
mol_form = [] #stores molecular formulas
N_atoms = [] #stores number of atoms
xyzgeom_list = [] #stores xyz coordinates


for smiles in smiles_list:

    mol = Chem.MolFromSmiles(smiles)
    MW = Descriptors.MolWt(mol)
    MW_list.append(MW)
    form = CalcMolFormula(mol) 
    mol_form.append(form)
    
    totN_atoms, xyzcoords, mol2 = optimize(mol)
    xyzgeom_list.append(xyzcoords)
    N_atoms.append(totN_atoms)
    
    ic0 = IC0_calc(mol2)
    IC0_list.append(ic0)
    petit_index = petitjean_calc(mol2)
    petitjean_3D.append(petit_index)
    sic0_index = SIC0_calc(mol2)
    SIC0_list.append(sic0_index)
    gats = GATS1v_calc(mol2)
    GATS1v_list.append(gats)
    wiener = wiener_index(mol2)
    wiener_list.append(wiener)
    Z1 = zagreb_index1(mol2)
    Z1_list.append(Z1)

# Add columns with new properties to existing dataframe

In [7]:
df['Molecular Formula'] = mol_form
df['N_Atoms'] = N_atoms
df['Molecular Weight (g/mol)'] = MW_list
df['IC0'] = IC0_list
df['PJ3'] = petitjean_3D
df['SIC0'] = SIC0_list
df['GATS1v'] = GATS1v_list
df['Wiener'] = wiener_list
df['Z1'] = Z1_list
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Molecular Formula'] = mol_form
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['N_Atoms'] = N_atoms
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Molecular Weight (g/mol)'] = MW_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

Unnamed: 0,ID,Fuel,Boiling Point (K),Molecular Formula,N_Atoms,Molecular Weight (g/mol),IC0,PJ3,SIC0,GATS1v,Wiener,Z1
70,71,Methylethyl ketone,352.761111,C4H8O,13,72.107,1.238901,0.340794,0.334798,1.489558,18,16.0
71,72,m-xylene,412.261111,C8H10,18,106.168,0.991076,0.415169,0.237672,1.0625,61,36.0
72,73,n-Butyl alcohol,390.872222,C4H10O,15,74.123,1.158939,0.377818,0.29664,1.61168,20,14.0
73,74,n-decane,447.261111,C10H22,32,142.286,0.896038,0.439397,0.179208,1.6,165,34.0
74,75,n-dodecane,489.372222,C12H26,38,170.34,0.899744,0.477742,0.171447,1.583333,286,42.0
75,76,Neopentane,282.65,C5H12,17,72.151,0.873981,0.49582,0.21382,1.7,16,20.0
76,77,n-heptane,371.594444,C7H16,23,100.205,0.886541,0.46002,0.195983,1.642857,56,22.0
77,78,n-hexadecane,560.372222,C16H34,50,226.448,0.904381,0.428114,0.160242,1.5625,680,58.0
78,79,n-octane,398.761111,C8H18,26,114.232,0.890492,0.411098,0.189449,1.625,84,26.0
79,80,Nonane,424.15,C9H20,29,128.259,0.893571,0.486952,0.183939,1.611111,120,30.0


# Import molecular geometries from cartesian xyz coordinates

In [8]:
psi4_mols = []
for geo in xyzgeom_list:
    psi4mol = psi4.geometry(geo)
    psi4_mols.append(psi4mol)
    
E_list = []
wfn_list = []
homo_list = []
lumo_list = []
DM_list = []
const = psi4.constants.dipmom_au2debye #atomic units to Debye conversion factor for dipoles

# Molecular structure optimization round 2 + calculate HOMO, LUMO, and DM

In [9]:
psi4.set_options({'reference': 'uhf'})

k=70
for molec in psi4_mols: #iterate over list of molecular geometries
    
    E, wfn= psi4.optimize("B3LYP/6-31G*", molecule = molec,return_wfn=True) #returns energy and wavefunction of molecule
    
    E_list.append(E)
    wfn_list.append(wfn)
    
    HOMO = ( np.array(wfn.epsilon_a_subset("AO", "ALL")) )[wfn.nalpha()-1]
    LUMO = ( np.array(wfn.epsilon_a_subset("AO", "ALL")) )[wfn.nalpha()]
    dipole_xyz = wfn.variable("SCF DIPOLE")
    dipole_debye = np.linalg.norm(dipole_xyz) *const
  
    homo_list.append(HOMO)
    lumo_list.append(LUMO)
    DM_list.append(dipole_debye)
    
    k+=1
    strID = "fuel"+str(k)+ ".xyz"
    molec.save_xyz_file(strID,1) #write final optimized geometry to XYZ file
    print("Fuel "+str(k)+" done, "+ "k: "+str(k))

Optimizer: Optimization complete!
Fuel 71 done, k: 71
Optimizer: Optimization complete!
Fuel 72 done, k: 72
Optimizer: Optimization complete!
Fuel 73 done, k: 73
Optimizer: Optimization complete!
Fuel 74 done, k: 74
Optimizer: Optimization complete!
Fuel 75 done, k: 75
Optimizer: Optimization complete!
Fuel 76 done, k: 76
Optimizer: Optimization complete!
Fuel 77 done, k: 77
Optimizer: Optimization complete!
Fuel 78 done, k: 78
Optimizer: Optimization complete!
Fuel 79 done, k: 79
Optimizer: Optimization complete!
Fuel 80 done, k: 80
Optimizer: Optimization complete!
Fuel 81 done, k: 81
Optimizer: Optimization complete!
Fuel 82 done, k: 82
Optimizer: Optimization complete!
Fuel 83 done, k: 83
Optimizer: Optimization complete!
Fuel 84 done, k: 84
Optimizer: Optimization complete!
Fuel 85 done, k: 85
Optimizer: Optimization complete!
Fuel 86 done, k: 86
Optimizer: Optimization complete!
Fuel 87 done, k: 87
Optimizer: Optimization complete!
Fuel 88 done, k: 88
Optimizer: Optimization comp

# Add HOMO, LUMO, DM lists to dataframe and save as CSV file

In [22]:
df.drop(df.index[0:69])
df.reset_index(drop=True,inplace=True)

In [23]:
df2 = pd.DataFrame(list(zip(homo_list,lumo_list,DM_list)), columns=['HOMO','LUMO','DM'])
df2

Unnamed: 0,HOMO,LUMO,DM
0,-0.243434,-0.009696,2.689507
1,-0.229545,0.00733,0.309809
2,-0.258675,0.078305,1.72999
3,-0.293026,0.090717,0.053636
4,-0.29124,0.081033,0.073399
5,-0.314737,0.075818,0.000123
6,-0.299338,0.091101,0.045526
7,-0.291898,0.076586,0.085244
8,-0.299494,0.091258,0.055305
9,-0.297976,0.083187,0.04762


In [25]:
df3 = pd.concat([df,df2], axis=1)
df3

Unnamed: 0,ID,Fuel,Boiling Point (K),Molecular Formula,N_Atoms,Molecular Weight (g/mol),IC0,PJ3,SIC0,GATS1v,Wiener,Z1,HOMO,LUMO,DM
0,71,Methylethyl ketone,352.761111,C4H8O,13,72.107,1.238901,0.340794,0.334798,1.489558,18,16.0,-0.243434,-0.009696,2.689507
1,72,m-xylene,412.261111,C8H10,18,106.168,0.991076,0.415169,0.237672,1.0625,61,36.0,-0.229545,0.00733,0.309809
2,73,n-Butyl alcohol,390.872222,C4H10O,15,74.123,1.158939,0.377818,0.29664,1.61168,20,14.0,-0.258675,0.078305,1.72999
3,74,n-decane,447.261111,C10H22,32,142.286,0.896038,0.439397,0.179208,1.6,165,34.0,-0.293026,0.090717,0.053636
4,75,n-dodecane,489.372222,C12H26,38,170.34,0.899744,0.477742,0.171447,1.583333,286,42.0,-0.29124,0.081033,0.073399
5,76,Neopentane,282.65,C5H12,17,72.151,0.873981,0.49582,0.21382,1.7,16,20.0,-0.314737,0.075818,0.000123
6,77,n-heptane,371.594444,C7H16,23,100.205,0.886541,0.46002,0.195983,1.642857,56,22.0,-0.299338,0.091101,0.045526
7,78,n-hexadecane,560.372222,C16H34,50,226.448,0.904381,0.428114,0.160242,1.5625,680,58.0,-0.291898,0.076586,0.085244
8,79,n-octane,398.761111,C8H18,26,114.232,0.890492,0.411098,0.189449,1.625,84,26.0,-0.299494,0.091258,0.055305
9,80,Nonane,424.15,C9H20,29,128.259,0.893571,0.486952,0.183939,1.611111,120,30.0,-0.297976,0.083187,0.04762


In [26]:
df3.to_csv('FuelDescriptors3.csv', encoding='utf-8', index=False)