# Exergy Calculator
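#### The input must be an Excel (.xlsx) or CSV file with a "SMILES" column.
###### With the default settings a "CASN" column is also used as the chemical identifier. The file can also include "ID" and "Formula" columns.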

In [None]:
import sys, pathlib
from rdkit import Chem
from rdkit.Chem import Draw
import pandas as pd

from Exergy_Bones.group_loader import load_smarts_groups_from_excel
import glob
from padelpy import padeldescriptor
# Collect the fingerprint XML definition files in sorted (deterministic)
# order, because they are paired with FP_list purely by position below.
xml_files = sorted(glob.glob("./Exergy_Bones/fingerprints_xml/*.xml"))

# Fingerprint names, listed in the order expected to match the sorted
# XML file paths.
FP_list = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure',
]

# Fingerprint name -> XML file path. zip() stops at the shorter sequence,
# so a missing XML file silently shortens (and may misalign) this mapping.
fp = {fp_name: xml_path for fp_name, xml_path in zip(FP_list, xml_files)}

# SMARTS group definitions, loaded with the loader's default arguments.
smarts_db = load_smarts_groups_from_excel()

Read the data file

In [2]:
# Path to the input table (.xlsx or .csv).
file_path = "./Data/processed/Exergy_Clean_Test.csv"  # full dataset
# test sample: ./Data/raw/Test_sample_excel.xlsx

# Dispatch on the file name's real extension(s) rather than on a substring
# of the whole path, so a dot-containing directory or a name such as
# "x.xlsx.csv" cannot select the wrong reader. The comparison is
# case-insensitive, and compressed CSVs (e.g. "x.csv.gz") are still routed
# to read_csv.
_suffixes = [s.lower() for s in pathlib.Path(file_path).suffixes]
if _suffixes[-1:] == ['.xlsx']:
    df = pd.read_excel(file_path)
elif '.csv' in _suffixes:
    df = pd.read_csv(file_path)
else:
    # Fail here with a clear message instead of leaving `df` undefined and
    # hitting an unrelated NameError in a later cell.
    raise ValueError(
        f"Unsupported input file type for {file_path!r}; expected .xlsx or .csv"
    )

# Column holding the SMILES strings.
smilecol_name = 'SMILES'
# Column holding the chemical identifier used to match PaDEL output rows.
chemidcol_name = 'CASN'

Generate the PaDEL descriptors

In [3]:
# Export the SMILES and identifier columns as a headerless, tab-separated
# .smi file, which is the molecule input handed to PaDEL below.
smi_path = './Data/exergy_test.smi'
df[[smilecol_name, chemidcol_name]].to_csv(smi_path, sep='\t', index=False, header=False)

# Options shared by every PaDEL run in the loop.
padel_options = dict(
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    threads=2,
    removesalt=True,
    log=True,
    fingerprints=True,
)

# One PaDEL run per fingerprint type; each run writes its own CSV file.
for fingerprint in ('AtomPairs2D', 'AtomPairs2DCount'):
    fingerprint_output_file = f'./Data/exergy_test_{fingerprint}.csv'
    # XML descriptor definition registered for this fingerprint name.
    fingerprint_descriptortypes = fp[fingerprint]

    padeldescriptor(
        mol_dir=smi_path,
        d_file=fingerprint_output_file,
        descriptortypes=fingerprint_descriptortypes,
        **padel_options,
    )

In [3]:
# Names of all groups defined in the SMARTS database.
group_names = [group.name for group in smarts_db.groups]

# PaDEL outputs written by the fingerprint generation step.
padelcount = pd.read_csv('./Data/exergy_test_AtomPairs2DCount.csv')
padelhit = pd.read_csv('./Data/exergy_test_AtomPairs2D.csv')

# Start every "<group> (calc)" column at zero; rows that are skipped below
# keep this default.
for gname in group_names:
    df[gname + " (calc)"] = 0

for idx, row in df.iterrows():
    smi = row[smilecol_name]
    casrn = row[chemidcol_name]
    # Only analyze rows that carry a non-blank SMILES string.
    if isinstance(smi, str) and smi.strip():
        # PaDEL rows whose 'Name' matches this chemical's identifier.
        count_rows = padelcount[padelcount['Name'] == casrn]
        hit_rows = padelhit[padelhit['Name'] == casrn]
        res = smarts_db.analyze_smiles(smi, count_rows, hit_rows)
        # Each result entry exposes "name" and "count"; groups missing from
        # the result are recorded as 0.
        count_map = {r["name"]: r["count"] for r in res}
        for gname in group_names:
            df.at[idx, gname + " (calc)"] = count_map.get(gname, 0)

# Attach the exergy columns using the project's existing routine.
from Exergy_Bones.exergy_calc import attach_exergy_columns

df_exergy = attach_exergy_columns(df, smarts_db)



Verify against the benchmarking set

In [4]:
# Put the input's non-"calc" columns side by side with the "calc" columns
# of the exergy result so the two can be compared column against column.
reference_cols = [col for col in df.columns if 'calc' not in col]
calculated_cols = [col for col in df_exergy.columns if 'calc' in col]
combined = pd.concat([df[reference_cols], df_exergy[calculated_cols]], axis=1)

In [None]:
# Print the name of every group (1..78) whose benchmark column differs
# from its calculated column in at least one row.
print("Group contains incorrect results")
for i in range(1, 79):
    col_true = f'Group {i}'
    col_cal = f'Group {i} (calc)'
    mismatch = combined[col_true] != combined[col_cal]
    subdf = combined[mismatch]
    if not subdf.empty:
        print(col_true)

Group 19
Group 26
Group 30
Group 52
Group 66


In [10]:
# Show the rows where one group's benchmark and calculated counts
# disagree, together with the columns that identify the chemical.
groupindex = 19
col_true = f'Group {groupindex}'
col_cal = f'Group {groupindex} (calc)'
mismatch = combined[col_true] != combined[col_cal]
display_cols = [col_true, col_cal, 'SMILES', 'CASN', 'Chemical Name']
combined[mismatch][display_cols]

Unnamed: 0,Group 19,Group 19 (calc),SMILES,CASN,Chemical Name
1920,4,1,CC(C)(C)CC(C)(C)S,141-59-3,tert-octyl mercaptan
2107,4,1,CC(C)(C)CC(C)(C)C1=CC=C(C=C1)O,140-66-9,p-tert-octylphenol


In [30]:
# Display only the columns of the exergy result whose name contains 'calc'.
calc_columns = [col for col in df_exergy.columns if 'calc' in col]
df_exergy[calc_columns]

Unnamed: 0,Group 1 (calc),Group 2 (calc),Group 3 (calc),Group 4 (calc),Group 5 (calc),Group 6 (calc),Group 7 (calc),Group 8 (calc),Group 9 (calc),Group 10 (calc),...,Group 69 (calc),Group 70 (calc),Group 71 (calc),Group 72 (calc),Group 73 (calc),Group 74 (calc),Group 75 (calc),Group 76 (calc),Group 77 (calc),Group 78 (calc)
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2605,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2607,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Write the result table to an Excel file without the index column.
# The path is relative to the project, matching the "./Data/..." paths used
# by the rest of the notebook, rather than a user-specific absolute path
# that only exists on one machine.
output_path = "./Data/processed/Test_output.xlsx"
df_exergy.to_excel(output_path, index=False)

  df_exergy.to_excel(output_path, index=False)
