## ioChem-BD for computational FAIR Data
Through this notebook, we leverage the `py_iochem` library to connect to the ioChem-BD database 
and retrieve calculation data automatically, showcasing a way to simplify the batch management
of CompChem information

### Getting information
ioChem-BD stores information in the form of CML files: here, we will search a collection and download all
the files it contains.

The collection can be accessed at [ioChem-BD](https://iochem-bd.iciq.es/browse/handle/100/93451): take a look at its structure and contents in the web interface and then check how the Python API streamlines information management.

In [1]:
import os

def py_mkdir(folder_name):
	'''Basic function mimicking mkdir -p inside of Python, required to build
	folder structures
	Input:
	- folder_name. String, name of the folder.
	Output:
	- 0 if folder is created, 1 elsewhere
	- Generates requestd folder in working dir'''
	# check if it exists, and if not, build
	if (os.path.exists(folder_name) == False):
		os.makedirs(folder_name)
		return 0
	else:
		return 1

import ase.io
import ase.visualize
from io import StringIO

def xyz_string_to_ase(xyz_string):
    '''Reads a XYZ-formatted block, of the form
    Natoms
    
    C x0 y0 z0
    H x1 y1 z1
    ...
    into an ASE.Atoms object    
    '''
    Nat = len(xyz_string.split("\n"))
    xyz_string = "%d\n\n%s" % (Nat,xyz_string)
    ff = StringIO(xyz_string)
    ase_mol = ase.io.read(ff,format="xyz")
    return ase_mol

def visualize_xyz_string(xyz_string):
    '''Loads a X3D-based visualization from ASE for a target
    XYZ string of the form 
    Natoms
    
    C x0 y0 z0
    H x1 y1 z1
    ...
    '''
    ase_mol = xyz_string_to_ase(xyz_string)
    view = ase.visualize.view(ase_mol,viewer='x3d')
    # hack-ish line to remove shininess
    view.data = view.data.replace("specularColor=\"0.5 0.5 0.5\"","specularColor=\"0 0 0\"").replace('height:100%;','')
    # There are issues with expanding cells: patching CSS height for proper visualization
    css_blk = """<style>
    .x3dom-canvas {height: 400px;}
    </style>"""

    html_code = view.data
    html_code = html_code.replace("</script>","</script>%s\n")
    html_code = html_code % css_blk
    view.data = html_code
    return view

In [2]:
from py_iochem import RESTAPIManager

# REST URL
rest_url = "https://iochem-bd.iciq.es/rest"

# Collection identifier 
handle = "100/93451"

# Create the object that manages the connection
iochem = RESTAPIManager.CollectionHandler(rest_url,handle,token=None,service="Browse")

# Get all the items in the collection
iochem.get_items()

for item in iochem.itemList:
    print(item)

Built item list with 34 entries
{'id': 94575, 'name': '/common H2', 'handle': '100/93452', 'type': 'item', 'expand': ['metadata', 'parentCollection', 'parentCollectionList', 'parentCommunityList', 'bitstreams', 'all'], 'lastModified': '2025-03-26 11:46:28.521', 'parentCollection': None, 'parentCollectionList': None, 'parentCommunityList': None, 'bitstreams': None, 'archived': 'true', 'withdrawn': 'false', 'link': '/RESTapi/items/94575', 'metadata': None, 'isTarget': True}
{'id': 94576, 'name': '/common ete', 'handle': '100/93453', 'type': 'item', 'expand': ['metadata', 'parentCollection', 'parentCollectionList', 'parentCommunityList', 'bitstreams', 'all'], 'lastModified': '2025-03-26 11:46:28.699', 'parentCollection': None, 'parentCollectionList': None, 'parentCommunityList': None, 'bitstreams': None, 'archived': 'true', 'withdrawn': 'false', 'link': '/RESTapi/items/94576', 'metadata': None, 'isTarget': True}
{'id': 94577, 'name': '/common prod', 'handle': '100/93454', 'type': 'item', 

In [3]:
# We get a lot of information from each entry in the list: we can focus on the id, name and handle
for item in iochem.itemList:
    sel_item = [item[prop] for prop in ["id","name","handle"]]
    print(sel_item)

[94575, '/common H2', '100/93452']
[94576, '/common ete', '100/93453']
[94577, '/common prod', '100/93454']
[94578, '/common CO', '100/93455']
[94579, '/common PMe3', '100/93456']
[94580, '/1L I8_1L', '100/93457']
[94581, '/1L I2t_1L', '100/93458']
[94582, '/1L I3_1L', '100/93459']
[94583, '/1L I7_1L', '100/93460']
[94584, '/1L I4_1L', '100/93461']
[94585, '/1L TS4c_1L', '100/93462']
[94586, '/1L TS4t_1L', '100/93463']
[94587, '/1L I9c_1L', '100/93464']
[94588, '/1L I6_1L', '100/93465']
[94589, '/1L I9t_1L', '100/93466']
[94590, '/1L I5_1L', '100/93467']
[94591, '/1L TS2_1L', '100/93468']
[94592, '/1L TS1_1L', '100/93469']
[94593, '/1L TS3_1L', '100/93470']
[94594, '/1L I1_1L', '100/93471']
[94595, '/1L I2c_1L', '100/93472']
[94596, '/0L I4_0L', '100/93473']
[94597, '/0L TS3_0L', '100/93474']
[94598, '/0L I1_0L', '100/93475']
[94599, '/0L I7_0L', '100/93476']
[94600, '/0L TS1_0L', '100/93477']
[94601, '/0L I5_0L', '100/93478']
[94602, '/0L I9_0L', '100/93479']
[94603, '/0L I2_0L', '100

In [4]:
# In general, these identifiers enable us to build py-iochem to build queries to the API to retrieve information
# Now we will get all the input and output files in the collection
target_dir = "cml_files"
py_mkdir(target_dir)

tracking_info,_ = iochem.get_files(outdir=target_dir)

Starting download:
0 calculations downloaded
20 calculations downloaded


In [5]:
# The information about the retrieved files is stored in the tracking_info variable, containing:
# Calculation name, calculation handle, path to downloaded file and name of original file in the platform
for entry in tracking_info:
    print(entry)

['/common H2', '100/93452', 'cml_files/H2.in', 'H2.in']
['/common H2', '100/93452', 'cml_files/calc_100_93452.cml', 'output.cml']
['/common ete', '100/93453', 'cml_files/ete.in', 'ete.in']
['/common ete', '100/93453', 'cml_files/calc_100_93453.cml', 'output.cml']
['/common prod', '100/93454', 'cml_files/prod.in', 'prod.in']
['/common prod', '100/93454', 'cml_files/calc_100_93454.cml', 'output.cml']
['/common CO', '100/93455', 'cml_files/CO.in', 'CO.in']
['/common CO', '100/93455', 'cml_files/calc_100_93455.cml', 'output.cml']
['/common PMe3', '100/93456', 'cml_files/PMe3.in', 'PMe3.in']
['/common PMe3', '100/93456', 'cml_files/calc_100_93456.cml', 'output.cml']
['/1L I8_1L', '100/93457', 'cml_files/I8_1L.in', 'I8_1L.in']
['/1L I8_1L', '100/93457', 'cml_files/calc_100_93457.cml', 'output.cml']
['/1L I2t_1L', '100/93458', 'cml_files/I2t_1L.in', 'I2t_1L.in']
['/1L I2t_1L', '100/93458', 'cml_files/calc_100_93458.cml', 'output.cml']
['/1L I3_1L', '100/93459', 'cml_files/I3_1L.in', 'I3_1L.in

In [6]:
# We can now look at the directory, which contains CML and IN files 
os.listdir(target_dir)

['TS2_1L.in',
 'calc_100_93472.cml',
 'calc_100_93469.cml',
 'I7_0L.in',
 'calc_100_93452.cml',
 'I9t_1L.in',
 'CO.in',
 'calc_100_93453.cml',
 'calc_100_93463.cml',
 'I2t_1L.in',
 'prod.in',
 'TS3_1L.in',
 'calc_100_93476.cml',
 'calc_100_93458.cml',
 'calc_100_93485.cml',
 'PMe3.in',
 'TS1_1L.in',
 'TS4_0L.in',
 'calc_100_93460.cml',
 'I6_0L.in',
 'calc_100_93484.cml',
 'I1_1L.in',
 'calc_100_93468.cml',
 'calc_100_93462.cml',
 'I9c_1L.in',
 'calc_100_93465.cml',
 'I5_1L.in',
 'calc_100_93466.cml',
 'TS4c_1L.in',
 'I6_1L.in',
 'calc_100_93455.cml',
 'calc_100_93456.cml',
 'TS4t_1L.in',
 'calc_100_93475.cml',
 'H2.in',
 'calc_100_93481.cml',
 'calc_100_93478.cml',
 'I8_1L.in',
 'ete.in',
 'calc_100_93470.cml',
 'calc_100_93454.cml',
 'I1_0L.in',
 'calc_100_93482.cml',
 'calc_100_93477.cml',
 'TS2_0L.in',
 'calc_100_93459.cml',
 'I3_1L.in',
 'calc_100_93457.cml',
 'calc_100_93473.cml',
 'I8_0L.in',
 'calc_100_93479.cml',
 'calc_100_93474.cml',
 'I2_0L.in',
 'I2c_1L.in',
 'calc_100_9346

In [7]:
# Preview of an input file
target_input = "cml_files/CO.in"
with open(target_input,"r") as f1:
    preview = f1.readlines()

print("Contents of the input file:")
print("".join(preview))

Contents of the input file:
%nprocshared=12
%mem=22GB
%chk=CO.chk
#p opt=(maxstep=5) freq pseudo=read WB97XD gen scrf(smd,solvent=toluene)

Auto-generated job for CO

0 1
C   0.000000   0.000000  -0.641554
O   0.000000   0.000000   0.481166

O C 0
def2tzvp
****





In [8]:
# Preview of an output file
target_cml = "cml_files/calc_100_93455.cml"
with open(target_cml,"r") as f1:
    preview = f1.readlines()[0:50]

print("Contents of the CML file:")
print("".join(preview))

Contents of the CML file:
<?xml version="1.0" encoding="UTF-8"?>
<module xmlns="http://www.xml-cml.org/schema"
        xmlns:cc="http://www.xml-cml.org/dictionary/compchem/"
        xmlns:cml="http://www.xml-cml.org/schema"
        xmlns:cmlx="http://www.xml-cml.org/schema/cmlx"
        xmlns:compchem="http://www.xml-cml.org/dictionary/compchem/"
        xmlns:convention="http://www.xml-cml.org/convention/"
        xmlns:g="http://www.iochem-bd.org/dictionary/gaussian/"
        xmlns:nonsi="http://www.xml-cml.org/unit/nonSi/"
        xmlns:nonsi2="http://www.iochem-bd.org/unit/nonSi2/"
        xmlns:si="http://www.xml-cml.org/unit/si/"
        xmlns:xi="http://www.w3.org/2001/XInclude"
        xmlns:xsd="http://www.w3.org/2001/XMLSchema"
        convention="convention:compchem"
        id="gaussian.log">
   <module dictRef="cc:jobList" id="jobList1">
      <module cmlx:templateRef="job" dictRef="cc:job" id="job">
         <module dictRef="cc:environment" id="environment">
            <

In [9]:
# Of course, directly reading CML is not a very attractive option...
# and py-iochem provides tools to simplify this
from py_iochem import CMLtoPy
parsed_info = CMLtoPy.xslt_parsing_saxon(target_cml)

# We get a dict of dicts -> organized as Gaussian jobs, in different jobs
print(parsed_info.keys())

# First job is optimization, second is frequency calculation
# What properties do we have?
print("Properties of frequency calculation job:")
print(parsed_info["job2"])

dict_keys(['general', 'job1', 'job2'])
Properties of frequency calculation job:
{'method': 'RwB97XD', 'basis': 'Gen', 'initialCoordinates': 'C 0.0000 0.0000 -0.6416\nO 0.0000 0.0000 0.4812', 'electronicEnergy': '-113.317213690', 'electronicEnergyUnits': 'Eh', 'zeroPointEnergyCorr': '0.005119', 'zeroPointEnergyCorrUnits': 'Eh', 'thermalEnergyCorr': '0.007480', 'thermalEnergyCorrUnits': 'Eh', 'enthalpyCorr': '0.008424', 'enthalpyCorrUnits': 'Eh', 'gibbsFreeEnergyCorr': '-0.013994', 'gibbsFreeEnergyCorrUnits': 'Eh', 'zeroPointEnergy': '-113.312094', 'zeroPointEnergyUnits': 'Eh', 'thermalEnergy': '-113.309734', 'thermalEnergyUnits': 'Eh', 'enthalpy': '-113.308789', 'enthalpyUnits': 'Eh', 'gibbsFreeEnergy': '-113.331207', 'gibbsFreeEnergyUnits': 'Eh', 'entropy': '47.182', 'entropyTransl': '35.923', 'entropyRot': '11.259', 'entropyVib': '0.000', 'thermalEnergyTransl': '0.889', 'thermalEnergyRot': '0.592', 'thermalEnergyVib': '3.213', 'entropyUnits': 'nonsi2', 'thermalContributionUnits': 'non

In [10]:
# Thus, we get the most relevant information about the calculation in a format that can be easily manipulated with Python

# Knowing the kind of calculations that we have, we could read all the CML files and get the electronic and free energies of our molecules
# Using tracking_info to select the CML files AND map them to the names
tracking_cmls = [line for line in tracking_info if ".cml" in line[2]]

thermo_info = []
for line in tracking_cmls:
    name,handle,file,orig_file = line
    # We will clean the name, which preserves the directory structure of the collection
    name_clean = name.split()[1]
    
    # Read and parse
    parsed_info = CMLtoPy.xslt_parsing_saxon(file)
    freq_job_info = parsed_info["job2"]

    energy = freq_job_info["electronicEnergy"]
    free_energy = freq_job_info["gibbsFreeEnergy"]

    print(name_clean,energy,free_energy)

    thermo_info.append([name_clean,float(energy),float(free_energy)])

H2 -1.17570886736 -1.177112
ete -78.5903957788 -78.560594
prod -193.165544543 -193.107883
CO -113.317213690 -113.331207
PMe3 -461.120246160 -461.036119
I8_1L -992.224952677 -992.050200
I2t_1L -799.058923293 -798.962268
I3_1L -877.673542524 -877.522946
I7_1L -1104.38411846 -1104.219443
I4_1L -877.688550621 -877.535586
TS4c_1L -992.205135296 -992.029842
TS4t_1L -992.208858813 -992.033928
I9c_1L -992.236920178 -992.063625
I6_1L -991.043106239 -990.883605
I9t_1L -992.235662106 -992.062346
I5_1L -991.028717104 -990.870195
TS2_1L -991.006875155 -990.849961
TS1_1L -877.648580414 -877.500750
TS3_1L -992.211166874 -992.038312
I1_1L -912.410851406 -912.308227
I2c_1L -799.063875866 -798.966912
I4_0L -529.868585958 -529.818144
TS3_0L -644.387535554 -644.315904
I1_0L -564.590217977 -564.589347
I7_0L -756.561500122 -756.500663
TS1_0L -529.835285663 -529.789112
I5_0L -643.211282819 -643.156458
I9_0L -644.417860607 -644.348169
I2_0L -451.243273721 -451.247194
I6_0L -643.221506035 -643.164819
I3_0L -52

FAIR principles: our data is Findable, Accessible, Interoperable and Reusable

We can retrieve key parameters (energy, free energy... and many others) for a set of calculations very easily
and with very few lines of code

Data becomes much more open, and we could, for instance, employ these energies
to build visualizations or models with them

In [11]:
import pandas as pd

# Get a proper DataFrame to manage our information
thermo_table = pd.DataFrame.from_records(thermo_info,columns=["name","E","G"]).set_index("name")
display(thermo_table)

Unnamed: 0_level_0,E,G
name,Unnamed: 1_level_1,Unnamed: 2_level_1
H2,-1.175709,-1.177112
ete,-78.590396,-78.560594
prod,-193.165545,-193.107883
CO,-113.317214,-113.331207
PMe3,-461.120246,-461.036119
I8_1L,-992.224953,-992.0502
I2t_1L,-799.058923,-798.962268
I3_1L,-877.673543,-877.522946
I7_1L,-1104.384118,-1104.219443
I4_1L,-877.688551,-877.535586


In [12]:
# And now it will be very easy to work with these energies: for instance, if we define a given reaction, we can get
# its relative energy from the table

# Reaction: ethylene (ete) + CO + H2 -> propionaldehyde (prod)
reactants = ["ete","CO","H2"]
products = ["prod"]

e_reactants = thermo_table.loc[reactants,"E"]
display(e_reactants)

e_products = thermo_table.loc[products,"E"]
display(e_products)

rel_e = e_products.sum() - e_reactants.sum() 
print("===> Relative energy: %.4f hartree (%.2f kcal/mol)" % (rel_e,rel_e * 627.509))

name
ete    -78.590396
CO    -113.317214
H2      -1.175709
Name: E, dtype: float64

name
prod   -193.165545
Name: E, dtype: float64

===> Relative energy: -0.0822 hartree (-51.60 kcal/mol)


In [13]:
# We are not limited to energies: we can also get the geometries of the target structures 

geometry_info = {}
for line in tracking_cmls:
    name,handle,file,orig_file = line
    # We will clean the name, which preserves the directory structure of the collection
    name_clean = name.split()[1]

    # Read and parse
    parsed_info = CMLtoPy.xslt_parsing_saxon(file)
    freq_job_info = parsed_info["job2"]

    geometry = freq_job_info["initialCoordinates"]
    geometry_info[name_clean] = geometry


In [14]:
# Now we can check any molecule in our set:
target_name = "prod"
geo = geometry_info[target_name]
print(geo)

C -0.9260 0.4181 0.0001
H -1.5837 1.3123 0.0003
C 0.5414 0.7216 0.0003
H 0.7264 1.3644 -0.8686
H 0.7262 1.3638 0.8697
C 1.4443 -0.4966 -0.0000
H 2.4925 -0.1948 0.0002
H 1.2677 -1.1166 0.8797
H 1.2678 -1.1160 -0.8802
O -1.4065 -0.6842 -0.0003


In [15]:
# XYZ-like format: for each atom, we have the symbol and the x,y,z coordinates
# We can look at it in the notebook itself with some magic

visualize_xyz_string(geo)

In [16]:
# Imagine if we wanted to recompute all structures in the collection with a different method
# Building Gaussian inputs: keywords, title, charge/multiplicity and geometry

# Select parameters
keywords = "opt"
method = "MP2"
basis = "6-311G(d,p)"

gaussian_input_content = {}
# Build blocks for each molecule
for line in tracking_cmls:
    name,handle,file,orig_file = line
    # We will clean the name, which preserves the directory structure of the collection
    name_clean = name.split()[1]

    # Read and parse -> geometry, charge and multiplicity
    
    parsed_info = CMLtoPy.xslt_parsing_saxon(file)

    charge = int(parsed_info["general"]["charge"])
    multiplicity = int(parsed_info["general"]["multiplicity"])
    geometry = parsed_info["job2"]["initialCoordinates"]

    # Prepare the block
    gaussian_block = ""
    gaussian_block += "#p %s %s %s\n\n" % (keywords,method,basis)
    gaussian_block += "Automated job for %s\n\n" % name_clean
    gaussian_block += "%d %d\n" % (charge,multiplicity)
    gaussian_block += geometry

    gaussian_input_content[name_clean] = gaussian_block

In [17]:
print(gaussian_input_content["prod"])

# Which could be easily adapted to read more data from the original calculation (e.g., solvent and solvation model)
# to set up more complex jobs, or to use any other program



#p opt MP2 6-311G(d,p)

Automated job for prod

0 1
C -0.9260 0.4181 0.0001
H -1.5837 1.3123 0.0003
C 0.5414 0.7216 0.0003
H 0.7264 1.3644 -0.8686
H 0.7262 1.3638 0.8697
C 1.4443 -0.4966 -0.0000
H 2.4925 -0.1948 0.0002
H 1.2677 -1.1166 0.8797
H 1.2678 -1.1160 -0.8802
O -1.4065 -0.6842 -0.0003


In [18]:
# Building ORCA inputs: keywords, title, charge/multiplicity and geometry

# Select parameters
keywords = ""   # leaving empty for a single point
method = "DLPNO-CCSD(T)"
basis = "cc-PVTZ"

orca_input_content = {}
# Build blocks for each molecule
for line in tracking_cmls:
    name,handle,file,orig_file = line
    # We will clean the name, which preserves the directory structure of the collection
    name_clean = name.split()[1]

    # Read and parse -> geometry, charge and multiplicity
    
    parsed_info = CMLtoPy.xslt_parsing_saxon(file)

    charge = int(parsed_info["general"]["charge"])
    multiplicity = int(parsed_info["general"]["multiplicity"])
    geometry = parsed_info["job2"]["initialCoordinates"]

    # Prepare the block
    orca_block = ""
    orca_block += "! %s %s %s/C %s \n" % (method,basis,basis,keywords)
    orca_block += "* xyz %d %d\n" % (charge,multiplicity)
    orca_block += geometry
    orca_block += "\n*\n"

    orca_input_content[name_clean] = orca_block

print(orca_input_content["prod"])

! DLPNO-CCSD(T) cc-PVTZ cc-PVTZ/C  
* xyz 0 1
C -0.9260 0.4181 0.0001
H -1.5837 1.3123 0.0003
C 0.5414 0.7216 0.0003
H 0.7264 1.3644 -0.8686
H 0.7262 1.3638 0.8697
C 1.4443 -0.4966 -0.0000
H 2.4925 -0.1948 0.0002
H 1.2677 -1.1166 0.8797
H 1.2678 -1.1160 -0.8802
O -1.4065 -0.6842 -0.0003
*

