In [1]:
import h5py
import xml.etree.ElementTree as ET
from chemspipy import ChemSpider
import time
import random
import concurrent.futures
from functools import partial

In [2]:
def list_groups(file_name):
    with h5py.File(file_name, 'r') as f:
        print("Groups in the file:")
        for group_name in f.keys():
            print(group_name)


In [5]:
API_KEY = 'xtzSa36effbFrBvRWT8EQmHNTlOxkm5e'
FILE_NAME = 'packages\\molecular_coordinates\\training_set.h5'
cs = ChemSpider(API_KEY)
MAX_WORKERS = 6

def save_many_structures(elements_in, elements_out, include_all=True, complexity='single', max_amount = 10):
    print("Searching ChemSpider...")
    search = cs.filter_element(elements_in, elements_out, include_all = include_all, complexity=complexity)

    while cs.filter_status(search)['status'] != 'Complete':
        time.sleep(1E-3)
    print(cs.filter_status(search))
    cids = cs.filter_results(search)
    # Define the range for the indices


    rand_mols = random.sample(cids, max_amount)
    with h5py.File(FILE_NAME, 'a') as f:
        # Create or access the group
        print("Writing h5 file")
        for idx in rand_mols:
            mol = cs.get_details(idx)
            group_name = f"{mol['id']}"
            print(group_name)
            if group_name in f:
                group = f[group_name]
                print(f"Rewriting group {group_name}")
            else:
                group = f.create_group(group_name)

            group.attrs['note'] = f"Random download set featuring {elements_in} and without {elements_out}"
            repeat = 0
            for dataset_name, data in mol.items():
            # Append run number to the dataset name
                run_dataset_name = f"{dataset_name}"
        
                # Create or overwrite the dataset within the group
                if run_dataset_name in group:
                    repeat =+ 1 
                    del group[run_dataset_name]

                group.create_dataset(run_dataset_name, data=data)
                time.sleep(1E-2)
        print(f"Data for {len(rand_mols)} molecules saved to group in {FILE_NAME} successfully with {repeat} repeats.")


    # found_ids = cs.filter_results(results)
    # for id in found_ids:

def mol2_to_xyz(mol2_str, xyz_str):
    atoms_section = False
    atoms_data = []

    for line in mol2_str:
        if line.startswith('@<TRIPOS>ATOM'):
            atoms_section = True
            continue
        if line.startswith('@<TRIPOS>BOND'):
            atoms_section = False
            break
        
        if atoms_section:
            parts = line.split()
            atom_name = parts[1]
            x = float(parts[2])
            y = float(parts[3])
            z = float(parts[4])
            atoms_data.append((atom_name, x, y, z))
    
    return atom_name, x, y, z
    

def add_to_h5(file_name, group_name, csid, data_dict, group_note=None):
    """
    Dictionary should include atom_identity, xyz coordinates, maybe mass and charge? Also simulated scattering pattern"""
    with h5py.File(file_name, 'a') as f:
        # Create or access the group
        if group_name in f:
            group = f[group_name]
        else:
            group = f.create_group(group_name)

        # Add a description of the data (if provided)
        if group_note != None:
            group.attrs['note'] = group_note

        for dataset_name, data in data_dict.items():
            # Append run number to the dataset name
            run_dataset_name = f"{dataset_name}_{csid}"
    
            # Create or overwrite the dataset within the group
            if run_dataset_name in group:
                del group[run_dataset_name]
            group.create_dataset(run_dataset_name, data=data)
        
    return(f"Data for molecule {csid} saved to group '{group_name}' in {FILE_NAME} successfully.")


def cml_to_xyz(cml_file, xyz_file):
    # Parse the CML file
    tree = ET.parse(cml_file)
    root = tree.getroot()

    # Extract atom information
    atoms = []
    for atom in root.findall(".//{http://www.xml-cml.org/schema}atom"):
        symbol = atom.get("elementType")
        x = atom.get("x3")
        y = atom.get("y3")
        z = atom.get("z3")
        atoms.append((symbol, float(x), float(y), float(z)))

    # Write the XYZ file
    with open(xyz_file, "w") as f:
        f.write(f"{len(atoms)}\n\n")  # Number of atoms and a blank line
        for atom in atoms:
            f.write(f"{atom[0]} {atom[1]:.5f} {atom[2]:.5f} {atom[3]:.5f}\n")

def print_h5_structure(name, obj):
    # Print the name of the object (group or dataset)
    print(name)
    # Print attributes of the object, if any
    for key, value in obj.attrs.items():
        print(f"  Attribute: {key} = {value}")

# Function to count groups in the HDF5 file
def count_groups(file):
    group_count = 0
    
    def count_groups_recursive(name, obj):
        nonlocal group_count
        if isinstance(obj, h5py.Group):
            group_count += 1

    # Open the HDF5 file in read mode
    with h5py.File(file, 'r') as h5file:
        # Traverse the file to count groups
        h5file.visititems(count_groups_recursive)

    return group_count


In [6]:
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = [executor.submit(save_many_structures, ['H', 'C', 'Br'], ['D'], max_amount=100) for _ in range(10)]
    for r in concurrent.futures.as_completed(results):
        print(r.result)

Searching ChemSpider...
Searching ChemSpider...
Searching ChemSpider...
Searching ChemSpider...
Searching ChemSpider...
Searching ChemSpider...
Searching ChemSpider...
Searching ChemSpider...
Searching ChemSpider...
Searching ChemSpider...
<bound method Future.result of <Future at 0x2c81afd94d0 state=finished raised ChemSpiPyUnavailableError>>
{'count': 10000, 'message': '', 'status': 'Complete'}
{'count': 10000, 'message': '', 'status': 'Complete'}
{'count': 10000, 'message': '', 'status': 'Complete'}
{'count': 10000, 'message': '', 'status': 'Complete'}
{'count': 10000, 'message': '', 'status': 'Complete'}
{'count': 10000, 'message': '', 'status': 'Complete'}
{'count': 10000, 'message': '', 'status': 'Complete'}
{'count': 10000, 'message': '', 'status': 'Complete'}
{'count': 10000, 'message': '', 'status': 'Complete'}
Writing h5 file
Writing h5 file
Writing h5 file
Writing h5 file
Writing h5 file
Writing h5 file
Writing h5 file
Writing h5 file
Writing h5 file
251045
126933
253352
345

In [7]:
num_groups = count_groups(FILE_NAME)
print(num_groups)

1753
