<a href="https://colab.research.google.com/github/kangmg/randatoms/blob/main/notebooks/randatoms_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the released randatoms package from the package index.
# To install straight from the GitHub repository instead, pip-style VCS URLs need the `git+` prefix:
# !uv pip install git+https://github.com/kangmg/randatoms.git -q
!uv pip install randatoms

In [None]:
# List the dataset archives randatoms can find, with per-dataset summary statistics
from randatoms import available_datasets

available_datasets() # with no argument (`data_dir=None`) the package's internal `dataset/` directory is searched for *.tar files

[1;34mAvailable Datasets[0m
Dataset found : /usr/local/lib/python3.11/dist-packages/randatoms/dataset
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
     default       |     878      |   (169.9, 6723.3)    |     (10, 616)      |    9.23   


<font color=skyblue size=4>Basic Usage</font>

In [None]:
from randatoms import randomatoms
from ase.visualize import view

# Draw one random structure (no arguments) and render it inline.
# `display(...)` is needed because this is not the last expression of the cell.
atoms = randomatoms()
display(view(atoms, viewer='x3d'))

# Passing a count returns a collection of that many structures (its length is printed below)
atoms_list = randomatoms(10)
print('\nNum of random atoms: ', len(atoms_list))

In [None]:
# Conditioned sampling: request a periodic, metal-containing structure that includes Zn
crystal_with_Zn = randomatoms(is_periodic=True, has_metals=True, include_elements=['Zn'])

# Last expression of the cell, so the viewer output is rendered without an explicit display()
view(crystal_with_Zn, viewer='x3d')

<font size=5 color=skyblue> Available Options</font>

Use the filtering options below to sample `ase.Atoms` structures that match your criteria.

- `include_elements`: Structures that contain all specified elements
- `exclude_elements`: Structures that do not contain the specified elements
- `mw_range`: Molecular weight range
- `max_atoms`, `min_atoms`: Range of the number of atoms
- `is_periodic`: Whether the structure is periodic
- `has_metals`: Whether the structure contains metals
- `include_datasets`: Which dataset IDs to include

<font color=skyblue size=4>Advanced Usage</font>

In [None]:
from randatoms import DataLoader

# Initialize the loader on the default dataset
loader = DataLoader() # By default, filename='default'

# Filter query, reused by the next cell as keyword arguments.
# NOTE(review): the name `filter` shadows Python's builtin `filter` for the rest of the notebook;
# it is kept because a later cell unpacks it with `**filter`.
filter = dict(
    include_elements=['H', 'C'],
    has_metals=True,
    is_periodic=True,
    include_datasets=['peptide', 'omol25', 'rattled', 'odac', 'x23b'] # values of the `dataset` column, i.e. the source each structure came from
    )

# Print statistics for the subset of the dataset that matches the filter
loader.print_statistics(**filter)

Loading dataset from TAR archive: /usr/local/lib/python3.11/dist-packages/randatoms/dataset/default.tar
Loaded dataset with 878 structures

[1;34mDataset Statistics[0m
* Total structures:                              121
* Percentage of dataset:                      13.8 %
* Molecular weight range:            (298.7, 6723.3)
* Average atoms per structure:                 235.7
* Num. of atoms range:                     (20, 616)
* Periodic structures:                       100.0 %
* Structures with metals:                    100.0 %

[1;34mElemental Coverage in Dataset[0m
Cu:  48 structures [====      ] 39.7 %
Zn:  29 structures [==        ] 24.0 %
Co:   9 structures [=         ]  7.4 %
Ag:   8 structures [=         ]  6.6 %
S :   8 structures [=         ]  6.6 %
Cd:   7 structures [=         ]  5.8 %
Ni:   5 structures [          ]  4.1 %
Cl:   5 structures [          ]  4.1 %
F :   4 structures [          ]  3.3 %
Mn:   3 structures [          ]  2.5 %
La:   3 structures [       

In [None]:
# Get random structures that satisfy the filter defined above
atoms_list = loader.get_random_structures(5, **filter) # get 5 random structures

# Show the sampled structures
atoms_list

Loading 5 structures: 100%|██████████| 3/3 [00:00<00:00, 288.22it/s]


[Atoms(symbols='Co4H12C24S4N4O8', pbc=True, cell=[[8.74051284, 0.0, 0.0], [-2.004973886282855, 8.507446081380744, 0.0], [0.0, 0.0, 11.91452204]]),
 Atoms(symbols='C112H64Ag4N16', pbc=True, cell=[23.43921427, 23.43921427, 4.68182975]),
 Atoms(symbols='C176H128N16O32Zn8', pbc=True, cell=[[26.79537947, 0.0, 0.0], [-0.00011668762671117225, 26.795353019745928, 0.0], [1.4668375074739557e-06, -1.192814036006158e-05, 4.580032609984232]]),
 Atoms(symbols='C288H192Ag16N96', pbc=True, cell=[[22.66997141, 0.0, 0.0], [0.0, 15.20904952, 0.0], [-10.789463765267092, 0.0, 23.202966360505226]]),
 Atoms(symbols='C112H56Cu8O32', pbc=True, cell=[[18.3117523, 0.0, 0.0], [-5.41303812148311, 17.493877744884188, 0.0], [6.449397277436441, 8.746822502251275, 21.923851276327387]])]

In [None]:
# Show the first 5 rows of the dataset's metadata table
display(loader.df.head())

Unnamed: 0,index,dataset,key,molecular_weight,elements,n_atoms,formula,is_periodic,has_metals
0,0,peptide,peptide_000000,279.296,"[C, H, N, O]",37,C13H17N3O4,False,False
1,1,peptide,peptide_000001,261.281,"[C, H, N, O]",34,C13H15N3O3,False,False
2,2,peptide,peptide_000002,279.296,"[C, H, N, O]",37,C13H17N3O4,False,False
3,3,peptide,peptide_000003,279.296,"[C, H, N, O]",37,C13H17N3O4,False,False
4,4,peptide,peptide_000004,318.333,"[C, H, N, O]",41,C15H18N4O4,False,False


In [None]:
# These identifiers are the values accepted by the `include_datasets` filter option
print('unique dataset id: ', loader.df['dataset'].unique())

unique dataset id:  ['peptide' 'omol25' 'rattled' 'odac' 'x23b']


<font color=skyblue size=5>Make your own dataset</font>

In [None]:
# dummy atoms (molecules + crystals)

from ase.build import bulk, graphene_nanoribbon, molecule
from randatoms.converter import ASEtoHDF5Converter

# Small molecules taken from ASE's built-in molecule collection
molecule_list = [molecule(formula) for formula in ('H2O', 'CH4', 'C2H6', 'C6H6')]

# Periodic structures: bulk crystals repeated into supercells, plus a graphene ribbon
nacl = bulk("NaCl", "rocksalt", a=5.66).repeat((2, 2, 2))
graphene = graphene_nanoribbon(n=3, m=3, type="zigzag", C_C=1.42, vacuum=1)
graphene.set_pbc([True, False, True])  # periodic along x and z only
diamond = bulk("C", "diamond", a=3.567).repeat((3, 3, 3))
silicon = bulk("Si", "diamond", a=5.431).repeat((3, 3, 3))
mgo = bulk("MgO", "rocksalt", a=4.21).repeat((3, 3, 3))
cu = bulk("Cu", "fcc", a=3.61).repeat((3, 3, 3))
zns = bulk("ZnS", "zincblende", a=5.41).repeat((2, 2, 2))

crystal_list = [nacl, graphene, diamond, silicon, mgo, cu, zns]

# Centre every crystal inside its own cell (in place)
for atoms in crystal_list:
    atoms.center()

converter = ASEtoHDF5Converter(n_workers=2)

# Write each list to its own ./<filename>.tar archive, tagged with a dataset identifier
conversion_jobs = (
    ('\nConvert molecule list to *.tar format\n', molecule_list, 'molecules', 'molecules_from_ase'),
    ('\nConvert crystal list to *.tar format\n', crystal_list, 'crystals', 'crystals_from_ase'),
)
for banner, structures, archive_name, dataset_id in conversion_jobs:
    print('='*50, banner, "="*50, sep='')
    converter.convert_atoms_list(structures, filename=archive_name, data_dir='./', dataset_name=dataset_id)

Convert molecule list to *.tar format
Converting 4 structures...


Extracting metadata: 100%|██████████| 4/4 [00:00<00:00, 2946.47it/s]
Writing HDF5: 100%|██████████| 1/1 [00:00<00:00, 156.54it/s]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 4/4 [00:00<00:00, 4043.68it/s]


Creating TAR archive at ./molecules.tar...
[1;34m
Conversion complete! File saved as molecules.tar[0m
Convert crystal list to *.tar format
Converting 7 structures...


Extracting metadata: 100%|██████████| 7/7 [00:00<00:00, 7265.56it/s]
Writing HDF5: 100%|██████████| 1/1 [00:00<00:00, 77.14it/s]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 7/7 [00:00<00:00, 37932.98it/s]

Creating TAR archive at ./crystals.tar...
[1;34m
Conversion complete! File saved as crystals.tar[0m





In [None]:
from randatoms import available_datasets

# List the dataset archives in the current working directory (./*.tar).
# NOTE(review): the saved output below already lists `merged_dataset`, which is only created by the
# next cell — presumably left over from an earlier run; re-run from a clean directory to confirm.
available_datasets('./')

[1;34mAvailable Datasets[0m
Dataset found : /content
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
     crystals      |      7       |   (216.2, 1715.7)    |      (16, 54)      |    0.09   
  merged_dataset   |      11      |    (16.0, 1715.7)    |      (3, 54)       |    0.11   
    molecules      |      4       |     (16.0, 78.1)     |      (3, 12)       |    0.04   


In [None]:
from randatoms.merger import DatasetMerger

# Combine ./molecules.tar and ./crystals.tar into ./merged_dataset.tar
merger = DatasetMerger(merge_name_list=['molecules', 'crystals'], output_name='merged_dataset', data_dir='./') # `data_dir=None` will save {filename}.tar file into internal directory

# Print a summary of what the merge will produce
merger.merge_preview()

# Perform the merge and write the output archive
merger.merge()


[1;34mMerged Dataset Preview[0m
Datasets to merge: ['molecules', 'crystals']
Total structures: 11
Molecular weight range: (16.0, 1715.7)
Num. of atoms range: (3, 54)

[1;34mCombined Elemental Composition[0m
C : 5 structures [=====     ] 45.5%
H : 4 structures [====      ] 36.4%
O : 2 structures [==        ] 18.2%
Cl: 1 structures [=         ]  9.1%
Na: 1 structures [=         ]  9.1%
Si: 1 structures [=         ]  9.1%
Mg: 1 structures [=         ]  9.1%
Cu: 1 structures [=         ]  9.1%
Zn: 1 structures [=         ]  9.1%
S : 1 structures [=         ]  9.1%

[1;32mDatasets to merge[0m
  Dataset: [1mmolecules[0m
  - Structures: 4
  - Molecular weight range: (16.0, 78.1)
  - Num. of atoms range: (3, 12)
-------------------------------------------------------
  Dataset: [1mcrystals[0m
  - Structures: 7
  - Molecular weight range: (216.2, 1715.7)
  - Num. of atoms range: (16, 54)
Counting structures...
Merging 11 structures from 2 datasets...


Merging structures: 100%|██████████| 11/11 [00:00<00:00, 585.13it/s]


Building merged element index...


Building index: 100%|██████████| 11/11 [00:00<00:00, 43119.01it/s]

Calculating merged statistics...
Saving merged data to TAR archive...
[1;34mMerge complete! Output saved as merged_dataset.tar[0m





In [None]:
# merged_dataset = crystals + molecules (11 = 7 + 4 structures in the table below)
available_datasets('./')

[1;34mAvailable Datasets[0m
Dataset found : /content
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
     crystals      |      7       |   (216.2, 1715.7)    |      (16, 54)      |    0.09   
  merged_dataset   |      11      |    (16.0, 1715.7)    |      (3, 54)       |    0.11   
    molecules      |      4       |     (16.0, 78.1)     |      (3, 12)       |    0.04   


In [None]:

from randatoms import randomatoms

# Sample from ./merged_dataset.tar, restricted to structures whose `dataset` identifier is in the list
atoms_list = randomatoms(3, filename='merged_dataset', data_dir='./', include_datasets=['molecules_from_ase']) # available identifiers: ['crystals_from_ase', 'molecules_from_ase']

atoms_list

Loading 3 structures: 100%|██████████| 3/3 [00:00<00:00, 424.62it/s]


[Atoms(symbols='C6H6', pbc=False),
 Atoms(symbols='CH4', pbc=False),
 Atoms(symbols='C2H6', pbc=False)]

In [None]:
from randatoms import DataLoader

# Open the merged archive from the current directory
loader = DataLoader(filename='merged_dataset', data_dir='./')

# Metadata table: one row per structure; the `dataset` column keeps each structure's original identifier
loader.df

Loading dataset from TAR archive: ./merged_dataset.tar
Loaded dataset with 11 structures


Unnamed: 0,index,dataset,key,molecular_weight,elements,n_atoms,formula,is_periodic,has_metals
0,0,molecules_from_ase,molecules_from_ase_000000,18.015,"[H, O]",3,H2O,False,False
1,1,molecules_from_ase,molecules_from_ase_000001,16.043,"[C, H]",5,CH4,False,False
2,2,molecules_from_ase,molecules_from_ase_000002,30.07,"[C, H]",8,C2H6,False,False
3,3,molecules_from_ase,molecules_from_ase_000003,78.114,"[C, H]",12,C6H6,False,False
4,4,crystals_from_ase,crystals_from_ase_000000,467.518154,"[Cl, Na]",16,Cl8Na8,True,True
5,5,crystals_from_ase,crystals_from_ase_000001,216.198,[C],18,C18,True,False
6,6,crystals_from_ase,crystals_from_ase_000002,648.594,[C],54,C54,True,False
7,7,crystals_from_ase,crystals_from_ase_000003,1516.59,[Si],54,Si54,True,False
8,8,crystals_from_ase,crystals_from_ase_000004,1088.208,"[Mg, O]",54,Mg27O27,True,True
9,9,crystals_from_ase,crystals_from_ase_000005,1715.742,[Cu],27,Cu27,True,True


<font color=skyblue size=5>Build a large-scale dataset with the load_atoms package</font>

In [None]:
# Extra dependency used only in this section to download public datasets
!uv pip install load-atoms -q

In [None]:
from load_atoms import load_dataset

# Download each dataset and keep only its first 500 structures.
# NOTE(review): `_structures` is a private attribute of the object returned by `load_dataset`
# and may change between load-atoms versions — confirm whether a public accessor can be used instead.
GST_GAP_22 = load_dataset('GST-GAP-22')._structures[:500]
P_GAP_20 = load_dataset('P-GAP-20')._structures[:500]


In [None]:
from randatoms.converter import ASEtoHDF5Converter
import os

# Scratch directory that receives the converted archives
tmp_dir = 'tmp_dataset'
os.makedirs(tmp_dir, exist_ok=True)

converter = ASEtoHDF5Converter(n_workers=2)

# (structures, archive file name, dataset identifier) — converted in this order
gap_conversion_jobs = (
    (GST_GAP_22, 'tmp_GST_GAP_22', 'gst_gap'),
    (P_GAP_20, 'tmp_P_GAP_20', 'p_gap'),
)
for structures, archive_name, dataset_id in gap_conversion_jobs:
    converter.convert_atoms_list(
        atoms_list=structures,
        filename=archive_name,
        data_dir=tmp_dir,
        dataset_name=dataset_id,
    )


Converting 500 structures...
Using 2 workers for metadata extraction...



Processing batches: 100%|██████████| 5/5 [00:00<00:00, 26.38it/s]
Writing HDF5: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 500/500 [00:00<00:00, 1069975.51it/s]

Creating TAR archive at tmp_dataset/tmp_GST_GAP_22.tar...
[1;34m
Conversion complete! File saved as tmp_GST_GAP_22.tar[0m
Converting 500 structures...
Using 2 workers for metadata extraction...




Processing batches: 100%|██████████| 5/5 [00:00<00:00, 24.25it/s]
Writing HDF5: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 500/500 [00:00<00:00, 446582.62it/s]

Creating TAR archive at tmp_dataset/tmp_P_GAP_20.tar...
[1;34m
Conversion complete! File saved as tmp_P_GAP_20.tar[0m





In [None]:
from randatoms import available_datasets

# List the archives that were just written to the scratch directory
available_datasets(data_dir=tmp_dir)

[1;34mAvailable Datasets[0m
Dataset found : /content/tmp_dataset
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
  tmp_GST_GAP_22   |     500      |    (72.6, 6890.4)    |      (1, 64)       |    5.03   
   tmp_P_GAP_20    |     500      |    (61.9, 7681.5)    |      (2, 248)      |    8.51   


In [None]:
from randatoms.merger import DatasetMerger

# Combine the two converted archives into <tmp_dir>/custom_dataset.tar
merger = DatasetMerger(
    merge_name_list=['tmp_GST_GAP_22', 'tmp_P_GAP_20'],
    output_name='custom_dataset',
    data_dir=tmp_dir,
    )

# Print a summary of what the merge will produce
merger.merge_preview()

# Perform the merge and write the output archive
merger.merge()


[1;34mMerged Dataset Preview[0m
Datasets to merge: ['tmp_GST_GAP_22', 'tmp_P_GAP_20']
Total structures: 1,000
Molecular weight range: (61.9, 7681.5)
Num. of atoms range: (1, 248)

[1;34mCombined Elemental Composition[0m
P : 500 structures [=====     ] 50.0%
Ge: 209 structures [==        ] 20.9%
Sb: 206 structures [==        ] 20.6%
Te: 190 structures [==        ] 19.0%

[1;32mDatasets to merge[0m
  Dataset: [1mtmp_GST_GAP_22[0m
  - Structures: 500
  - Molecular weight range: (72.6, 6890.4)
  - Num. of atoms range: (1, 64)
-------------------------------------------------------
  Dataset: [1mtmp_P_GAP_20[0m
  - Structures: 500
  - Molecular weight range: (61.9, 7681.5)
  - Num. of atoms range: (2, 248)
Counting structures...
Merging 1000 structures from 2 datasets...


Merging structures: 100%|██████████| 1000/1000 [00:01<00:00, 743.83it/s]


Building merged element index...


Building index: 100%|██████████| 1000/1000 [00:00<00:00, 684449.09it/s]


Calculating merged statistics...
Saving merged data to TAR archive...
[1;34mMerge complete! Output saved as custom_dataset.tar[0m


In [None]:
from randatoms import available_datasets

# The scratch directory now also contains the merged `custom_dataset` archive
available_datasets(tmp_dir)

[1;34mAvailable Datasets[0m
Dataset found : /content/tmp_dataset
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
  custom_dataset   |    1,000     |    (61.9, 7681.5)    |      (1, 248)      |   13.12   
  tmp_GST_GAP_22   |     500      |    (72.6, 6890.4)    |      (1, 64)       |    5.03   
   tmp_P_GAP_20    |     500      |    (61.9, 7681.5)    |      (2, 248)      |    8.51   


In [None]:
from randatoms import set_default_dataset, add_dataset

# Move the dataset into the package's internal dataset directory so it can be
# loaded by name without passing `data_dir`.
# (Path fixed: the original had a doubled separator, `{tmp_dir}//...`.)
add_dataset(f'{tmp_dir}/tmp_GST_GAP_22.tar')

# Optional: move & rename a dataset so that it becomes the default dataset.
# This replaces the current `default` archive, so it is left commented out here.
# set_default_dataset(f'{tmp_dir}/custom_dataset.tar')

Successfully added dataset 'tmp_GST_GAP_22.tar'.
Location: /usr/local/lib/python3.11/dist-packages/randatoms/dataset/tmp_GST_GAP_22.tar
[1;34mAvailable Datasets[0m
Dataset found : /usr/local/lib/python3.11/dist-packages/randatoms/dataset
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
     default       |     878      |   (169.9, 6723.3)    |     (10, 616)      |    9.23   
  tmp_GST_GAP_22   |     500      |    (72.6, 6890.4)    |      (1, 64)       |    5.03   


In [None]:
# List the internal dataset directory again.
# NOTE(review): the saved output shows `default` with 1,000 structures, which matches `custom_dataset`;
# presumably it was recorded after running the commented-out `set_default_dataset` call above — confirm on a fresh run.
available_datasets()

[1;34mAvailable Datasets[0m
Dataset found : /usr/local/lib/python3.11/dist-packages/randatoms/dataset
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
     default       |    1,000     |    (61.9, 7681.5)    |      (1, 248)      |   13.12   
  tmp_GST_GAP_22   |     500      |    (72.6, 6890.4)    |      (1, 64)       |    5.03   


In [None]:
# Remove the archives and scratch directory created in the working directory by the cells above.
# Datasets already added to the package's internal directory are not touched by this command.
!rm -rf crystals.tar molecules.tar tmp_dataset merged_dataset.tar

In [None]:
# Build the internal default dataset (the cells below download, convert and merge its five source datasets)

In [None]:
# fairchem-core provides the `AseDBDataset` reader used below for the omol25 and omat24 subsets
!uv pip install fairchem-core -q

In [None]:
# Each section downloads one archive, unpacks it, then deletes the archive to save disk space.

# peptide conformer -> ./peptide_tmp
# NOTE(review): this download uses plain http; confirm whether an https mirror is available.
!mkdir -p peptide_tmp
!wget http://cuby4.molecular.cz/download_geometries/peptide_conformers.tar -O peptide_conformers.tar -q
!tar -xf peptide_conformers.tar -C ./peptide_tmp
!rm -rf peptide_conformers.tar


# omol25 subset (extracted into the current directory; read later from `neutral_val`)
!wget https://dl.fbaipublicfiles.com/opencatalystproject/data/omol/250514/neutral_val.tar.gz -O omol25sub.tar.gz -q
!tar -xzf omol25sub.tar.gz
!rm -rf omol25sub.tar.gz

# omat24 subset (no -O: wget keeps the remote file name; read later from `rattled-1000-subsampled`)
!wget https://dl.fbaipublicfiles.com/opencatalystproject/data/omat/241220/omat/val/rattled-1000-subsampled.tar.gz -q
!tar -xzf rattled-1000-subsampled.tar.gz
!rm -rf rattled-1000-subsampled.tar.gz

# odac subset (mof) -> ./pristine_tmp
!mkdir -p pristine_tmp
!wget https://github.com/facebookresearch/fairchem/raw/refs/heads/main/src/fairchem/data/odac/promising_mof/promising_mof_structures/pristine.zip -O pristine.zip -q
!unzip -q pristine.zip -d ./pristine_tmp
!rm -rf pristine.zip

# x23b (organic crystal) -> ./X23b_tmp
!mkdir -p X23b_tmp
!wget https://figshare.com/ndownloader/files/43162153 -O X23b.zip -q
!unzip -q X23b.zip -d ./X23b_tmp
!rm -rf X23b.zip

In [None]:
#@title tmp xyz reader
import numpy as np
from ase import Atoms

def parse_extended_xyz(filename):
    """
    Parse an XYZ-style file whose lattice vectors are stored as ``Tv`` rows
    and return it as a periodic ASE Atoms object.

    Expected layout: line 1 holds the atom count, line 2 is skipped (comment
    line), and every following row is either ``<Symbol> x y z`` or
    ``Tv x y z``. Rows with fewer than four whitespace-separated fields are
    ignored. Atom rows are accepted only while fewer than the declared number
    of atoms have been read, only when the first field starts with an
    uppercase letter, and only when all three coordinates parse as floats.

    Args:
        filename: Path to XYZ file

    Returns:
        ASE Atoms object with ``pbc=True``. If the file does not contain
        exactly three ``Tv`` rows, a cubic cell of edge
        ``max(|coordinate|) + 5`` is used instead (edge 15 when no atoms
        were read).

    Raises:
        IndexError: if the file is empty.
        ValueError: if the first line is not an integer or a ``Tv`` row has
            non-numeric components.
    """
    with open(filename, 'r') as f:
        lines = [line.strip() for line in f]

    # Number of atoms declared in the header
    natoms = int(lines[0])

    symbols = []
    positions = []
    cell_vectors = []

    # lines[1] is the comment line; data rows start at index 2
    for line in lines[2:]:
        parts = line.split()
        if len(parts) < 4:
            # blank line or a row too short to hold a label plus three numbers
            continue

        label = parts[0]
        if label == 'Tv':
            # Lattice (translation) vector
            cell_vectors.append([float(x) for x in parts[1:4]])
        elif len(symbols) < natoms and label[0].isupper():
            # Parse the coordinates BEFORE recording the symbol: a malformed row
            # must be skipped as a whole, otherwise `symbols` and `positions`
            # end up with different lengths and the bad row uses up an atom slot.
            try:
                xyz = [float(x) for x in parts[1:4]]
            except ValueError:
                continue
            symbols.append(label)
            positions.append(xyz)

    positions = np.array(positions)
    cell_vectors = np.array(cell_vectors)

    # Fall back to a cubic box when the file does not define a full 3x3 cell
    if len(cell_vectors) != 3:
        max_coord = np.max(np.abs(positions)) if len(positions) > 0 else 10
        cell_vectors = np.eye(3) * (max_coord + 5)

    # Create ASE Atoms object (always flagged as fully periodic)
    atoms = Atoms(symbols=symbols, positions=positions, cell=cell_vectors, pbc=True)

    return atoms

In [None]:
import glob
from ase.io import read
from tqdm import tqdm

# NOTE: glob returns paths in an arbitrary, filesystem-dependent order. The paths are
# sorted so that the order of the structures is the same on every run and machine
# (presumably this also fixes the indices they get in the converted dataset).

# x23b (organic crystals): one `cryst.xyz` per `*_cryst` directory, read with the custom parser
X23b_files = sorted(glob.glob('X23b_tmp/*_cryst/cryst.xyz'))
x23_atoms = []
for _path in tqdm(X23b_files):
    atoms = parse_extended_xyz(_path)
    atoms.center()
    x23_atoms.append(atoms)

# peptide conformers: plain XYZ files
peptide_files = sorted(glob.glob('peptide_tmp/*.xyz'))
peptide_atoms = []
for xyz in tqdm(peptide_files):
    atoms = read(xyz, format='xyz')
    peptide_atoms.append(atoms)

# odac (mof): CIF files
odac_files = sorted(glob.glob('pristine_tmp/*.cif'))
odac_atoms = []
for cif in tqdm(odac_files):
    atoms = read(cif, format='cif')
    odac_atoms.append(atoms)



In [None]:
from fairchem.core.datasets import AseDBDataset

# omol25: keep the first N_OMOL_KEEP structures tagged `geom_orca6`
# that occur among the first N_OMOL_SCAN entries of the validation split.
N_OMOL_SCAN = 2671
N_OMOL_KEEP = 300
dataset_path = "neutral_val"
dataset = AseDBDataset({"src": dataset_path})
geom_orca6_atoms = []
# Bound the scan with len(dataset) instead of len(dataset[:N]): slicing the dataset
# presumably builds every sliced entry just to count them, which is wasted work.
for idx in tqdm(range(min(N_OMOL_SCAN, len(dataset)))):
    if len(geom_orca6_atoms) >= N_OMOL_KEEP:
        # Enough structures collected; no need to read further entries
        break
    atoms = dataset.get_atoms(idx)
    if atoms.info['data_id'] == 'geom_orca6':
        atoms.center()
        geom_orca6_atoms.append(atoms)

# omat24: keep the first N_RATTLED_KEEP structures of the rattled subset
N_RATTLED_KEEP = 300
dataset_path = "rattled-1000-subsampled"
dataset = AseDBDataset({"src": dataset_path})
rattled_atoms = []
for idx in tqdm(range(min(N_RATTLED_KEEP, len(dataset)))):
    atoms = dataset.get_atoms(idx)
    atoms.center()
    rattled_atoms.append(atoms)

In [None]:
# Sanity check: number of structures collected from each source
len(peptide_atoms), len(geom_orca6_atoms), len(rattled_atoms), len(odac_atoms), len(x23_atoms)

(76, 300, 300, 135, 67)

In [None]:
from randatoms.converter import ASEtoHDF5Converter
import os

# Output directory for the per-source archives (note: rebinds `tmp_dir` used earlier in the notebook)
tmp_dir = 'default_dataset'
os.makedirs(tmp_dir, exist_ok=True)
converter = ASEtoHDF5Converter(n_workers=2)

# Each source is written to <tmp_dir>/<name>.tar, with <name> also used as its dataset identifier.
# The tuple order is the conversion order.
default_sources = (
    ('odac', odac_atoms),
    ('rattled', rattled_atoms),
    ('omol25', geom_orca6_atoms),
    ('peptide', peptide_atoms),
    ('x23b', x23_atoms),
)
for source_name, structures in default_sources:
    converter.convert_atoms_list(
        atoms_list=structures,
        filename=source_name,
        data_dir=tmp_dir,
        dataset_name=source_name,
    )

Converting 135 structures...
Using 2 workers for metadata extraction...



Processing batches: 100%|██████████| 2/2 [00:00<00:00, 13.86it/s]
Writing HDF5: 100%|██████████| 1/1 [00:00<00:00,  2.58it/s]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 135/135 [00:00<00:00, 127386.06it/s]

Creating TAR archive at default_dataset/odac.tar...
[1;34m
Conversion complete! File saved as odac.tar[0m
Converting 300 structures...
Using 2 workers for metadata extraction...




Processing batches: 100%|██████████| 3/3 [00:00<00:00, 15.68it/s]
Writing HDF5: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 300/300 [00:00<00:00, 274496.34it/s]

Creating TAR archive at default_dataset/rattled.tar...
[1;34m
Conversion complete! File saved as rattled.tar[0m
Converting 300 structures...
Using 2 workers for metadata extraction...




Processing batches: 100%|██████████| 3/3 [00:00<00:00, 10.84it/s]
Writing HDF5: 100%|██████████| 1/1 [00:00<00:00,  1.30it/s]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 300/300 [00:00<00:00, 133704.30it/s]


Creating TAR archive at default_dataset/omol25.tar...
[1;34m
Conversion complete! File saved as omol25.tar[0m
Converting 76 structures...


Extracting metadata: 100%|██████████| 76/76 [00:00<00:00, 2337.38it/s]
Writing HDF5: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 76/76 [00:00<00:00, 132049.34it/s]


Creating TAR archive at default_dataset/peptide.tar...
[1;34m
Conversion complete! File saved as peptide.tar[0m
Converting 67 structures...


Extracting metadata: 100%|██████████| 67/67 [00:00<00:00, 1279.57it/s]
Writing HDF5: 100%|██████████| 1/1 [00:00<00:00,  2.00it/s]


Saving metadata...
Building element index...


Processing elements: 100%|██████████| 67/67 [00:00<00:00, 201446.86it/s]

Creating TAR archive at default_dataset/x23b.tar...
[1;34m
Conversion complete! File saved as x23b.tar[0m





In [None]:
from randatoms import available_datasets

# List the five per-source archives written to the output directory
available_datasets(tmp_dir)

[1;34mAvailable Datasets[0m
Dataset found : /content/default_dataset
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
       odac        |     135      |   (298.7, 6723.3)    |     (20, 616)      |    2.59   
      omol25       |     300      |    (182.1, 618.4)    |      (15, 85)      |    2.43   
     peptide       |      76      |    (261.3, 318.3)    |      (34, 41)      |    0.62   
     rattled       |     300      |   (169.9, 4540.2)    |      (10, 80)      |    3.01   
       x23b        |      67      |   (216.2, 2882.6)    |     (24, 240)      |    0.89   


In [None]:
from randatoms.merger import DatasetMerger

# Combine the five per-source archives into <tmp_dir>/default_dataset.tar
merger = DatasetMerger(
    merge_name_list=['odac', 'rattled', 'omol25', 'peptide', 'x23b'],
    output_name='default_dataset',
    data_dir=tmp_dir,
    )

# Print a summary of what the merge will produce
merger.merge_preview()

# Perform the merge and write the output archive
merger.merge()


[1;34mMerged Dataset Preview[0m
Datasets to merge: ['odac', 'rattled', 'omol25', 'peptide', 'x23b']
Total structures: 878
Molecular weight range: (169.9, 6723.3)
Num. of atoms range: (10, 616)

[1;34mCombined Elemental Composition[0m
S : 199 structures [==        ] 22.7%
F : 161 structures [==        ] 18.3%
Cl: 121 structures [=         ] 13.8%
Br:  71 structures [=         ]  8.1%
Cu:  66 structures [=         ]  7.5%
Zn:  41 structures [          ]  4.7%
Cd:  29 structures [          ]  3.3%
Co:  28 structures [          ]  3.2%
P :  28 structures [          ]  3.2%
Al:  26 structures [          ]  3.0%
Ag:  25 structures [          ]  2.8%
Y :  25 structures [          ]  2.8%
Si:  22 structures [          ]  2.5%
Li:  22 structures [          ]  2.5%
Ga:  22 structures [          ]  2.5%
Pt:  20 structures [          ]  2.3%
Tc:  19 structures [          ]  2.2%
La:  18 structures [          ]  2.1%
Mg:  18 structures [          ]  2.1%
Sr:  18 structures [          ]  2.1%
T

Merging structures: 100%|██████████| 878/878 [00:01<00:00, 626.13it/s]


Building merged element index...


Building index: 100%|██████████| 878/878 [00:00<00:00, 402557.82it/s]


Calculating merged statistics...
Saving merged data to TAR archive...
[1;34mMerge complete! Output saved as default_dataset.tar[0m


In [None]:
from randatoms import set_default_dataset

# Install the merged archive as the package's default dataset; per the output below it is
# moved into the internal dataset directory as `default.tar`.
set_default_dataset(f'{tmp_dir}/default_dataset.tar')

Successfully moved and set 'default_dataset.tar' as the default dataset.
Location: /usr/local/lib/python3.11/dist-packages/randatoms/dataset/default.tar
[1;34mAvailable Datasets[0m
Dataset found : /usr/local/lib/python3.11/dist-packages/randatoms/dataset
   Dataset Name    |  Structures  |       MW Range       |    Atoms Range     | Size (MB) 
-----------------------------------------------------------------------------------------
     default       |     878      |   (169.9, 6723.3)    |     (10, 616)      |    9.23   
  tmp_GST_GAP_22   |     500      |    (72.6, 6890.4)    |      (1, 64)       |    5.03   
