- frc/pos/vel file:
    - no. atoms
    - timestep no, time, energy
    - atom id, x, y, z component of frc/pos/vel
    - repeat
- init.pdb

### Create xyz file with energies, positions and forces

In [None]:
from ase import io
import numpy as np

Test on one example

In [84]:
# Load only the first frame (index '0') of the force and position
# trajectories; both files share the same frame ordering.
frc1 = io.read('c-h2o/m12_n12_lange_aggro-frc-1.xyz', index='0')
pos1 = io.read('c-h2o/m12_n12_lange_aggro-pos-1.xyz', index='0')

In [85]:
# Read the frame at index 1 of the force trajectory (the previous cell read index '0').
frc2 = io.read('c-h2o/m12_n12_lange_aggro-frc-1.xyz', index=1)

In [None]:
# Display the Atoms object read from the position file.
pos1

Atoms(symbols='C288H130O65', pbc=False)

In [None]:
# Show the per-frame metadata dictionary; the output below lists the
# step index ('i'), the time and the energy ('E') of this frame.
print(pos1.info)

{'i': 2801, 'time': 2801.0, 'E': -2760.4892690757}


In [None]:
# Assign the simulation box: a diagonal cell with edge lengths
# 26.341, 26.341 and 14.819 along x, y and z.
pos1.set_cell(np.diag([26.341, 26.341, 14.819]))

In [None]:
# The frc file stores the x, y, z force components in the xyz coordinate
# columns, so they are retrieved through get_positions().
frc1.get_positions()

array([[-0.15785514,  0.80445088, -0.0683274 ],
       [ 0.10425984,  0.42702012,  0.80572148],
       [ 0.10465208, -1.36328091,  1.68661655],
       ...,
       [-0.11764501, -0.58360758, -1.53331323],
       [ 0.2573427 ,  0.77072382,  0.92609915],
       [ 0.18674235, -0.07566125,  0.60699077]])

In [None]:
# Attach the forces to pos1 as a per-atom array named 'forces'. The values
# come from frc1, whose xyz columns hold the force components.
# (The original referenced `frc`, which is not defined at this point in the
# notebook; frc1 is the frame loaded alongside pos1.)
pos1.set_array('forces', frc1.get_positions())

In [None]:
# Confirm that the 'forces' array is now stored on pos1.
pos1.get_array('forces')

array([[-0.15785514,  0.80445088, -0.0683274 ],
       [ 0.10425984,  0.42702012,  0.80572148],
       [ 0.10465208, -1.36328091,  1.68661655],
       ...,
       [-0.11764501, -0.58360758, -1.53331323],
       [ 0.2573427 ,  0.77072382,  0.92609915],
       [ 0.18674235, -0.07566125,  0.60699077]])

In [None]:
# Enable periodic boundary conditions along all three cell directions.
pos1.set_pbc(True)

In [None]:
# Write the single test structure (with cell, PBC and forces) to disk.
pos1.write("test.xyz")

In [75]:
# Run the whole pipeline in one cell for the frame at index 1:
# read forces and positions, attach the forces, then set cell and PBC.
frame = str(1)
frc1 = io.read('c-h2o/m12_n12_lange_aggro-frc-1.xyz', index=frame)
pos1 = io.read('c-h2o/m12_n12_lange_aggro-pos-1.xyz', index=frame)
pos1.set_array('forces', frc1.get_positions())
pos1.set_cell(np.diag([26.341, 26.341, 14.819]))
pos1.set_pbc(True)

In [82]:
# Force components of the frame at index 1, read from the xyz columns of the frc file.
frc1.get_positions()

array([[-0.23923343,  0.9676642 , -0.19535211],
       [ 0.10417801,  0.37953577,  0.73581644],
       [ 0.23021526, -1.37329947,  1.95345345],
       ...,
       [-0.15318008, -0.92955321, -0.88518534],
       [ 0.25901186,  0.81599329,  0.81476721],
       [ 0.22317319,  0.30660271,  0.05873493]])

In [77]:
# Inspect pos1 now that the cell, PBC and forces array have been set.
pos1

Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], forces=...)

In [76]:
# Check that the stored 'forces' array matches the frc1 values shown above.
pos1.get_array('forces')

array([[-0.23923343,  0.9676642 , -0.19535211],
       [ 0.10417801,  0.37953577,  0.73581644],
       [ 0.23021526, -1.37329947,  1.95345345],
       ...,
       [-0.15318008, -0.92955321, -0.88518534],
       [ 0.25901186,  0.81599329,  0.81476721],
       [ 0.22317319,  0.30660271,  0.05873493]])

### Randomly sample 200 structures - g-h2o
- The dataset contains tens of thousands of structures, so it's better to sample frames first and merge only those, rather than merging everything and then sampling; this saves computational effort.

In [55]:
# Number of configurations in the position file. Each frame takes 483 atom
# lines (C288 + H130 + O65) plus 2 header lines (atom count and comment).
# NOTE(review): 59643845 is presumably the file's total line count - confirm.
total_lines = 59643845
lines_per_frame = 483 + 2
len_pos_xyz = total_lines / lines_per_frame
print("No. configurations = ", len_pos_xyz)

No. configurations =  122977.0


In [62]:
import random

# Draw 200 distinct frame indices from all configurations, without
# replacement. Despite its name, sampled_atom_idx indexes frames, not atoms.
n_frames = int(len_pos_xyz)
sampled_atom_idx = random.sample(range(n_frames), 200)

In [130]:
# Save the sampled indices to disk so the same random subset can be reloaded.
# NOTE: despite the name, these index frames (configurations) of the
# trajectory, not atoms.
np.save('sampled_atom_idx.npy', sampled_atom_idx)

In [131]:
# Reload the saved frame indices.
# NOTE: np.load returns a NumPy array, so sampled_atom_idx is no longer a
# plain Python list after this cell.
sampled_atom_idx = np.load('sampled_atom_idx.npy')

In [63]:
# Assemble one Atoms object per sampled frame: positions from the pos file,
# forces from the xyz columns of the frc file, plus the cell and PBC.
pos_file = 'c-h2o/m12_n12_lange_aggro-pos-1.xyz'
frc_file = 'c-h2o/m12_n12_lange_aggro-frc-1.xyz'
box = np.diag([26.341, 26.341, 14.819])  # diagonal simulation cell

atoms_list = []
for i in sampled_atom_idx:
    frame = str(i)  # the frame index is passed to io.read as a string
    frc = io.read(frc_file, index=frame)
    pos = io.read(pos_file, index=frame)
    pos.set_array('forces', frc.get_positions())
    pos.set_cell(box)
    pos.set_pbc(True)
    atoms_list.append(pos)

In [65]:
# Write all sampled frames into a single multi-frame xyz file.
io.write("sampled_dataset_200.xyz", atoms_list)

In [66]:
# Read every frame back (index=':') as a list of Atoms objects.
sampled_dataset_200 = io.read("sampled_dataset_200.xyz", index=':')

In [67]:
# Display the reloaded list of structures.
sampled_dataset_200

[Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 Atoms(symbols='C288H130O65', pbc=True, cell=[26.341, 26.341, 14.819], calculator=SinglePointCalculator(...)),
 

In [123]:
# Forces of the first reloaded structure. After the round trip through the
# xyz file they are available via get_forces(): the reloaded Atoms carry a
# SinglePointCalculator, as the listing above shows.
sampled_dataset_200[0].get_forces()

array([[ 0.42909584, -2.72894669, -1.03472715],
       [-1.02263867,  0.75467332,  0.05177561],
       [-1.30053213,  1.10356744,  1.84503179],
       ...,
       [-0.75285653, -0.09111667,  0.20564839],
       [ 0.48558501, -0.45733806, -0.44822117],
       [-0.10203428, -0.16180811,  0.19245631]])

### Validation set
- randomly select 100 structures from the sampled data set
- then create training sets from the remaining structures

In [None]:
# Choose 100 of the sampled structures for validation.
#
# The draw is over positions within sampled_dataset_200 (0 .. len-1), not over
# the values in sampled_atom_idx. Those values are frame numbers in the full
# trajectory and are far larger than the length of the sampled list, so using
# them as list indices raises IndexError. The training-set cells also treat
# val_random_idx as positions in sampled_dataset_200.
val_random_idx = random.sample(range(len(sampled_dataset_200)), 100)

validation_set_100 = [sampled_dataset_200[i] for i in val_random_idx]

In [None]:
# Save the validation structures to their own xyz file.
io.write("validation_set_100.xyz", validation_set_100)

In [126]:
# Reload all frames of the validation file as a list of Atoms objects.
validation_set_100 = io.read("validation_set_100.xyz", index=':')

Training sets

In [None]:
# Training pool: every sampled structure whose position in
# sampled_dataset_200 was not drawn for the validation set.
# A set makes each membership test O(1).
val_positions = set(val_random_idx)
training_pool = [
    atoms
    for i, atoms in enumerate(sampled_dataset_200)
    if i not in val_positions
]

# Nested training sets of increasing size, each taking the first N structures
# of the pool. The pool order follows the random frame sampling, so each
# prefix is itself a random subset. The original loops were identical and put
# the whole pool into every set regardless of the size in its name.
# 10
training_set_10 = training_pool[:10]

# 20
training_set_20 = training_pool[:20]

# 30
training_set_30 = training_pool[:30]

# 50
training_set_50 = training_pool[:50]

# 100
training_set_100 = training_pool[:100]

In [139]:
# Write each training set to its own xyz file, in order of increasing size.
for filename, structures in (
    ("train_10.xyz", training_set_10),
    ("train_20.xyz", training_set_20),
    ("train_30.xyz", training_set_30),
    ("train_50.xyz", training_set_50),
    ("train_100.xyz", training_set_100),
):
    io.write(filename, structures)