- frc/pos/vel file:
    - no. atoms
    - timestep no, time, energy
    - atom id, x, y, z component of frc/pos/vel
    - repeat
- init.pdb

### Create xyz file with energies, positions and forces

In [1]:
from ase import io
import numpy as np
from ase.visualize import view
from ase import Atoms

In [2]:
# Load every frame (index=":") of the gen2 trajectory for each
# electronic-structure method; each name ends up bound to a list of frames.
input_MP2, input_revPBE_D3, input_revPBE0_D3 = (
    io.read(xyz_path, index=":")
    for xyz_path in (
        "caco3-water/MP2/gen2.xyz",
        "caco3-water/revPBE-D3/gen2.xyz",
        "caco3-water/revPBE0-D3/gen2.xyz",
    )
)

In [3]:
# Map a short method label (later used in output file names) to the
# corresponding list of frames.
inputs = {
    "MP2": input_MP2,
    "revPBE_D3": input_revPBE_D3,
    "revPBE0_D3": input_revPBE0_D3,
}

In [4]:
# Render the third frame (index 2) of the MP2 trajectory with ASE's "x3d" viewer.
view(input_MP2[2], viewer="x3d")

### Randomly sample structures (max 423)
- the dataset contains tens of thousands of structures, so it's better to sample these before merging the files (rather than merging and then sampling) to save computational effort
- use the same structures for all the ES methods to allow for a fair comparison
- D3 correction: Grimme's dispersion correction, which adds an empirical correction for van der Waals (vdW) interactions, which standard GGA functionals struggle to describe
- revPBE0 is a hybrid version of revPBE, meaning it includes a fraction of exact exchange (HF exchange) in addition to the standard exchange-correlation energy from revPBE

In [5]:
# Report how many configurations (frames) the revPBE-D3 trajectory holds.
print("No. configurations = ", len(input_revPBE_D3))

No. configurations =  423


In [5]:
import random

# Draw 400 distinct frame indices (sampling without replacement) from the
# index range of the MP2 trajectory. Despite the name, these index frames in
# the trajectory lists, not individual atoms.
sampled_atom_idx = random.sample(range(len(input_MP2)), k=400)

In [6]:
# Persist the sampled frame indices to 'sampled_atom_idx.npy' so the same
# selection can be reloaded later instead of being redrawn at random.
np.save('sampled_atom_idx.npy', sampled_atom_idx)

In [7]:
# Reload the saved frame indices; from here on sampled_atom_idx is a NumPy
# array rather than the Python list returned by random.sample.
sampled_atom_idx = np.load('sampled_atom_idx.npy')

In [8]:
# Pick the same sampled frames, in the same order, from each method's
# trajectory so the three datasets stay aligned frame by frame.
atoms_list_MP2, atoms_list_revPBE_D3, atoms_list_revPBE0_D3 = (
    [frames[idx] for idx in sampled_atom_idx]
    for frames in (input_MP2, input_revPBE_D3, input_revPBE0_D3)
)

In [9]:
# Copy each frame's potential energy and forces into method-specific keys
# (energy -> atoms.info, forces -> atoms.arrays). The sampled lists hold the
# same objects as the input trajectories, so the labels are visible through
# both.
for frames, energy_key, forces_key in (
    (atoms_list_MP2, "MP2_energy", "MP2_forces"),
    (atoms_list_revPBE_D3, "rPBED3_energy", "rPBED3_forces"),
    (atoms_list_revPBE0_D3, "rPBE0D3_energy", "rPBE0D3_forces"),
):
    for atoms in frames:
        atoms.info[energy_key] = atoms.get_potential_energy()
        atoms.arrays[forces_key] = atoms.get_forces()



In [10]:
# Write the 400-frame sampled dataset of each method to its own xyz file.
for xyz_path, frames in (
    ("training_val_sets/sampled_dataset_400_MP2.xyz", atoms_list_MP2),
    ("training_val_sets/sampled_dataset_400_revPBE_D3.xyz", atoms_list_revPBE_D3),
    ("training_val_sets/sampled_dataset_400_revPBE0_D3.xyz", atoms_list_revPBE0_D3),
):
    io.write(xyz_path, frames)

In [11]:
# Read the sampled datasets back from disk, all frames (index=":") per file.
sampled_dataset_400_MP2, sampled_dataset_400_revPBE_D3, sampled_dataset_400_revPBE0_D3 = (
    io.read(xyz_path, index=":")
    for xyz_path in (
        "training_val_sets/sampled_dataset_400_MP2.xyz",
        "training_val_sets/sampled_dataset_400_revPBE_D3.xyz",
        "training_val_sets/sampled_dataset_400_revPBE0_D3.xyz",
    )
)

In [13]:
# Sanity check: number of frames read back from the sampled MP2 file.
len(sampled_dataset_400_MP2)

400

### Validation set
- randomly select 100 structures
- then create training sets from the remaining structures

In [14]:
import random

# Hold out 100 of the sampled frame indices (drawn without replacement) as
# the validation split.
val_random_idx = random.sample(list(sampled_atom_idx), 100)

# Collect the held-out frames from each method's full trajectory, using the
# same indices for every method.
validation_set_100_MP2 = [input_MP2[idx] for idx in val_random_idx]
validation_set_100_revPBE_D3 = [input_revPBE_D3[idx] for idx in val_random_idx]
validation_set_100_revPBE0_D3 = [input_revPBE0_D3[idx] for idx in val_random_idx]

In [15]:
# Write the 100-frame validation set of each method to its own xyz file.
for xyz_path, frames in (
    ("training_val_sets/validation_set_100_MP2.xyz", validation_set_100_MP2),
    ("training_val_sets/validation_set_100_revPBE_D3.xyz", validation_set_100_revPBE_D3),
    ("training_val_sets/validation_set_100_revPBE0_D3.xyz", validation_set_100_revPBE0_D3),
):
    io.write(xyz_path, frames)

In [16]:
# Read the validation sets back from disk, all frames (index=":") per file.
validation_set_100_MP2, validation_set_100_revPBE_D3, validation_set_100_revPBE0_D3 = (
    io.read(xyz_path, index=":")
    for xyz_path in (
        "training_val_sets/validation_set_100_MP2.xyz",
        "training_val_sets/validation_set_100_revPBE_D3.xyz",
        "training_val_sets/validation_set_100_revPBE0_D3.xyz",
    )
)

### Training sets

In [17]:
# Number of frames in each training set to generate.
training_set_sizes = [10, 20, 30, 50, 100, 150, 200, 250, 300]

# Build the pool of training candidates once: the sampled frame indices that
# were not held out for validation, kept in their sampled order. A set gives
# O(1) membership tests instead of scanning the validation list per index.
validation_idx = set(val_random_idx)
training_pool_idx = [idx for idx in sampled_atom_idx if idx not in validation_idx]

# For every method, write one file per requested size. Each training set is
# the first `size` entries of the pool, so smaller sets are prefixes of the
# larger ones and every method uses the same frames. If the pool holds fewer
# than `size` indices, the slice simply yields the whole pool.
for key, frames in inputs.items():
    for size in training_set_sizes:
        training_set = [frames[idx] for idx in training_pool_idx[:size]]
        io.write(f"training_val_sets/training_set_{size}_{key}.xyz", training_set)
