- frc/pos/vel file:
    - no. atoms
    - timestep no, time, energy
    - atom id, x, y, z component of frc/pos/vel
    - repeat
- init.pdb

### Create xyz file with energies, positions and forces

In [2]:
from ase import io
import numpy as np
from ase.visualize import view

In [3]:
# Read all frames (index=":") from the .xyz file.
# NOTE(review): the name `input` shadows Python's builtin input(); the other
# cells in this notebook index into this variable, so it is left unchanged.
input = io.read("gra-wat-protonic-defects/input.xyz", index=":")

In [6]:
# Display the forces array attached to the first configuration.
input[0].get_forces()

array([[-2.09133547e+00,  1.23988888e+00, -1.40006862e+00],
       [ 2.34278940e-01, -7.50813600e-01,  7.22994300e-02],
       [-1.68150159e+00,  1.39996578e+00,  3.71267320e-01],
       [ 1.48656054e+00, -8.90578780e-01,  1.45262197e+00],
       [-9.37887080e-01,  3.33610945e+00,  1.27778695e+00],
       [ 2.24616731e+00,  1.14187842e+00,  6.82165140e-01],
       [-3.39807304e+00,  1.72078805e+00, -1.06392257e+00],
       [ 4.75345590e-01, -1.98638303e+00,  4.72928750e-01],
       [-9.75476600e-02,  3.44784960e-01,  1.78280310e-01],
       [ 1.27994667e+00, -7.24691190e-01,  9.08010860e-01],
       [ 1.38356214e+00, -1.21196670e+00, -4.28397240e-01],
       [ 4.43566750e-01, -1.05687774e+00, -1.06957900e-02],
       [ 9.70334400e-02,  2.96499640e-01,  7.49733740e-01],
       [-1.71991388e+00,  1.44989660e+00, -1.19546022e+00],
       [ 5.60639371e+00, -1.04217103e+00,  3.53475290e-01],
       [-1.38716168e+00, -3.08763802e+00, -1.07868070e+00],
       [-1.17550850e-01,  7.60429530e-01

In [7]:
# Visualise the first configuration with the x3d viewer.
view(input[0], viewer="x3d")

In [8]:
# Visualise the configuration at index 7 with the x3d viewer.
view(input[7], viewer="x3d")

### Randomly sample 200 structures - g-h2o
- the dataset contains tens of thousands of structures, so it's better to sample them before merging the files (rather than merging and then sampling) to save computational effort

In [9]:
# Report how many configurations were read from the input file.
print("No. configurations = ", len(input))

No. configurations =  5845


In [11]:
import random

# Draw 200 distinct indices into `input` at random (sampling without replacement).
# NOTE(review): despite the name, these index configurations, not atoms.
# NOTE(review): no seed is set, so a fresh draw differs between runs; the
# indices are saved to disk in a later cell so that a draw can be reused.
sampled_atom_idx = random.sample(range(len(input)), 200)

In [12]:
# Persist the sampled configuration indices to 'sampled_atom_idx.npy' so the
# same subset can be reloaded instead of re-drawn.
np.save('sampled_atom_idx.npy', sampled_atom_idx)

In [16]:
# Reload the previously saved indices; from here on sampled_atom_idx is a NumPy array.
sampled_atom_idx = np.load('sampled_atom_idx.npy')

In [17]:
# Collect the configurations selected by the sampled indices, in sampled order.
atoms_list = [input[i] for i in sampled_atom_idx]

In [18]:
# Write the sampled configurations to a single .xyz file.
io.write("sampled_dataset_200.xyz", atoms_list)

In [19]:
# Read all frames of the sampled dataset back from disk.
sampled_dataset_200 = io.read("sampled_dataset_200.xyz", index=':')

### Validation set
- randomly select 100 structures from the sampled data set
- then create training sets from the remaining structures

In [23]:
sampled_atom_idx

array([4966, 5286, 3686, 2642,  454, 3240, 4483,  389,  953,  219, 5760,
       1431, 3138, 3891, 3613, 3727, 4632,  646, 5548, 2840,  414, 3900,
       4219,  116, 2791, 3677, 1283, 5632, 4617,  608,  500,  489, 3109,
        400, 1440,   87, 1696, 4662, 1213, 2853,  569, 3128, 1142, 4605,
       1318, 5713, 5799, 2426, 5699, 1608, 1137, 4354, 5127, 1081, 1791,
       2201, 2157, 2303, 5707,  443, 1167, 1441, 5724, 2505,  404, 4596,
       1091,  266, 2019, 5751, 3623, 4209, 5522,  426,  392, 5272, 3217,
       1413, 3378, 4569, 2678, 1458, 3975, 2086,  845, 5141, 3314, 2103,
       5682, 4004, 2940,  399, 1820, 1602, 5324, 2787, 1119, 3976, 1592,
       1967,  711, 2799, 5658, 3281,  293, 4706, 3052,  587, 4075, 3505,
       3500, 4297, 4092, 3705, 1496, 2706, 2421, 5055, 5258, 4904, 3198,
       4233, 3444, 1063, 1155, 1429, 3863, 2770,  621, 1988, 3003, 5087,
        900, 1124, 3034, 5796, 5147, 1019, 2718, 1484, 3649,  479, 5838,
       2795,  841,  497, 4531, 3872,  492,  537, 58

In [25]:
# Pick 100 of the sampled configuration indices to form the validation set.
# NOTE(review): no random seed is set and these indices are not written to
# disk, so this split cannot be reproduced in a later session.
val_random_idx = random.sample(list(sampled_atom_idx), 100)

# Gather the configurations that belong to the chosen validation indices.
validation_set_100 = [input[idx] for idx in val_random_idx]

In [26]:
# Write the 100 validation configurations to a single .xyz file.
io.write("validation_set_100.xyz", validation_set_100)

In [27]:
# Read all frames of the validation set back from disk.
validation_set_100 = io.read("validation_set_100.xyz", index=':')

### Training sets

In [30]:
# Build nested training sets of 10, 20, 30, 50 and 100 structures from the
# sampled configurations that were NOT drawn for validation.
#
# The candidates are filtered once, in the order given by sampled_atom_idx,
# and each training set is the leading slice of that pool. Every smaller set
# is therefore a prefix of the larger ones. If fewer candidates remain than
# requested, the slice simply contains all of them.

# Set membership is O(1); testing against the plain list rescans it each time.
validation_idx = set(val_random_idx)
training_pool = [input[i] for i in sampled_atom_idx if i not in validation_idx]

# 10
training_set_10 = training_pool[:10]

# 20
training_set_20 = training_pool[:20]

# 30
training_set_30 = training_pool[:30]

# 50
training_set_50 = training_pool[:50]

# 100
training_set_100 = training_pool[:100]

In [31]:
# Write each training set to its own .xyz file, smallest first.
training_files = {
    "train_10.xyz": training_set_10,
    "train_20.xyz": training_set_20,
    "train_30.xyz": training_set_30,
    "train_50.xyz": training_set_50,
    "train_100.xyz": training_set_100,
}
for filename, frames in training_files.items():
    io.write(filename, frames)