# Process QChem AIMD scratch files

In [1]:
import numpy as np
from sgdml.utils import io
from sgdml.predict import GDMLPredict
from sgdml.utils import io, ui
import argparse
import os
import sys
from sgdml import __version__
# from ase.io import read

In [48]:
def read_QChem_AIMD_scratch(NucCarts,NucForces,View,Energy):
    n_atoms = None

    R, z, E, F = [], [], [], []
    for i, line in enumerate(NucCarts.split('\n')):
        if i ==0:
            print("Heading labels: ",line)
        else:
            carts = line.strip().split()
            if len(carts)<2:
                break
            time = carts[0]
            n_atoms = int((len(carts)-1)/3)
            R.append(list(map(float, carts[1::])))
    ndata = np.shape(R)[0]
    for i, line in enumerate(NucForces.split('\n')):
        if i ==0:
            print("Heading labels: ",line)
        else:
            forces = line.strip().split()
            if len(forces)<(n_atoms*3+1):
                break
            time = forces[0]
            n_atoms = int((len(forces)-1)/3)
            F.append(list(map(float, forces[1::])))
    for i, line in enumerate(Energy.split('\n')):
        if i ==0:
            print("Heading labels: ",line)
        else:
            en = line.strip().split()
            if len(en) < 2:
                break
            time = en[0]
            if i > 9997:
                print(en[2].isnumeric(),line, en, en[2])
            if i > ndata-1:
                break
            E.append(float(en[2])) # 4th column of TandV, 3rd of Energy
    for i, line in enumerate(View.split('\n')):
        if i == 0:
            number = int(line)
            if number != n_atoms:
                print(number,n_atoms," Do not agree on number of atoms!")
                break
        elif i==1:
            continue
        elif i>n_atoms+1:
            break # read first time step only
        else:
            z.append(io._z_str_to_z_dict[line[0]])  # From sgdml io to convert letter to atomic number, from sgdml.utils import io
    print(z)
    F = np.hstack(F)
    fshape = np.shape(F)
    ts = int(fshape[0]/(n_atoms*3))
    print(fshape)
    F = -F.reshape(-1, n_atoms, 3)*1185.8 # Remove negative sign to shift between gradient and force
    R = np.hstack(R)
    R = R[:fshape[0]] # Limit size of carts to that of forces
    R = R.reshape(-1,n_atoms, 3)
    E = np.array(E[:ts])*627.51 # Limit size of carts to that of forces
    z = np.array(z)
    print("Found", n_atoms," atoms and ",ts," time steps.")
    return (R, z, E, F)

In [49]:

path = 'data_h2o'
Force = path+'/NucForces'
with open(Force, 'r') as file:
    NucForces = file.read()
Carts = path+'/NucCarts'
with open(Carts, 'r') as file:
    NucCarts = file.read()
View = path+'/View.xyz'
with open(View, 'r') as file:
    View = file.read()
#Energy = path+'/TandV' # Change from EComponents which are constant
Energy = path+'/Energy' # Change from EComponents which are constant
with open(Energy, 'r') as file:
    Energy = file.read()
R, z, E, F = read_QChem_AIMD_scratch(NucCarts,NucForces,View,Energy)
name = "H2O"
filename = name+"_AIMD"
# Base variables contained in every model file.
base_vars = {
    'type': 'd',
    'code_version': __version__,
    'name': name,
    'theory': 'QChem',
    'R': R,
    'z': z,
    'F': F,
}

Heading labels:  # Time/fs  Nuclear cartestian coordinates (angstroms)
Heading labels:  # Time/fs  Nuclear cartesian forces (a.u.)
Heading labels:  # Time/fs   E(total) - E(prev)  E(total) - E(initial)
False 1209.079333    -2.959015e-07     1.961293e-06 ['1209.079333', '-2.959015e-07', '1.961293e-06'] 1.961293e-06
False 1209.200277    -3.050929e-07     1.656200e-06 ['1209.200277', '-3.050929e-07', '1.656200e-06'] 1.656200e-06
False 1209.321221    -3.091092e-07     1.347091e-06 ['1209.321221', '-3.091092e-07', '1.347091e-06'] 1.347091e-06
False # Step-to-Step energy fluctuations, |E(n) - E(n-1)| ['#', 'Step-to-Step', 'energy', 'fluctuations,', '|E(n)', '-', 'E(n-1)|'] energy
[1, 8, 1]
(90000,)
Found 3  atoms and  10000  time steps.


In [50]:
np.shape(R)

(10000, 3, 3)

In [51]:
np.shape(F)

(10000, 3, 3)

In [52]:
z

array([1, 8, 1])

In [53]:
np.shape(E)

(10000,)

In [54]:
base_vars['F_min'], base_vars['F_max'] = np.min(F.ravel()), np.max(F.ravel())
base_vars['F_mean'], base_vars['F_var'] = np.mean(F.ravel()), np.var(F.ravel())

print('Please provide a description of the length unit used in your input file, e.g. \'Ang\' or \'au\': ')
print('Note: This string will be stored in the dataset file and passed on to models files for later reference.')
#base_vars['r_unit'] = 'Ang'

print('Please provide a description of the energy unit used in your input file, e.g. \'kcal/mol\' or \'eV\': ')
print('Note: This string will be stored in the dataset file and passed on to models files for later reference.')
#base_vars['e_unit'] = 'kcal/mol'

if E is not None:
    base_vars['E'] = E
    base_vars['E_min'], base_vars['E_max'] = np.min(E), np.max(E)
    base_vars['E_mean'], base_vars['E_var'] = np.mean(E), np.var(E)
else:
    print(ui.color_str('[INFO]', bold=True) + ' No energy labels found in dataset.')
keys = ['z', 'R', 'E', 'F']
for k in keys:
    d = base_vars[k]
    print(np.shape(d))
    if type(d) is np.ndarray:
            print(k)
            d = d.ravel()
            print(np.shape(d))
dataset_file_name = filename + '.npz'
base_vars['md5'] = io.dataset_md5(base_vars)
np.savez_compressed(dataset_file_name, **base_vars)
print(ui.color_str('[DONE]', fore_color=ui.GREEN, bold=True))

Please provide a description of the length unit used in your input file, e.g. 'Ang' or 'au': 
Note: This string will be stored in the dataset file and passed on to models files for later reference.
Please provide a description of the energy unit used in your input file, e.g. 'kcal/mol' or 'eV': 
Note: This string will be stored in the dataset file and passed on to models files for later reference.
(3,)
z
(3,)
(10000, 3, 3)
R
(90000,)
(10000,)
E
(10000,)
(10000, 3, 3)
F
(90000,)
[DONE]


In [55]:
def dataset_md5(dataset):

    md5_hash = hashlib.md5()

    keys = ['z', 'R']
    if 'E' in dataset:
        keys.append('E')
    keys.append('F')

    # only include new extra keys in fingerprint for 'modern' dataset files
    # 'code_version' was included from 0.4.0.dev1
    # opt_keys = ['lattice', 'e_unit', 'E_min', 'E_max', 'E_mean', 'E_var', 'f_unit', 'F_min', 'F_max', 'F_mean', 'F_var']
    # for k in opt_keys:
    #    if k in dataset:
    #        keys.append(k)

    for k in keys:
        d = dataset[k]
        if type(d) is np.ndarray:
            d = d.ravel()
        md5_hash.update(hashlib.md5(d).digest())

    return md5_hash.hexdigest().encode('utf-8')

In [47]:
dataset_file_name = filename + '.xyz'
try:
    with open(dataset_file_name, 'w') as file:

        n = R.shape[0]
        for i, r in enumerate(R):

            e = np.squeeze(base_vars['E'][i]) if 'E' in base_vars else None
            f = base_vars['F'][i,:,:]
            ext_xyz_str = io.generate_xyz_str(r, z, e=e, f=f) + '\n'

            file.write(ext_xyz_str)

            progr = float(i) / (n - 1)
            ui.callback(i, n - 1, disp_str='Exporting %d data points...' % n)
            
except IOError:
    sys.exit("ERROR: Writing xyz file failed.")

print()

[100%] Exporting 10000 data points...

