In [86]:
%matplotlib inline
from matplotlib import pylab as plt

import os, sys
from ase.io import read
sys.path.insert(0,"../build/")

import sys
import time, timeit
import rascal
import json

import ase
from ase.io import read, write
from ase.build import make_supercell
from ase.visualize import view
import numpy as np
import sys

import json

from rascal.representations import SphericalInvariants, SphericalInvariantsKspace
from rascal.representations import SphericalExpansion, SphericalExpansionKspace
from rascal.models import Kernel, train_gap_model
from rascal.models.sparse_points import SparsePoints
#from rascal.models.IP_ase_interface import ASEMLCalculator
from rascal.neighbourlist import AtomsList
from rascal.utils import from_dict, to_dict, CURFilter, dump_obj, load_obj, get_score, print_score, FPSFilter

In [87]:
import urllib.request
# a collection of distorted ethanol molecules from the ANI-1 dataset 
# (see https://github.com/isayev/ANI1_dataset) with energies and forces computed using DFTB+ 
# (see https://www.dftbplus.org/)
url = 'https://raw.githubusercontent.com/cosmo-epfl/librascal-example-data/833b4336a7daf471e16993158322b3ea807b9d3f/inputs/molecule_conformers_dftb.xyz'
# Download the file from `url`, save it in a temporary directory and get the
# path to it (e.g. '/tmp/tmpb48zma.txt') in the `structures_fn` variable:
structures_fn, headers = urllib.request.urlretrieve(url)
structures_fn

'/tmp/tmpygwliu9m'

# Utility functions

In [88]:
def extract_ref(frames,info_key='dft_formation_energy_per_atom_in_eV',array_key='zeros'):
    y,f = [], []
    for frame in frames:
        y.append(frame.info[info_key])
        if array_key is None:
            pass
        elif array_key == 'zeros':
            f.append(np.zeros(frame.get_positions().shape))
        else:
            f.append(frame.get_array(array_key))
    y= np.array(y)
    try:
        f = np.concatenate(f)
    except:
        pass
    return y,f


# Build a Force Field

In [89]:
# Total number of structure to load
#N = 
# Number of structure to train the model with
#structures_fn = 'water_monomers_1k_with_gradients.xyz'


# Isolated atom contributions


use_structures = 'ethanol'
#use_structures = 'NO_molecules'

if use_structures == 'ethanol':
    N = 100
    frames = read(structures_fn,':{}'.format(N))
    
    energy_key = 'dftb_energy_eV'
    forces_key = 'dftb_forces_eV_per_Ang'
    self_contributions = {
        1: -6.492647589968434,
        6: -38.054950840332474,
        8: -83.97955098636527,
    }
elif use_structures == 'NO_molecules':
    frames = read('mlip_trainingset_ON_10angs.xyz', ':')
    N = len(frames)
    
    energy_key = 'energy'
    forces_key = 'forces'
    self_contributions = {7: 0., 8:0.}


# load the structures
frames = read(structures_fn,':{}'.format(N))
f = int(0.8*N)

global_species = []
for frame in frames:
    global_species.extend(frame.get_atomic_numbers())
global_species = np.unique(global_species)

# split the structures in 2 sets
ids = list(range(N))
np.random.seed(10)
np.random.shuffle(ids)

train_ids = ids[:f]
test_ids = ids[f:]

frames_train = [frames[ii] for ii in ids[:f]]
frames_test = [frames[ii] for ii in ids[f:]]

In [90]:
frames[0]

Atoms(symbols='OC3H6', pbc=False, cell=[6.0, 6.0, 6.0], dftb_forces_eV_per_Ang=..., forces=..., calculator=SinglePointCalculator(...))

In [76]:
# split the dataset in a train and a test set
#y_train, f_train = extract_ref(frames_train,'dftb_energy_eV','dftb_forces_eV_per_Ang')
#y_test, f_test = extract_ref(frames_test,'dftb_energy_eV','dftb_forces_eV_per_Ang')

y_train, f_train = extract_ref(frames_train, energy_key, forces_key)
y_test, f_test = extract_ref(frames_test, energy_key, forces_key)


In [77]:
# Define the parameters of the spherical expansion
# Note that we choose a very large smearing parameter gaussian_sigma_constant = 1.5
# This is because the computational cost of the main loop of LODE scales cubically with 1/sigma.
# Thus, it would be recommended to stay in the range sigma >= 1.0.
# For the current value, the runtimes of both the real and reciprocal space versions should be similar.
hypers = {'interaction_cutoff': 4.0,
          'cutoff_smooth_width': 0.0, # does not affect LODE but is needed for SOAP
          'max_radial': 6,
          'max_angular': 4,
          'gaussian_sigma_type': "Constant",
          'gaussian_sigma_constant': 1.5, # this is quite a large value! See description above
          'compute_gradients': True,
         }

In [78]:
# Compute SOAP features and measure time
start = timeit.default_timer()

calculator_SOAP = SphericalInvariants(**hypers)
managers_SOAP = calculator_SOAP.transform(frames)

stop = timeit.default_timer()
print(f'Duration of real space version = {stop - start}s')

Duration of real space version = 0.19215778727084398s


In [79]:
# Compute LODE features and measure time
start = timeit.default_timer()

calculator_LODE = SphericalInvariantsKspace(**hypers)
managers_LODE = calculator_LODE.transform(frames)

stop = timeit.default_timer()
print(f'Duration of k-space version = {stop - start}s')

Duration of k-space version = 4.516579265706241s


In [80]:
# Compute real space spherical expansion coefficients
start = timeit.default_timer()

calculator_SEXP = SphericalExpansion(**hypers)
managers_SEXP = calculator_SEXP.transform(frames)

stop = timeit.default_timer()
print(f'Duration of real space version = {stop - start}s')

Duration of real space version = 0.21696776058524847s


In [81]:
# Compute reciprocal space expansion coefficients
start = timeit.default_timer()

calculator_KEXP = SphericalExpansionKspace(**hypers)
managers_KEXP = calculator_KEXP.transform(frames)

stop = timeit.default_timer()
print(f'Duration of k-space version = {stop - start}s')

Duration of k-space version = 4.541865400969982s


In [61]:
# Choose the manager and calculator you want to use for the rest of this notebook
managers = managers_LODE
calculator = calculator_LODE

#managers = managers_SOAP
#calculator = calculator_SOAP

In [82]:
features_LODE = managers_LODE.get_features(calculator_LODE)
features_SOAP = managers_SOAP.get_features(calculator_SOAP)

In [84]:
# Compare the features obtained from both methods:
assert (features_LODE.shape == features_SOAP.shape)
np.mean(np.abs(features_LODE - features_SOAP))

0.012543581870530687

In [38]:
# select the sparse points for the sparse kernel method with CUR on the whole training set
num_pseudo_per_element = min(50, N//2)
n_pseudo = {}
for species in global_species:
    n_pseudo[species] = num_pseudo_per_element
#n_pseudo = {1:50, 6:50, 8:50}
compressor = CURFilter(calculator, n_pseudo, act_on='sample per species')
X_pseudo = compressor.select_and_filter(managers)

In [39]:
# set up the sparse kernel and compute the representation for the train set
zeta = 2
kernel = Kernel(calculator, name='GAP', zeta=zeta, target_type='Structure', kernel_type='Sparse')

managers_train = managers.get_subset(train_ids)

In [40]:
# build the KNM matrix for training with forces and energies (see train_gap_model for more details)
KNM = kernel(managers_train, X_pseudo)
KNM_down = kernel(managers_train, X_pseudo, grad=(True, False))
KNM = np.vstack([KNM, KNM_down])
del KNM_down
KNM_down = []

In [41]:
# train a GAP model 
model = train_gap_model(kernel, managers_train, KNM, X_pseudo, y_train, self_contributions, 
                        grad_train=-f_train, lambdas=[1e-12, 1e-8], jitter=1e-13)

AttributeError: 'rascal.lib._rascal.neighbour_list.Adaptor.CenterCo' object has no attribute 'get_atomic_numbers'

In [None]:
# make predictions on the training and test sets
managers_test = managers.get_subset(test_ids)

y_pred_train = model.predict(managers_train)
f_pred_train = model.predict_forces(managers_train)

y_pred_test = model.predict(managers_test)
f_pred_test = model.predict_forces(managers_test)

In [None]:
# basic assessement of the quality of the trained model
print_score(y_pred_test, y_test)
print_score(f_pred_test.flatten(), f_test.flatten())
plt.plot(y_test, y_pred_test, 'o')
plt.title("correlation plot")
plt.xlabel("predicted energies [eV]")
plt.ylabel("reference energies [eV]")

In [None]:
# More detailed plots of target vs predicted quantities for both the training and test sets
fig, ax = plt.subplots(figsize=(10,6), ncols = 2, nrows = 2, sharex='row', sharey='row')
ax[0,0].scatter(y_train, y_pred_train)
ax[1,0].scatter(f_train.flatten(), f_pred_train.flatten())
ax[0,1].scatter(y_test, y_pred_test)
ax[1,1].scatter(f_test.flatten(), f_pred_test.flatten())
ax[0,0].set_title('Training set')
ax[0,1].set_title('Test set')
ax[0,0].set_ylabel('Energies eV')
ax[1,0].set_ylabel('Forces eV/A')

In [None]:
# save the model to a file in json format for future use
dump_obj('/tmp/mymodel.json', model)

# Test the model on dimer configurations

In [None]:
# you can load the previously trained model
model = load_obj('/tmp/mymodel.json')

In [None]:
#creating atoms pairs, H is 1, C is 6 and O is 8, the first atom is the origin one
pairs = [[1,1],[6,6],[8,8],[6,1],[8,1],[6,8]]
ndists = 40 #number of distances to look at
dists = np.linspace(0.1,4.9,ndists) #distance list, can be changed 
print('Number of configurations: ', len(pairs)*len(dists))

In [None]:
frames = []
for p in pairs:
    for d in dists:
        #using ase we can create the cell and place the atoms
        atoms = ase.Atoms(numbers=p,pbc=True,cell=np.eye(3)*10,positions=[[0,0,0],[d,0,0]])
        frames.append(atoms)
X = calculator.transform(frames)
e_pairs = model.predict(X)
e_pairs -= e_pairs.mean()

In [None]:
for pair_to_plot in pairs:
    i = pairs.index(pair_to_plot)

    fig, ax = plt.subplots()
    ax.plot(dists,e_pairs[i*ndists:(i+1)*ndists],'--xb',linewidth=1)
    # ax.plot(dists,f_pairs[i*ndists:(i+1)*ndists],'--xr',linewidth=1)
    ax.set_xlabel('Distance (A)')
    ax.set_ylabel('Predicted energy (eV)')
    ax.set_title('Bond energy between {} and {}'.format(*pair_to_plot))
    plt.tight_layout()
    plt.show()