To install rascal:
(NOTE: See the top-level README for the most up-to-date installation instructions.)
+ mkdir ../build 
+ cd build
+ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON ..
+ make -j 4
+ make install

In [1]:
!export OMP_NUM_THREADS=1
!export NUMBA_THREADING_LAYER=1
from mkl import set_num_threads
set_num_threads(1)

In [2]:
%matplotlib notebook
from matplotlib import pylab as plt

import os, sys
from ase.io import read
sys.path.insert(0,"../build/")

import sys
import time
import rascal
import json

import ase
from ase.io import read, write
from ase.build import make_supercell
from ase.visualize import view
import numpy as np
import sys

import json

from rascal.representations import SphericalInvariants as SOAP
from rascal.models import Kernel
from rascal.utils import fps

In [3]:
frames = read('../reference_data/inputs/small_molecules-1000.xyz',':100')

# SOAP: Power spectrum

In [4]:
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=3.5, 
              max_radial=6, 
              max_angular=4, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              normalize=True,
              )
soap = SOAP(**hypers)
zeta=2
kernel1 = Kernel(soap, zeta=zeta, target_type='Atom')

In [5]:
representation = soap.transform(frames[:10])
X = representation.get_features(soap)
X.shape

(174, 1800)

In [6]:
from itertools import product
species = []
for ii in range(len(representation)):
    manager = representation[ii]
    for center in manager:
        sp = center.atom_type
        species.append(sp)
        
u_species = np.unique(species)
sp_pairs = []
for sp1 in u_species:
    for sp2 in u_species:
        if sp1 <= sp2:
            sp_pairs.append((sp1,sp2))
feat_idx2coeff_idx = {}
i_feat = 0
for sp_pair,n1,n2,l in product(sp_pairs,range(soap.hypers['max_radial']),
                       range(soap.hypers['max_radial']),range(soap.hypers['max_angular'])):
    feat_idx2coeff_idx[i_feat] = dict(a=sp_pair[0],b=sp_pair[1],n1=n1,n2=n2,l=l)
    i_feat += 1
u_species, sp_pairs,feat_idx2coeff_idx

(array([1, 6, 7, 8]),
 [(1, 1),
  (1, 6),
  (1, 7),
  (1, 8),
  (6, 6),
  (6, 7),
  (6, 8),
  (7, 7),
  (7, 8),
  (8, 8)],
 {0: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 0},
  1: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 1},
  2: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 2},
  3: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 3},
  4: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 0},
  5: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 1},
  6: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 2},
  7: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 3},
  8: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 0},
  9: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 1},
  10: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 2},
  11: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 3},
  12: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 0},
  13: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 1},
  14: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 2},
  15: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 3},
  16: {'a': 1, 'b': 1, 'n1': 0, 'n2': 4, 'l': 0},
  17: {'a': 1, 'b': 1, 'n1': 0, 'n2':

In [7]:
def get_power_spectrum_index_mapping(soap, managers):
    n_max = soap.hypers['max_radial']
    l_max = soap.hypers['max_angular']+1
    species = []
    for ii in range(len(managers)):
        manager = managers[ii]
        if isinstance(manager, ase.Atoms):
            species.extend(manager.get_atomic_numbers())
        else:
            for center in manager:
                sp = center.atom_type
                species.append(sp)

    u_species = np.unique(species)
    sp_pairs = []
    for sp1 in u_species:
        for sp2 in u_species:
            if sp1 <= sp2:
                sp_pairs.append((sp1,sp2))
    feat_idx2coeff_idx = {}    
    i_feat = 0
    for sp_pair,n1,n2,l in product(sp_pairs, range(n_max), range(n_max), range(l_max)):
        feat_idx2coeff_idx[i_feat] = dict(a=sp_pair[0],b=sp_pair[1],n1=n1,n2=n2,l=l)
        i_feat += 1
    return feat_idx2coeff_idx

get_power_spectrum_index_mapping(soap, representation)

{0: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 0},
 1: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 1},
 2: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 2},
 3: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 3},
 4: {'a': 1, 'b': 1, 'n1': 0, 'n2': 0, 'l': 4},
 5: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 0},
 6: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 1},
 7: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 2},
 8: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 3},
 9: {'a': 1, 'b': 1, 'n1': 0, 'n2': 1, 'l': 4},
 10: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 0},
 11: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 1},
 12: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 2},
 13: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 3},
 14: {'a': 1, 'b': 1, 'n1': 0, 'n2': 2, 'l': 4},
 15: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 0},
 16: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 1},
 17: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 2},
 18: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 3},
 19: {'a': 1, 'b': 1, 'n1': 0, 'n2': 3, 'l': 4},
 20: {'a': 1, 'b': 1, 'n1': 0,

In [24]:
def get_index_mappings_sample_per_species(managers, sps):
    # get various info from the structures about the center atom species and indexing
    types = []
    strides_by_sp = {sp: [0] for sp in sps}
    global_counter = {sp: 0 for sp in sps}
    indices_by_sp = {sp: [] for sp in sps}
    map_by_manager = [{} for ii in range(len(managers))]
    for i_man in range(len(managers)):
        man = managers[i_man]
        counter = {sp: 0 for sp in sps}
        for i_at, at in enumerate(man):
            types.append(at.atom_type)
            if at.atom_type in sps:
                map_by_manager[i_man][global_counter[at.atom_type]] = i_at
                counter[at.atom_type] += 1
                global_counter[at.atom_type] += 1
            else:
                raise ValueError('Atom type {} has not been specified in fselect: {}'.format(
                    at.atom_type, self.Nselect))
        for sp in sps:
            strides_by_sp[sp].append(counter[sp])

    for sp in sps:
        strides_by_sp[sp] = np.cumsum(strides_by_sp[sp])

    for ii, sp in enumerate(types):
        indices_by_sp[sp].append(ii)

    return strides_by_sp, global_counter, map_by_manager, indices_by_sp

def convert_selected_global_index2rascal_sample_per_species(managers, selected_ids_by_sp, strides_by_sp, map_by_manager, sps):
    # convert selected center indexing into the rascal format
    selected_ids = [[] for ii in range(len(managers))]
    i_manager = {sp: 0 for sp in sps}
    for sp in sps:
        for idx in selected_ids_by_sp[sp]:
            carry_on = True
            while carry_on:
                if idx >= strides_by_sp[sp][i_manager[sp]] and idx < strides_by_sp[sp][i_manager[sp] + 1]:
                    selected_ids[i_manager[sp]].append(
                        map_by_manager[i_manager[sp]][idx])
                    carry_on = False
                else:
                    i_manager[sp] += 1
    for ii in range(len(selected_ids)):
        selected_ids[ii] = list(np.sort(selected_ids[ii]))
    return selected_ids


class FPSFilter(object):
    """Farther Point Sampling (FPS) to select samples or features in a given feature matrix. 
    Wrapper around the fps function for convenience.
    Parameters
    ----------
    representation : Calculator
        Representation calculator associated with the kernel
    Nselect: int
        number of points to select. if act_on='sample per specie' then it should
        be a dictionary mapping atom type to the number of samples, e.g.
        Nselect = {1:200,6:100,8:50}.
    act_on: string
        Select how to apply the selection. Can be either of 'sample',
        'sample per species','feature'.
        
    is_deterministic: bool
        flag to switch between selction criteria
    seed: int
        if is_deterministic==False, seed for the random selection
    """

    def __init__(self, representation, Nselect, act_on='sample per specie', starting_index=0):
        super(FPSFilter, self).__init__()
        self._representation = representation
        self.Nselect = Nselect
        self.starting_index = starting_index
        if act_on in ['sample', 'sample per specie', 'feature']:
            self.act_on = act_on
        else:
            raise 'Wrong input: {}'.format(act_on)
    
        self.selected_ids = None
        self.fps_minmax_d2_by_sp = None
        self.fps_minmax_d2 = None
        
    def fit(self, managers):
        """Perform CUR selection of samples/features.
        Parameters
        ----------
        managers : AtomsList
            list of structures containing features computed with representation
        Returns
        -------
        PseudoPoints
            Selected samples
        Raises
        ------
        ValueError
            [description]
        NotImplementedError
            [description]
        """
        
        from rascal.utils import fps as do_fps
        # get the dense feature matrix
        X = managers.get_features(self._representation)
        
        if self.act_on in ['sample per specie']:
            sps = list(self.Nselect.keys())

            # get various info from the structures about the center atom species and indexing
            (strides_by_sp, global_counter, map_by_manager,
             indices_by_sp) = get_index_mappings_sample_per_species(managers, sps)

            print('The number of pseudo points selected by central atom species is: {}'.format(
                self.Nselect))

            # organize features w.r.t. central atom type
            X_by_sp = {}
            for sp in sps:
                X_by_sp[sp] = X[indices_by_sp[sp]]
            self._XX = X_by_sp

            # split the dense feature matrix by center species and apply CUR decomposition
            self.selected_ids_by_sp = {}
            self.fps_minmax_d2_by_sp = {}
            self.fps_hausforff_d2_by_sp = {}
            for sp in sps:
                print('Selecting species: {}'.format(sp))
                fps_out = do_fps(X_by_sp[sp], self.Nselect[sp], starting_index=self.starting_index)
                self.selected_ids_by_sp[sp] = fps_out['fps_indices']
                self.fps_minmax_d2_by_sp[sp] = fps_out['fps_minmax_d2']

            return self
        elif self.act_on in ['features']:
            fps_out = do_fps(X.T, self.Nselect, starting_index=self.starting_index)
            self.selected_ids = fps_out['fps_indices']
            self.fps_minmax_d2 = fps_out['fps_minmax_d2']
        else:
            raise NotImplementedError("method: {}".format(self.act_on))
            
    def transform(self, managers):
        if self.act_on in ['sample per specie']:
            sps = list(self.Nselect.keys())
            # get various info from the structures about the center atom species and indexing
            (strides_by_sp, global_counter, map_by_manager,
             indices_by_sp) = get_index_mappings_sample_per_species(managers, sps)
            self.selected_ids = convert_selected_global_index2rascal_sample_per_species(
                managers, self.selected_ids_by_sp, strides_by_sp, map_by_manager, sps)
            # build the pseudo points
            pseudo_points = PseudoPoints(self._representation)
            pseudo_points.extend(managers, self.selected_ids)
            return pseudo_points
        
        elif self.act_on in ['features']:
            feat_idx2coeff_idx = get_power_spectrum_index_mapping(self._representation, managers)
            selected_features = {key:[] for key in feat_idx2coeff_idx[0].keys()}
            for idx in self.selected_ids:
                coef_idx = feat_idx2coeff_idx[idx]
                for key in selected_features.keys():
                    selected_features[key].append(coef_idx[key])
            return dict(coefficient_subselection=selected_features)
            
    def plot(self):
        if self.fps_minmax_d2_by_sp is None:
            plt.semilogy(self.fps_minmax_d2,label=self.act_on)
            
        else:
            for sp in self.fps_minmax_d2_by_sp:
                plt.semilogy(self.fps_minmax_d2_by_sp[sp],
                            label='{} species {}'.format(self.act_on, sp))
            plt.legend()
        plt.title('FPSFilter')
        plt.ylabel('fps minmax d^2')
        
    def fit_transform(self, managers):
        return self.fit(managers).transform(managers)

Nselect = {1:300, 6:300, 7:300, 8:1}
fps_filter = FPSFilter(soap, Nselect, act_on='sample per specie')

In [25]:
managers = soap.transform(frames)

In [26]:
fps_filter.fit(managers)
fps_filter.plot()

The number of pseudo points selected by central atom species is: {1: 300, 6: 300, 7: 300, 8: 1}
Selecting species: 1
Selecting species: 6
Selecting species: 7


RuntimeError: Cannot FPS more inputs than those provided

In [8]:
aa = dict(a=1,b=3)
bb = {'c':5}
aa.update(bb)
aa

{'a': 1, 'b': 3, 'c': 5}

In [4]:
Xs = []
for frame in frames:
    representation = soap.transform([frame])
    X = representation.get_features(soap, species=[1, 6, 7, 8])
    Xs.append(X)

In [5]:
%%time
for ii,X in enumerate(Xs):
    for jj,Y in enumerate(Xs):
        # if jj < ii: continue
        aa = np.sum(np.power(np.dot(X, Y.T), zeta))

CPU times: user 1min 1s, sys: 0 ns, total: 1min 1s
Wall time: 1min 1s


In [6]:
representation = soap.transform(frames)
X = representation.get_features(soap)

In [7]:
%%time 
kk = np.power(np.dot(X, X.T), zeta)

CPU times: user 25.5 s, sys: 3.18 s, total: 28.7 s
Wall time: 8.2 s


In [21]:
%time kernel1(representation)

CPU times: user 4.19 s, sys: 7.93 ms, total: 4.2 s
Wall time: 4.2 s


array([[1.        , 0.00472567, 0.11009869, ..., 0.04595198, 0.04978086,
        0.03560331],
       [0.00472567, 1.        , 0.11667004, ..., 0.05571336, 0.06607319,
        0.04509975],
       [0.11009869, 0.11667004, 1.        , ..., 0.05723611, 0.06780385,
        0.06020636],
       ...,
       [0.04595198, 0.05571336, 0.05723611, ..., 1.        , 0.85119125,
        0.84384698],
       [0.04978086, 0.06607319, 0.06780385, ..., 0.85119125, 1.        ,
        0.84252262],
       [0.03560331, 0.04509975, 0.06020636, ..., 0.84384698, 0.84252262,
        1.        ]])

In [4]:
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=3.5, 
              max_radial=6, 
              max_angular=4, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              expansion_by_species_method='user defined',
              normalize=True,
              global_species=[1, 6, 7, 8],
              )
soap = SOAP(**hypers)
zeta=2
kernel1 = Kernel(soap, zeta=zeta, target_type='Atom')
representation = soap.transform(frames)

In [8]:
%time kernel1(representation)

CPU times: user 28.1 s, sys: 500 ms, total: 28.6 s
Wall time: 28.7 s


array([[1.        , 0.00472567, 0.11009869, ..., 0.04753737, 0.02975926,
        0.00890048],
       [0.00472567, 1.        , 0.11667004, ..., 0.06367934, 0.02375891,
        0.00288407],
       [0.11009869, 0.11667004, 1.        , ..., 0.0790658 , 0.03902808,
        0.00189056],
       ...,
       [0.04753737, 0.06367934, 0.0790658 , ..., 1.        , 0.72615537,
        0.33178025],
       [0.02975926, 0.02375891, 0.03902808, ..., 0.72615537, 1.        ,
        0.46487243],
       [0.00890048, 0.00288407, 0.00189056, ..., 0.33178025, 0.46487243,
        1.        ]])

In [5]:
aa = kernel1(representation)
bb = kernel1(representation, representation)
np.allclose(aa, bb)

True

In [13]:
bb[:10,:10]

array([[1.        , 0.00472567, 0.11009869, 0.35840026, 0.01889473,
        0.36544231, 0.08698729, 0.82470979, 0.06989846, 0.0558029 ],
       [0.00472567, 1.        , 0.11667004, 0.05361856, 0.04870066,
        0.05328729, 0.10730086, 0.01035052, 0.0314713 , 0.0140616 ],
       [0.11009869, 0.11667004, 1.        , 0.52827571, 0.03751435,
        0.51922305, 0.60658761, 0.13982755, 0.01909603, 0.00646523],
       [0.35840026, 0.05361856, 0.52827571, 1.        , 0.08242616,
        0.93518617, 0.48517071, 0.46188971, 0.04831532, 0.02203933],
       [0.01889473, 0.04870066, 0.03751435, 0.08242616, 1.        ,
        0.09741674, 0.06644332, 0.04759085, 0.03904336, 0.02468563],
       [0.36544231, 0.05328729, 0.51922305, 0.93518617, 0.09741674,
        1.        , 0.46840516, 0.48956985, 0.03968006, 0.01826398],
       [0.08698729, 0.10730086, 0.60658761, 0.48517071, 0.06644332,
        0.46840516, 1.        , 0.15431263, 0.03514725, 0.02418615],
       [0.82470979, 0.01035052, 0.1398275

In [12]:
aa[:20,:20]

array([[1.        , 0.79513757, 0.04532791, 0.47834536, 0.0012243 ,
        0.43778697, 0.19340956, 0.56439146, 0.1222456 , 0.44408756,
        0.07018829, 0.0709119 , 0.06876686, 0.05238681, 0.0513778 ,
        0.05238449, 0.05978511, 0.03541624, 0.04631459, 0.06043754],
       [0.79513757, 0.0150889 , 0.16008232, 0.45488497, 0.04276427,
        0.49568388, 0.13508786, 0.93383368, 0.07852325, 0.0518483 ,
        0.07852126, 0.05382414, 0.0248058 , 0.06384085, 0.0882993 ,
        0.08832277, 0.09397047, 0.06247693, 0.0368819 , 0.05036481],
       [0.04532791, 0.16008232, 0.46217492, 0.32384483, 0.06699144,
        0.30856518, 0.76049812, 0.07782628, 0.04077903, 0.01931466,
        0.04077414, 0.04820434, 0.01026047, 0.02640223, 0.02756485,
        0.02756594, 0.04872499, 0.08132929, 0.0394965 , 0.04611102],
       [0.47834536, 0.45488497, 0.32384483, 0.73711621, 0.09428872,
        0.7925445 , 0.36012606, 0.61686045, 0.06818785, 0.03549725,
        0.06818157, 0.08505378, 0.0485641 , 0

In [10]:
aa

array([[1.        , 0.79513757, 0.04532791, ..., 0.        , 0.        ,
        0.        ],
       [0.79513757, 0.0150889 , 0.16008232, ..., 0.        , 0.        ,
        0.        ],
       [0.04532791, 0.16008232, 0.46217492, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [63]:
140/43. # 900

3.255813953488372

In [70]:
1.8/0.513 #100

3.508771929824561

# Learning the formation energies of small molecules

In [None]:
# Load the small molecules 
frames = read('../reference_data/inputs/small_molecules-1000.xyz',':600')

## learning utilities

In [None]:
def compute_representation(representation,frames):
    expansions = soap.transform(frames)
    return expansions

def compute_kernel(zeta, rep1, rep2=None):
    if rep2 is None:
        kernel = rep1.cosine_kernel_global(zeta)
    else:
        kernel = rep1.cosine_kernel_global(rep2,zeta)
    return kernel

def extract_energy(frames):
    prop = [[]]*len(frames)
    for ii,cc in enumerate(frames):
        prop[ii] = cc.info['dft_formation_energy_per_atom_in_eV']
    y = np.array(prop)
    return y

def split_dataset(frames, test_fraction, seed=10):
    N = len(frames)
    ids = np.arange(N)
    np.random.seed(seed)
    np.random.shuffle(ids)
    Ntrain = int(N*test_fraction)
    train = ids[:Ntrain]
    test = ids[Ntrain:]
    targets = extract_energy(frames)
    return [frames[ii] for ii in train],targets[train],[frames[ii] for ii in test],targets[test]

def get_mae(ypred,y):
    return np.mean(np.abs(ypred-y))
def get_rmse(ypred,y):
    return np.sqrt(np.mean((ypred-y)**2))
def get_sup(ypred,y):
    return np.amax(np.abs((ypred-y)))
def get_r2(y_pred,y_true):
    weight = 1
    sample_weight = None
    numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,dtype=np.float64)
    denominator = (weight * (y_true - np.average(
        y_true, axis=0, weights=sample_weight)) ** 2).sum(axis=0,dtype=np.float64)
    output_scores = 1 - (numerator / denominator)
    return np.mean(output_scores)


score_func = dict(
    MAE=get_mae,
    RMSE=get_rmse,
    SUP=get_sup,
    R2=get_r2,
)

def get_score(ypred,y):
    scores = {}
    for k,func in score_func.items():
        scores[k] = func(ypred,y)
    return scores

class KRR(object):
    def __init__(self,zeta,weights,representation,X):
        self.weights = weights
        self.representation = representation
        self.zeta = zeta
        self.X = X
        
    def predict(self,frames):
        features = compute_representation(self.representation,frames)
        kernel = compute_kernel(self.zeta , self.X, features)
        return np.dot(self.weights, kernel)
    
def train_krr_model(zeta,Lambda,representation,frames,y,jitter=1e-8):
    features = compute_representation(representation,frames)
    kernel = compute_kernel(zeta,features)    
    # adjust the kernel so that it is properly scaled
    delta = np.std(y) / np.mean(kernel.diagonal())
    kernel[np.diag_indices_from(kernel)] += Lambda**2 / delta **2 + jitter
    # train the krr model
    weights = np.linalg.solve(kernel,y)
    model = KRR(zeta, weights,representation, features)
    return model,kernel



## With the full power spectrum

In [None]:
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=3.5, 
              max_radial=6, 
              max_angular=6, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              )
soap = SOAP(**hypers)

In [None]:
frames_train, y_train, frames_test, y_test = split_dataset(frames,0.8)

In [None]:
zeta = 2
Lambda = 5e-3
krr,k = train_krr_model(zeta, Lambda, soap, frames_train, y_train)

In [None]:
y_pred = krr.predict(frames_test)
get_score(y_pred, y_test)

In [None]:
plt.scatter(y_pred, y_test, s=3)
plt.axis('scaled')
plt.xlabel('DFT energy / (eV/atom)')
plt.ylabel('Predicted energy / (eV/atom)')

## With just the radial spectrum

In [None]:
hypers = dict(soap_type="RadialSpectrum",
              interaction_cutoff=3.5, 
              max_radial=6, 
              max_angular=0, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              )
soap = SOAP(**hypers)

In [None]:
frames_train, y_train, frames_test, y_test = split_dataset(frames,0.8)

In [None]:
zeta = 2
Lambda = 5e-4
krr,k = train_krr_model(zeta, Lambda, soap, frames_train, y_train)

In [None]:
y_pred = krr.predict(frames_test)
get_score(y_pred, y_test)

In [None]:
plt.scatter(y_pred, y_test, s=3)
plt.axis('scaled')
plt.xlabel('DFT energy / (eV/atom)')
plt.ylabel('Predicted energy / (eV/atom)')

# Make a map of the dataset

## utils

In [None]:
def compute_representation(representation,frames):
    expansions = soap.transform(frames)
    return expansions

def compute_kernel(zeta, rep1, rep2=None):
    if rep2 is None:
        kernel = rep1.cosine_kernel_global(zeta)
    else:
        kernel = rep1.cosine_kernel_global(rep2,zeta)
    return kernel

In [None]:
def link_ngl_wdgt_to_ax_pos(ax, pos, ngl_widget):
    from matplotlib.widgets import AxesWidget
    from scipy.spatial import cKDTree
    r"""
    Initial idea for this function comes from @arose, the rest is @gph82 and @clonker
    """
    
    kdtree = cKDTree(pos)        
    #assert ngl_widget.trajectory_0.n_frames == pos.shape[0]
    x, y = pos.T
    
    lineh = ax.axhline(ax.get_ybound()[0], c="black", ls='--')
    linev = ax.axvline(ax.get_xbound()[0], c="black", ls='--')
    dot, = ax.plot(pos[0,0],pos[0,1], 'o', c='red', ms=7)

    ngl_widget.isClick = False
    
    def onclick(event):
        linev.set_xdata((event.xdata, event.xdata))
        lineh.set_ydata((event.ydata, event.ydata))
        data = [event.xdata, event.ydata]
        _, index = kdtree.query(x=data, k=1)
        dot.set_xdata((x[index]))
        dot.set_ydata((y[index]))
        ngl_widget.isClick = True
        ngl_widget.frame = index
    
    def my_observer(change):
        r"""Here comes the code that you want to execute
        """
        ngl_widget.isClick = False
        _idx = change["new"]
        try:
            dot.set_xdata((x[_idx]))
            dot.set_ydata((y[_idx]))            
        except IndexError as e:
            dot.set_xdata((x[0]))
            dot.set_ydata((y[0]))
            print("caught index error with index %s (new=%s, old=%s)" % (_idx, change["new"], change["old"]))
    
    # Connect axes to widget
    axes_widget = AxesWidget(ax)
    axes_widget.connect_event('button_release_event', onclick)
    
    # Connect widget to axes
    ngl_widget.observe(my_observer, "frame", "change")

## make a map with kernel pca projection

In [None]:
# Load the small molecules 
frames = read('./reference_data/small_molecules-1000.xyz',':600')

In [None]:
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=3.5, 
              max_radial=6, 
              max_angular=6, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              )
soap = SOAP(**hypers)

In [None]:
zeta = 2

features = compute_representation(soap, frames)

kernel = compute_kernel(zeta,features)

In [None]:
from sklearn.decomposition import KernelPCA

In [None]:
kpca = KernelPCA(n_components=2,kernel='precomputed')
kpca.fit(kernel)

In [None]:
X = kpca.transform(kernel)

In [None]:
plt.scatter(X[:,0],X[:,1],s=3)

## make an interactive map

In [None]:
# package to visualize the structures in the notebook
# https://github.com/arose/nglview#released-version
import nglview

In [None]:
iwdg = nglview.show_asetraj(frames)
# set up the visualization
iwdg.add_unitcell()
iwdg.add_spacefill()
iwdg.remove_ball_and_stick()
iwdg.camera = 'orthographic'
iwdg.parameters = { "clipDist": 0 }
iwdg.center()
iwdg.update_spacefill(radiusType='covalent',
                                   scale=0.6,
                                   color_scheme='element')
iwdg._remote_call('setSize', target='Widget',
                               args=['%dpx' % (600,), '%dpx' % (400,)])
iwdg.player.delay = 200.0

In [None]:
link_ngl_wdgt_to_ax_pos(plt.gca(), X, iwdg)
plt.scatter(X[:,0],X[:,1],s=3)
iwdg