To install rascal:
(NOTE: See the top-level README for the most up-to-date installation instructions.)
+ mkdir ../build 
+ cd ../build
+ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON ..
+ make -j 4
+ make install

In [None]:
%matplotlib inline
from matplotlib import pylab as plt

import os, sys
from ase.io import read
sys.path.insert(0,"../build/")

import sys
import time
import rascal
import json

import ase
from ase.io import read, write
from ase.build import make_supercell
from ase.visualize import view
import numpy as np
import sys
import pandas as pd
from copy import deepcopy

from tqdm.notebook import tqdm

from rascal.representations import SphericalInvariants
from rascal.models import Kernel, KRR, train_gap_model, SparsePoints
from rascal.utils import from_dict, to_dict, CURFilter, FPSFilter

In [None]:
import urllib.request

# A collection of distorted ethanol molecules from the ANI-1 dataset
# (see https://github.com/isayev/ANI1_dataset) with energies and forces
# computed using DFTB+ (see https://www.dftbplus.org/)
url = 'https://raw.githubusercontent.com/cosmo-epfl/librascal-example-data/833b4336a7daf471e16993158322b3ea807b9d3f/inputs/molecule_conformers_dftb.xyz'
# Temporary file left behind by an earlier download; it is reused when it is
# still there so that the data is not fetched again on every run.
structures_fn = '/tmp/tmpiilc84vk'
if not os.path.isfile(structures_fn):
    # Download the file from `url`, save it in a temporary directory and get
    # the path to it (e.g. '/tmp/tmpb48zma.txt') in the `structures_fn` variable
    structures_fn, headers = urllib.request.urlretrieve(url)

# Spherical Invariants: body order = 3

## utils

In [None]:
def extract_ref(frames,info_key='dft_formation_energy_per_atom_in_eV',array_key='zeros'):
    """Collect one scalar per frame and, optionally, one array per frame.

    Parameters
    ----------
    frames : iterable
        Objects exposing an ``info`` mapping, ``get_positions()`` and
        ``get_array(name)`` (the structures read with ``ase.io.read`` in
        this notebook).
    info_key : str
        Key looked up in ``frame.info`` for every frame.
    array_key : str or None
        ``None`` collects no per-frame array, ``'zeros'`` collects an array of
        zeros shaped like the positions of the frame, any other value is
        forwarded to ``frame.get_array``.

    Returns
    -------
    y : numpy.ndarray
        The ``info_key`` entries, in the order of ``frames``.
    f : numpy.ndarray or list
        The per-frame arrays concatenated along their first axis, or an
        empty list when nothing was collected.
    """
    y, f = [], []
    for frame in frames:
        y.append(frame.info[info_key])
        if array_key is None:
            continue
        if array_key == 'zeros':
            f.append(np.zeros(frame.get_positions().shape))
        else:
            f.append(frame.get_array(array_key))
    y = np.array(y)
    # np.concatenate rejects an empty sequence; that is the only case in which
    # the plain list is handed back. Any other failure is a real error and is
    # no longer silenced.
    if f:
        f = np.concatenate(f)
    return y, f

from scipy.stats import spearmanr

def get_r2(y_pred,y_true):
    """Return the coefficient of determination of `y_pred` against `y_true`.

    The score is computed along the first axis as one minus the ratio of the
    residual sum of squares to the sum of squares around the mean of
    `y_true`; when several columns are given their scores are averaged.
    Note the argument order: prediction first, reference second.
    """
    residual = y_true - y_pred
    centered = y_true - np.average(y_true, axis=0)
    ss_res = (residual ** 2).sum(axis=0, dtype=np.float64)
    ss_tot = (centered ** 2).sum(axis=0, dtype=np.float64)
    return np.mean(1 - (ss_res / ss_tot))

def get_mae(ypred,y):
    """Return the mean absolute error between `ypred` and `y`."""
    abs_err = np.abs(ypred - y)
    return np.mean(abs_err)
def get_rmse(ypred,y):
    """Return the root mean squared error between `ypred` and `y`."""
    sq_err = (ypred - y) ** 2
    return np.sqrt(np.mean(sq_err))
def get_sup(ypred,y):
    """Return the largest absolute deviation between `ypred` and `y`."""
    abs_err = np.abs(ypred - y)
    return np.max(abs_err)
def get_spearman(ypred,y):
    """Return the Spearman rank correlation between `ypred` and `y`.

    Only the correlation coefficient returned by `scipy.stats.spearmanr` is
    kept; the accompanying p-value is dropped.
    """
    return spearmanr(ypred, y)[0]

# Registry of the error metrics reported by get_score, keyed by the label
# used in the output; every entry is called as func(ypred, y).
score_func = {
    'MAE': get_mae,
    'RMSE': get_rmse,
    'SUP': get_sup,
    'R2': get_r2,
    'CORR': get_spearman,
}

def get_score(ypred,y):
    """Evaluate every metric of `score_func` on (`ypred`, `y`).

    Returns a dict mapping each metric label to its value, in the order of
    `score_func`.
    """
    return {label: metric(ypred, y) for label, metric in score_func.items()}
def print_score(ypred,y):
    """Print all the metrics of `get_score` on one line as `LABEL=value`."""
    formatted = [
        '{}={:.2e}'.format(label, value)
        for label, value in get_score(ypred, y).items()
    ]
    print(' '.join(formatted))


## compute the representation of some atomic structures and their similarity

In [None]:
# Load the first 100 structures of the small molecules dataset
frames = read('../reference_data/inputs/small_molecules-1000.xyz',':100')

In [None]:
# Small SOAP power spectrum settings (max_radial=1, max_angular=1) used for a
# first look at the features.
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=3.5, 
              max_radial=1, 
              max_angular=1, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              normalize=True,
              )
soap = SphericalInvariants(**hypers)
# exponent shared by the kernel object and the manual kernel computed later
zeta=1
kernel1 = Kernel(soap,name='Cosine', zeta=zeta, target_type='Structure', kernel_type='Full')

In [None]:
# Show the `info` dictionary stored with the first structure
frames[0].info

In [None]:
# Compute the representation of every structure, then extract its features
# as a single array X
representation = soap.transform(frames)
X = representation.get_features(soap)

In [None]:
%%time 
# Dot products between all pairs of rows of X, raised element-wise to the
# power zeta
kk = np.power(np.dot(X, X.T), zeta)

## identify the most important features for regression with standard KRR

In [None]:
def train_krr_model(kernel, managers, K_, y_train, sigma=1e-3, jitter=1e-8):
    """Fit a kernel ridge regression model from a precomputed kernel matrix.

    Parameters
    ----------
    kernel : Kernel
        Kernel object handed to the returned model.
    managers
        Training structures the kernel matrix was computed from; handed to
        the returned model.
    K_ : numpy.ndarray
        Square kernel matrix between the training structures. It is copied,
        the caller's array is left untouched.
    y_train : numpy.ndarray
        Training targets, one per row of `K_`.
    sigma : float
        Regularization strength; it is divided by the standard deviation of
        the targets before being squared.
    jitter : float
        Extra constant added to the diagonal together with the regularizer.

    Returns
    -------
    KRR
        Model built from the fitted weights.
    """
    Y = y_train.reshape((-1, 1)).copy()
    K = K_.copy()
    delta = np.std(Y)
    # Ridge regularization: the noise term has to be ADDED to the diagonal of
    # the kernel matrix. Scaling the diagonal by it instead (as `*=` did)
    # shrinks the self-similarities to almost zero and distorts the fit.
    K[np.diag_indices_from(K)] += (sigma / delta)**2 + jitter
    weights = np.linalg.lstsq(K, Y, rcond=None)[0]
    # Zero-valued entry for every integer in range(120) -- presumably the
    # per-species baseline (self contribution) of the model; TODO confirm
    # against the KRR signature.
    return KRR(weights, kernel, managers, {sp:0. for sp in range(120)})

In [None]:
# Total number of structures to load
N = 985
# Number of structures to train the model with (60% of N)
f = int(0.6*N)

# load the structures
frames = read('../reference_data/inputs/small_molecules-1000.xyz',':{}'.format(N))


# sorted, distinct atomic numbers found in the loaded structures
global_species = []
for frame in frames:
    global_species.extend(frame.get_atomic_numbers())
global_species = np.unique(global_species)

# split the structures in 2 sets: shuffle the indices with a fixed seed so
# that the split is reproducible, then cut the shuffled list at f
ids = list(range(N))
np.random.seed(10)
np.random.shuffle(ids)

frames_train = [frames[ii] for ii in ids[:f]]
frames_test = [frames[ii] for ii in ids[f:]]


In [None]:
# Reference energies of the train and the test set, each divided in place by
# the number of atoms of its structure.
# NOTE(review): the info key already reads "per_atom" -- confirm that dividing
# by len(frame) once more is intended.
y_train, _ = extract_ref(frames_train,'dft_formation_energy_per_atom_in_eV')
for ii,ft in enumerate(frames_train):
    y_train[ii] /= len(ft)
y_test, _ = extract_ref(frames_test,'dft_formation_energy_per_atom_in_eV')
for ii,ft in enumerate(frames_test):
    y_test[ii] /= len(ft)

In [None]:
# SOAP power spectrum with a larger expansion (max_radial=8, max_angular=8)
# than in the first example, used for the regression
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=3.5, 
              max_radial=8, 
              max_angular=8, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              normalize=True,
              expansion_by_species_method='structure wise',
              )
soap = SphericalInvariants(**hypers)
kernel = Kernel(soap,name='Cosine', zeta=2, target_type='Structure', kernel_type='Full')

In [None]:
# Representations of the train and the test structures
managers_train = soap.transform(frames_train)
managers_test = soap.transform(frames_test)
# Kernel matrix between the training structures (timed by the %time magic)
%time K = kernel(managers_train, managers_train)

In [None]:
# train a KRR model
model = train_krr_model(kernel, managers_train, K, y_train, sigma=0.5e-1, jitter=1e-8)

# make predictions on the test set
y_pred = model.predict(managers_test)

# basic assessment of the quality of the trained model: error metrics plus a
# predicted-vs-reference scatter plot
print_score(y_pred, y_test)
plt.plot(y_test, y_pred, 'o')
plt.title("correlation plot")
plt.ylabel("predicted energies [eV]")
plt.xlabel("reference energies [eV]")

## Sparsification with the gap model

In [None]:
# Total number of structures to load
N = 1000
# Number of structures to train the model with (60% of N)
f = int(0.6*N)

# load the structures
frames = read(structures_fn,':{}'.format(N))


# sorted, distinct atomic numbers found in the loaded structures
global_species = []
for frame in frames:
    global_species.extend(frame.get_atomic_numbers())
global_species = np.unique(global_species)

# split the structures in 2 sets: shuffle the indices with a fixed seed so
# that the split is reproducible, then cut the shuffled list at f
ids = list(range(N))
np.random.seed(10)
np.random.shuffle(ids)

frames_train = [frames[ii] for ii in ids[:f]]
frames_test = [frames[ii] for ii in ids[f:]]
# Isolated atom contributions, presumably keyed by atomic number
# (1, 6 and 8, i.e. H, C and O) -- verify against train_gap_model
self_contributions = {
    1: -6.492647589968434,
    6: -38.054950840332474,
    8: -83.97955098636527,
}

In [None]:
# Reference energies (`dftb_energy_eV` entry of each structure) of the train
# and the test set; the second return value of extract_ref is not needed here
y_train, _ = extract_ref(frames_train,'dftb_energy_eV')
y_test, _ = extract_ref(frames_test,'dftb_energy_eV')

In [None]:
# Same SOAP settings as for the KRR model, this time paired with a sparse
# 'GAP' kernel
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=3.5, 
              max_radial=8, 
              max_angular=8, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              normalize=True,
              expansion_by_species_method='structure wise',
              )
soap_calculator = SphericalInvariants(**hypers)
kernel = Kernel(soap_calculator,name='GAP', zeta=2, target_type='Structure', kernel_type='Sparse')

### pick sparse points randomly

In [None]:
# Map every atomic environment to its (structure index, atom index) pair,
# following the order of the structures in `frames`
env_map = [
    (i_frame, i_env)
    for i_frame, frame in enumerate(frames)
    for i_env in range(len(frame))
]
# Total number of atomic environments in the dataset
Nenv = len(env_map)
Nenv

In [None]:
# Representations of the whole dataset, of the train set and of the test set
managers = soap_calculator.transform(frames)
managers_train = soap_calculator.transform(frames_train)
managers_test = soap_calculator.transform(frames_test)

In [None]:
# compute the representation for all atomic structures
managers = soap_calculator.transform(frames)

# NOTE: despite its name, `fractions` holds absolute numbers of sparse points
# (40%, 30%, ... of the Nenv atomic environments), not ratios
fractions = (Nenv*np.array([0.4,0.3,0.2,0.1,0.05])).astype(int)
scores = []
for fraction in tqdm(fractions):
    ids = np.array(range(Nenv))
    # re-seeding on every pass yields the same permutation each time, so a
    # smaller selection is always a prefix of the larger ones
    np.random.seed(100)
    np.random.shuffle(ids)
    # randomly select atomic centers, grouped by the structure they belong to
    selected_ids = [[] for _ in range(len(frames))]
    for idx in ids[:fraction]:
        i_frame, i_env = env_map[idx]
        selected_ids[i_frame].append(i_env)
    # initialize the sparse points with the randomly selected environments
    sparse_points = SparsePoints(soap_calculator)
    sparse_points.extend(managers, selected_ids)
    
    # kernel between the training structures and the sparse points
    KNM = kernel(managers_train, sparse_points)
    
    model = train_gap_model(kernel, managers_train, KNM, sparse_points, y_train, self_contributions, 
                        grad_train=None, lambdas=[7e-3, None], jitter=1e-8)

    y_pred = model.predict(managers_test)
    
    # keep the test metrics together with the number of sparse points used
    score = get_score(y_pred, y_test)
    score.update(n_sparse_point=fraction)
    scores.append(score)
scores = pd.DataFrame(scores)
scores

### pick sparse points with fps

In [None]:
# Representations of the whole dataset, of the train set and of the test set
managers = soap_calculator.transform(frames)
managers_train = soap_calculator.transform(frames_train)
managers_test = soap_calculator.transform(frames_test)

In [None]:
# Atomic number of every atom of the dataset
sps = [number for frame in frames for number in frame.get_atomic_numbers()]
# distinct species, and occurrence counts indexed by atomic number
sps_u = np.unique(sps)
sps_n = np.bincount(sps)
# Number of atomic environments available for each species
Nenv = dict((sp, sps_n[sp]) for sp in sps_u)
Nenv

In [None]:
# Fit an FPS filter in 'sample per species' mode on the whole dataset, with
# the per-species counts of Nenv, then plot it
fps_filter = FPSFilter(soap_calculator, Nenv, 'sample per species')
fps_filter.fit(managers)
fps_filter.plot()

In [None]:
managers = soap_calculator.transform(frames)

# Train one sparse model per fraction of the environments of each species
fractions = [0.4,0.3,0.2,0.1,0.05, 0.01, 0.005]
scores = []
for fraction in tqdm(fractions):
    # ask the fitted filter for int(n*fraction) sparse points per species
    sparse_points = fps_filter.transform(managers, {sp:int(n*fraction) for sp,n in Nenv.items()})
    KNM = kernel(managers_train, sparse_points)
    model = train_gap_model(kernel, managers_train, KNM, sparse_points, y_train, self_contributions, 
                        grad_train=None, lambdas=[7e-3, None], jitter=1e-8)

    y_pred = model.predict(managers_test)
    # keep the test metrics together with the fraction used
    score = get_score(y_pred, y_test)
    score.update(fraction=fraction)
    scores.append(score)
scores = pd.DataFrame(scores)
scores

### try feature sparsification with FPS

In [None]:
managers = soap_calculator.transform(frames)

# Upper bound on the number of features to select: 30% of the feature
# dimension (second axis of X)
X = managers.get_features(soap_calculator)
n_features = int(X.shape[1]*0.3)
n_features

In [None]:
# Count the atoms of each species over the whole dataset
sps = []
for frame in frames:
    sps.extend(frame.get_atomic_numbers())
sps_u = np.unique(sps)
sps_n = np.bincount(sps)
# Number of sparse points per species: 5% of the environments of that species
Nenv = {sp:int(sps_n[sp]*0.05) for sp in sps_u}
print(Nenv)
# FPS filter selecting the sparse points, fitted on the whole dataset
fps_filter = FPSFilter(soap_calculator, Nenv, 'sample per species')
fps_filter.fit(managers)
fps_filter.plot()

In [None]:
# Second FPS filter, in 'feature' mode, with n_features as its count
feature_filter = FPSFilter(soap_calculator, n_features, 'feature')
feature_filter.fit(managers);
feature_filter.plot()

In [None]:
# Fractions of the n_features selected features to keep, smallest first
fractions = list(reversed([1, 0.75, 0.5, 0.25, 0.125, 0.05, 0.01]))
scores = []
for fraction in tqdm(fractions):
    selected_feature = feature_filter.transform(managers, int(n_features*fraction))
    
    # build a calculator from the original hyperparameters updated with the
    # entries returned by the feature filter
    hypers_sparse = deepcopy(hypers)
    hypers_sparse.update(**selected_feature)
    soap_calculator_sparse = SphericalInvariants(**hypers_sparse)
    
    kernel_sparse = Kernel(soap_calculator_sparse,name='GAP', zeta=2, target_type='Structure', kernel_type='Sparse')
    
    # recompute every representation with the sparsified calculator
    managers_sp = soap_calculator_sparse.transform(frames)
    managers_train_sp = soap_calculator_sparse.transform(frames_train)
    managers_test_sp = soap_calculator_sparse.transform(frames_test)
    
    # work on a copy of the fitted sample filter and point it at the
    # sparsified calculator (this sets a private attribute of the filter)
    fps_filter_sp = deepcopy(fps_filter)
    fps_filter_sp._representation = soap_calculator_sparse
    sparse_points = fps_filter_sp.transform(managers_sp)
    
    KNM = kernel_sparse(managers_train_sp, sparse_points)
    model = train_gap_model(kernel_sparse, managers_train_sp, KNM, sparse_points, y_train, self_contributions, 
                        grad_train=None, lambdas=[7e-3, None], jitter=1e-8)

    y_pred = model.predict(managers_test_sp)
    # keep the test metrics together with the number of features used
    score = get_score(y_pred, y_test)
    score.update(n_features=int(n_features*fraction))
    scores.append(score)
scores = pd.DataFrame(scores)
scores

### pick sparse points with CUR decomposition

In [None]:
# Representations of the whole dataset, of the train set and of the test set
managers = soap_calculator.transform(frames)
managers_train = soap_calculator.transform(frames_train)
managers_test = soap_calculator.transform(frames_test)

In [None]:
# Atomic number of every atom of the dataset
sps = [number for frame in frames for number in frame.get_atomic_numbers()]
# distinct species, and occurrence counts indexed by atomic number
sps_u = np.unique(sps)
sps_n = np.bincount(sps)
# Number of environments to select per species: 20% of those available
Nenv = dict((sp, int(sps_n[sp]*0.2)) for sp in sps_u)
Nenv

In [None]:
# Fit a CUR filter in 'sample per species' mode on the whole dataset, with the
# per-species counts of Nenv
cur_filter = CURFilter(soap_calculator, Nenv, 'sample per species')
cur_filter.fit(managers)

In [None]:
managers = soap_calculator.transform(frames)

# Train one sparse model per fraction of the per-species counts of Nenv
fractions = [1, 0.75, 0.5, 0.25, 0.125]
scores = []
for fraction in tqdm(fractions):
    # ask the fitted filter for int(n*fraction) sparse points per species
    sparse_points = cur_filter.transform(managers, {sp:int(n*fraction) for sp,n in Nenv.items()})
    KNM = kernel(managers_train, sparse_points)
    model = train_gap_model(kernel, managers_train, KNM, sparse_points, y_train, self_contributions, 
                        grad_train=None, lambdas=[7e-3, None], jitter=1e-8)

    y_pred = model.predict(managers_test)
    # keep the test metrics together with the fraction used
    score = get_score(y_pred, y_test)
    score.update(fraction=fraction)
    scores.append(score)
scores = pd.DataFrame(scores)
scores

### try feature sparsification with CUR

In [None]:
managers = soap_calculator.transform(frames)

# Upper bound on the number of features to select: 30% of the feature
# dimension (second axis of X)
X = managers.get_features(soap_calculator)
n_features = int(X.shape[1]*0.3)
n_features

In [None]:
# Count the atoms of each species over the whole dataset
sps = []
for frame in frames:
    sps.extend(frame.get_atomic_numbers())
sps_u = np.unique(sps)
sps_n = np.bincount(sps)
# Number of sparse points per species: 5% of the environments of that species
Nenv = {sp:int(sps_n[sp]*0.05) for sp in sps_u}
print(Nenv)
# CUR filter selecting the sparse points, fitted on the whole dataset
cur_filter = CURFilter(soap_calculator, Nenv, 'sample per species')
cur_filter.fit(managers);

In [None]:
# Second CUR filter, in 'feature' mode, with n_features as its count
feature_filter = CURFilter(soap_calculator, n_features, 'feature')
feature_filter.fit(managers);

In [None]:
# Fractions of the n_features selected features to keep, smallest first
fractions = list(reversed([1, 0.75, 0.5, 0.25, 0.125, 0.05, 0.01]))
scores = []
for fraction in tqdm(fractions):
    selected_feature = feature_filter.transform(managers, int(n_features*fraction))
    
    # build a calculator from the original hyperparameters updated with the
    # entries returned by the feature filter
    hypers_sparse = deepcopy(hypers)
    hypers_sparse.update(**selected_feature)
    soap_calculator_sparse = SphericalInvariants(**hypers_sparse)
    
    kernel_sparse = Kernel(soap_calculator_sparse,name='GAP', zeta=2, target_type='Structure', kernel_type='Sparse')
    
    # recompute every representation with the sparsified calculator
    managers_sp = soap_calculator_sparse.transform(frames)
    managers_train_sp = soap_calculator_sparse.transform(frames_train)
    managers_test_sp = soap_calculator_sparse.transform(frames_test)
    
    # work on a copy of the fitted sample filter and point it at the
    # sparsified calculator (this sets a private attribute of the filter)
    cur_filter_sp = deepcopy(cur_filter)
    cur_filter_sp._representation = soap_calculator_sparse
    sparse_points = cur_filter_sp.transform(managers_sp)
    
    KNM = kernel_sparse(managers_train_sp, sparse_points)
    model = train_gap_model(kernel_sparse, managers_train_sp, KNM, sparse_points, y_train, self_contributions, 
                        grad_train=None, lambdas=[7e-3, None], jitter=1e-8)

    y_pred = model.predict(managers_test_sp)
    # keep the test metrics together with the number of features used
    score = get_score(y_pred, y_test)
    score.update(n_features=int(n_features*fraction))
    scores.append(score)
scores = pd.DataFrame(scores)
scores

# Make a map of the dataset

## utils

In [None]:
def link_ngl_wdgt_to_ax_pos(ax, pos, ngl_widget):
    r"""Link a 2D plot and an nglview widget so that each one drives the other.

    A click in the plot moves a dashed cross-hair to the click position,
    highlights the point of `pos` closest to it with a red dot and sets
    `ngl_widget.frame` to the index of that point. Conversely, whenever
    `ngl_widget.frame` changes, the red dot is moved to the point with that
    index.

    Initial idea for this function comes from @arose, the rest is @gph82 and @clonker

    Parameters
    ----------
    ax : matplotlib axes
        Axes the cross-hair and the highlighted dot are drawn in.
    pos : numpy.ndarray
        2D coordinates of the points, one row per point.
    ngl_widget
        Widget exposing a `frame` attribute and an `observe` method; the
        index of a row of `pos` is used as its frame number.
    """
    from matplotlib.widgets import AxesWidget
    from scipy.spatial import cKDTree

    # nearest-neighbour lookup from a click position to a row of `pos`
    kdtree = cKDTree(pos)
    #assert ngl_widget.trajectory_0.n_frames == pos.shape[0]
    x, y = pos.T

    lineh = ax.axhline(ax.get_ybound()[0], c="black", ls='--')
    linev = ax.axvline(ax.get_xbound()[0], c="black", ls='--')
    dot, = ax.plot(pos[0,0],pos[0,1], 'o', c='red', ms=7)

    ngl_widget.isClick = False

    def move_dot(index):
        # set_xdata/set_ydata expect sequences; recent Matplotlib versions
        # reject a bare scalar, hence the one-element lists
        dot.set_xdata([x[index]])
        dot.set_ydata([y[index]])

    def onclick(event):
        # a click outside of the data area carries no data coordinates
        if event.xdata is None or event.ydata is None:
            return
        linev.set_xdata((event.xdata, event.xdata))
        lineh.set_ydata((event.ydata, event.ydata))
        _, index = kdtree.query(x=[event.xdata, event.ydata], k=1)
        move_dot(index)
        ngl_widget.isClick = True
        # hand over a plain int rather than a NumPy integer so that the value
        # does not depend on how the widget validates the attribute
        ngl_widget.frame = int(index)

    def my_observer(change):
        r"""Move the highlighted dot to the frame the widget switched to."""
        ngl_widget.isClick = False
        _idx = change["new"]
        try:
            move_dot(_idx)
        except IndexError:
            # frame without a matching point: fall back to the first one
            move_dot(0)
            print("caught index error with index %s (new=%s, old=%s)" % (_idx, change["new"], change["old"]))

    # Connect axes to widget
    axes_widget = AxesWidget(ax)
    axes_widget.connect_event('button_release_event', onclick)

    # Connect widget to axes
    ngl_widget.observe(my_observer, "frame", "change")

## make a map with kernel pca projection

In [None]:
# Load the small molecules 
# first 600 structures of the dataset
frames = read('../reference_data/inputs/small_molecules-1000.xyz',':600')

In [None]:
# SOAP power spectrum settings for the map; unlike the earlier settings no
# `normalize` entry is passed here
hypers = dict(soap_type="PowerSpectrum",
              interaction_cutoff=3.5, 
              max_radial=6, 
              max_angular=6, 
              gaussian_sigma_constant=0.4,
              gaussian_sigma_type="Constant",
              cutoff_smooth_width=0.5,
              )
soap = SphericalInvariants(**hypers)
kernel = Kernel(soap,name='Cosine', zeta=2, target_type='Structure', kernel_type='Full')

In [None]:
# Representation of every structure, then the kernel computed from that
# single set of structures
managers = soap.transform(frames)

Kmat = kernel(managers)

In [None]:
from sklearn.decomposition import KernelPCA

In [None]:
# Two-component kernel PCA fitted directly on the precomputed kernel matrix
kpca = KernelPCA(n_components=2,kernel='precomputed')
kpca.fit(Kmat)

In [None]:
# 2D coordinates of every structure in the kernel PCA projection
X = kpca.transform(Kmat)

In [None]:
# Static map: one point per structure
plt.scatter(X[:,0],X[:,1],s=3)

## make an interactive map

In [None]:
# package to visualize the structures in the notebook
# https://github.com/arose/nglview#released-version
import nglview

In [None]:
# Viewer showing the structures as the frames of a trajectory
iwdg = nglview.show_asetraj(frames)
# set up the visualization
iwdg.add_unitcell()
iwdg.add_spacefill()
iwdg.remove_ball_and_stick()
iwdg.camera = 'orthographic'
iwdg.parameters = { "clipDist": 0 }
iwdg.center()
iwdg.update_spacefill(radiusType='covalent',
                                   scale=0.6,
                                   color_scheme='element')
# widget size: 600px by 400px
iwdg._remote_call('setSize', target='Widget',
                               args=['%dpx' % (600,), '%dpx' % (400,)])
iwdg.player.delay = 200.0

In [None]:
# Link the current axes to the viewer, draw the map in those axes and
# display the widget
link_ngl_wdgt_to_ax_pos(plt.gca(), X, iwdg)
plt.scatter(X[:,0],X[:,1],s=3)
iwdg