In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import sklearn.preprocessing

import torch

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
from sklearn.preprocessing import StandardScaler

This notebook demonstrates the virtual adversarial attack analysis used to obtain feature importances for the DF-BP models. The structures highlighted in the main-text figure are also shown.

In [2]:
import sys
sys.path.append('dfa_recommender')
from dfa_recommender.net import GatedNetwork, MySoftplus, TiledMultiLayerNN, MLP, finalMLP, ElementalGate
from torch.utils.data import DataLoader
from dfa_recommender.dataset import SubsetDataset
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ExponentialLR
from dfa_recommender.sampler import InfiniteSampler
from dfa_recommender.vat import regVAT
from dfa_recommender.ml_utils import numpy_to_dataset
import copy

# Runtime configuration shared by every cell below: run on the CPU, load data
# in the main process (num_workers = 0), cap torch at four threads, and seed
# both numpy and torch so reruns are repeatable.
num_workers = 0
device = torch.device('cpu')
torch.set_num_threads(4)
np.random.seed(0)
torch.manual_seed(0)

In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file handle is closed as soon as the object has been read.
    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    with open(path, 'rb') as fin:
        return pickle.load(fin)


def _feature_frame(structures_df, features):
    """Build a one-column ('features') frame indexed by structure name.

    Row ``i`` of ``structures_df`` is paired with ``features[i]``; the index
    is taken from the 'Unnamed: 0' column of ``structures_df``.

    The cells are filled through a 1-D object array and the frame is built in
    one step. The previous ``df['features'][key] = value`` form is chained
    assignment, which writes to a temporary (and leaves the frame empty) when
    pandas Copy-on-Write is active.
    """
    cells = np.empty(len(structures_df), dtype=object)
    for pos in range(len(structures_df)):
        cells[pos] = features[pos]
    return pd.DataFrame({'features': cells}, index=structures_df['Unnamed: 0'])


# Per-structure feature arrays for the two datasets (PBE0 feature files).
vss_features = _load_pickle('../ml_features/BP_features/pbe0-vss452_X.pkl')
csd_features = _load_pickle('../ml_features/BP_features/pbe0-csd76_X.pkl')

vss_df = pd.read_csv('../ml_features/BP_features/pbe0-vss452_structures.csv')
vss_feature_df = _feature_frame(vss_df, vss_features)

csd_df = pd.read_csv('../ml_features/BP_features/pbe0-csd76_structures.csv')
csd_feature_df = _feature_frame(csd_df, csd_features)

# Structures that appear in the structure lists of all three feature sets
# (pbe0, scan0, b3lyp), computed separately for VSS-452 and CSD-76.
vss_df = pd.read_csv('../ml_features/BP_features/pbe0-vss452_structures.csv')
vss_df2 = pd.read_csv('../ml_features/BP_features/scan0-vss452_structures.csv')
vss_df3 = pd.read_csv('../ml_features/BP_features/b3lyp-vss452_structures.csv')

set1, set2, set3 = (
    set(frame['Unnamed: 0'].to_list()) for frame in (vss_df, vss_df2, vss_df3)
)
vss_common_structs = set1.intersection(set2, set3)

csd_df = pd.read_csv('../ml_features/BP_features/pbe0-csd76_structures.csv')
csd_df2 = pd.read_csv('../ml_features/BP_features/scan0-csd76_structures.csv')
csd_df3 = pd.read_csv('../ml_features/BP_features/b3lyp-csd76_structures.csv')

set1, set2, set3 = (
    set(frame['Unnamed: 0'].to_list()) for frame in (csd_df, csd_df2, csd_df3)
)
csd_common_structs = set1.intersection(set2, set3)

def _align_targets(targets, feature_index):
    """Return the rows of ``targets`` that belong to ``feature_index``, sorted.

    Parameters
    ----------
    targets : DataFrame indexed by structure name.
    feature_index : Index of the (already sorted) feature frame.

    Returns
    -------
    DataFrame whose rows follow ``feature_index`` one-to-one.

    Raises
    ------
    ValueError
        From ``reindex`` if the targets are misaligned and contain duplicate
        structure names, since a positional pairing is then ambiguous.

    The code that follows pairs targets and features by position. If some
    featurised structure has no target row, the targets are reindexed onto the
    feature index so that structure gets NaN targets (and is filtered out
    later) instead of silently shifting every subsequent pairing.
    """
    # Single vectorised filter instead of dropping rows one at a time.
    kept = targets.loc[targets.index.isin(feature_index)].sort_index()
    if not kept.index.equals(feature_index):
        kept = kept.reindex(feature_index)
    return kept


#load features and targets
csd_features = csd_feature_df.sort_index()
csd_X = csd_features['features'].to_numpy()
csd_targets = pd.read_csv('../data/CSD76targets.csv')
csd_targets = csd_targets.set_index(csd_targets['Unnamed: 0'])
csd_targets = _align_targets(csd_targets, csd_features.index)
csd_structs = csd_features.index.to_list()

vss_features = vss_feature_df.sort_index()
vss_X = vss_features['features'].to_numpy()
vss_targets = pd.read_csv('../data/VSS452targets.csv')
# The VSS target file stores path-like names; keep only the last component so
# they can be matched against the feature index.
vss_targets['Unnamed: 0'] = vss_targets['Unnamed: 0'].apply(lambda x: x.split('/')[-1])
vss_targets = vss_targets.set_index(vss_targets['Unnamed: 0'])
vss_targets = _align_targets(vss_targets, vss_features.index)
vss_structs = vss_features.index.to_list()

def _drop_missing_and_clip(X, y, structs, lower=0, upper=100):
    """Keep samples with a usable target and NaN-free features; clip targets.

    Parameters
    ----------
    X : sequence of per-structure feature arrays.
    y : 1-D array of raw targets, paired with ``X`` and ``structs`` by position.
    structs : sequence of structure names.
    lower, upper : bounds that the kept targets are clipped to.

    Returns
    -------
    (X_kept, y_kept, structs_kept) : three lists of equal length. ``y_kept``
    holds one-element lists so it can be passed directly to a scikit-learn
    scaler as an (n_samples, 1) input.
    """
    # Evaluate the keep-mask once and reuse it for X, y and the names.
    keep = [
        i for i in range(len(y))
        if not np.isnan(y[i]) and not np.isnan(X[i]).any()
    ]
    X_kept = [X[i] for i in keep]
    structs_kept = [structs[i] for i in keep]
    y_kept = [[min(max(y[i], lower), upper)] for i in keep]
    return X_kept, y_kept, structs_kept


#remove the structures that do not have targets
csd_pbe_X, csd_pbe_y, csd_pbe_structs = _drop_missing_and_clip(
    csd_X, csd_targets['hfx_pbe'].to_numpy(), csd_structs)

csd_scan_X, csd_scan_y, csd_scan_structs = _drop_missing_and_clip(
    csd_X, csd_targets['hfx_scan'].to_numpy(), csd_structs)

vss_pbe_X, vss_pbe_y, vss_pbe_structs = _drop_missing_and_clip(
    vss_X, vss_targets['hfx_pbe'].to_numpy(), vss_structs)

vss_scan_X, vss_scan_y, vss_scan_structs = _drop_missing_and_clip(
    vss_X, vss_targets['hfx_scan'].to_numpy(), vss_structs)

#find structures that are in common between two
# in_common1: usable for both PBE and SCAN and present in all three feature
# sets; in_common2: usable for both but missing from at least one feature set.
_pbe_and_scan = set(vss_pbe_structs) & set(vss_scan_structs)
in_common1 = _pbe_and_scan & vss_common_structs
in_common2 = _pbe_and_scan - vss_common_structs

#get training, testing indices based on the structures in common
# The seed and the order of the two np.random.choice calls define the split.
np.random.seed(5)
vss_pbe_common_idxs1 = [pos for pos, name in enumerate(vss_pbe_structs) if name in in_common1]
vss_scan_common_idxs1 = [pos for pos, name in enumerate(vss_scan_structs) if name in in_common1]
vss_pbe_common_idxs2 = [pos for pos, name in enumerate(vss_pbe_structs) if name in in_common2]
vss_scan_common_idxs2 = [pos for pos, name in enumerate(vss_scan_structs) if name in in_common2]

_n_group1 = len(vss_pbe_common_idxs1)
_n_group2 = len(vss_pbe_common_idxs2)
train_idxs1 = np.random.choice(_n_group1, int(0.9*_n_group1), replace=False)
train_idxs2 = np.random.choice(_n_group2, int(0.9*_n_group2), replace=False)

# The same draws select the PBE and SCAN training structures; every other
# sample of each functional goes to validation.
vss_pbe_train_idxs = [vss_pbe_common_idxs1[k] for k in train_idxs1]
vss_pbe_train_idxs += [vss_pbe_common_idxs2[k] for k in train_idxs2]
_pbe_train_lookup = set(vss_pbe_train_idxs)
vss_pbe_val_idxs = [pos for pos in range(len(vss_pbe_y)) if pos not in _pbe_train_lookup]

vss_scan_train_idxs = [vss_scan_common_idxs1[k] for k in train_idxs1]
vss_scan_train_idxs += [vss_scan_common_idxs2[k] for k in train_idxs2]
_scan_train_lookup = set(vss_scan_train_idxs)
vss_scan_val_idxs = [pos for pos in range(len(vss_scan_y)) if pos not in _scan_train_lookup]

#make the (unscaled) datasets
vss_pbe_train_X = [vss_pbe_X[pos] for pos in vss_pbe_train_idxs]
vss_pbe_val_X = [vss_pbe_X[pos] for pos in vss_pbe_val_idxs]
vss_scan_train_X = [vss_scan_X[pos] for pos in vss_scan_train_idxs]
vss_scan_val_X = [vss_scan_X[pos] for pos in vss_scan_val_idxs]
#note: pbe and scan train features are identical, can just use one
vss_pbe_train_y = [vss_pbe_y[pos] for pos in vss_pbe_train_idxs]
vss_pbe_val_y = [vss_pbe_y[pos] for pos in vss_pbe_val_idxs]
vss_scan_train_y = [vss_scan_y[pos] for pos in vss_scan_train_idxs]
vss_scan_val_y = [vss_scan_y[pos] for pos in vss_scan_val_idxs]

# Standardise the targets. Each scaler is fitted on the VSS-452 *training*
# targets only and then applied to the training, validation and CSD-76 targets
# of the same functional; results are flattened to 1-D arrays.
pbe_target_scaler = StandardScaler()
pbe_target_scaler.fit(vss_pbe_train_y)
scan_target_scaler = StandardScaler()
scan_target_scaler.fit(vss_scan_train_y)

vss_pbe_train_y = np.ravel(pbe_target_scaler.transform(vss_pbe_train_y))
vss_scan_train_y = np.ravel(scan_target_scaler.transform(vss_scan_train_y))
vss_pbe_val_y = np.ravel(pbe_target_scaler.transform(vss_pbe_val_y))
vss_scan_val_y = np.ravel(scan_target_scaler.transform(vss_scan_val_y))
csd_pbe_y = np.ravel(pbe_target_scaler.transform(csd_pbe_y))
csd_scan_y = np.ravel(scan_target_scaler.transform(csd_scan_y))

In [4]:
# PBE0 model: assemble the datasets, load the trained network and report
# train / validation / CSD-76 metrics in the original (unscaled) target units.
X_train = np.array(vss_pbe_train_X)
X_val = np.array(vss_pbe_val_X)
X_test = np.array(csd_pbe_X)

y_train = np.array(vss_pbe_train_y)
y_val = np.array(vss_pbe_val_y)
y_test = np.array(csd_pbe_y)
y_scaler = pbe_target_scaler

atoms  = ["X", "H", "C", "N", "O", "F", "Cr", "Mn", "Fe", "Co"]

bz = 16

data_tr, data_te = numpy_to_dataset(X_train, y_train, regression=True), numpy_to_dataset(X_val, y_val, regression=True)
tr_l = SubsetDataset(data_tr, list(range(len(data_tr))))
te_l = SubsetDataset(data_te, list(range(len(data_te))))
print("sub labeled dataset length: ", len(tr_l), len(te_l))

l_tr_iter = iter(DataLoader(tr_l, bz, num_workers=num_workers,
                            sampler=InfiniteSampler(len(tr_l))))
l_te_iter = iter(DataLoader(te_l, bz, num_workers=num_workers,
                            sampler=InfiniteSampler(len(te_l))))
# Full-batch loaders used for evaluation.
te_loader = DataLoader(te_l, len(te_l), num_workers=num_workers)
tr_l_loader = DataLoader(tr_l, len(tr_l), num_workers=num_workers)


# NOTE: unpickling can execute arbitrary code; only load trusted model files.
with open('../ml_training/DF-BP/pbe0_opt5/BP_model-pbe.pkl', "rb") as fin:
    best_model = pickle.load(fin)

best_model.eval()


def _predict_unscaled(model, loader, scaler, lower=0, upper=100):
    """Run ``model`` over ``loader`` and return (predictions, labels) unscaled.

    Every batch is flattened and concatenated, then passed to
    ``scaler.inverse_transform`` as an (n_samples, 1) column, so the result
    covers all batches and does not depend on broadcasting a (1, n) row
    against a single-feature scaler. Predictions are clipped to
    [lower, upper]; labels are returned unclipped.
    """
    batch_preds, batch_labels = [], []
    with torch.no_grad():
        for x, y in loader:
            batch_preds.append(model(x.to(device)).cpu().numpy().reshape(-1))
            batch_labels.append(y.cpu().numpy().reshape(-1))
    preds = scaler.inverse_transform(np.concatenate(batch_preds).reshape(-1, 1)).ravel()
    labels = scaler.inverse_transform(np.concatenate(batch_labels).reshape(-1, 1)).ravel()
    return np.clip(preds, lower, upper), labels


pbe_train_preds, pbe_train_labels = _predict_unscaled(best_model, tr_l_loader, y_scaler)
pbe_val_preds, pbe_val_labels = _predict_unscaled(best_model, te_loader, y_scaler)

data_csd = numpy_to_dataset(X_test, y_test, regression=True)
csd_l = SubsetDataset(data_csd, list(range(len(data_csd))))
csd_loader = DataLoader(csd_l, len(csd_l), num_workers=num_workers)

pbe_test_preds, pbe_test_labels = _predict_unscaled(best_model, csd_loader, y_scaler)

pbe_train_r2 = r2_score(pbe_train_labels, pbe_train_preds)
pbe_train_mae = mean_absolute_error(pbe_train_labels, pbe_train_preds)
pbe_val_r2 = r2_score(pbe_val_labels, pbe_val_preds)
pbe_val_mae = mean_absolute_error(pbe_val_labels, pbe_val_preds)
pbe_test_r2 = r2_score(pbe_test_labels, pbe_test_preds)
pbe_test_mae = mean_absolute_error(pbe_test_labels, pbe_test_preds)

print("Training R^2 score: {:.2f}".format(pbe_train_r2))
print("Training MAE: {:.2f}".format(pbe_train_mae))
print("Validation R^2 score: {:.2f}".format(pbe_val_r2))
print("Validation MAE: {:.2f}".format(pbe_val_mae))
print("Testing (CSD76) R^2 score: {:.2f}".format(pbe_test_r2))
print("Testing (CSD76) MAE: {:.2f}".format(pbe_test_mae))

sub labeled dataset length:  297 51
Training R^2 score: 0.84
Training MAE: 3.13
Validation R^2 score: 0.78
Validation MAE: 5.92
Testing (CSD76) R^2 score: 0.51
Testing (CSD76) MAE: 4.36


In [5]:
from dfa_recommender.df_class import get_molecule
from molSimplify.Classes.mol3D import mol3D

def find_list_different(A, B):
    """Return the distinct elements of ``A`` that do not occur in ``B``.

    Duplicates are collapsed, and the order of the returned list is whatever
    set iteration yields, so callers should not rely on it.
    """
    remaining = set(A) - set(B)
    return list(remaining)

def get_zeroth_shell(kulik_mol):
    """Store the result of ``kulik_mol.findMetal()`` on the molecule as ``ozcs``."""
    metal_indices = kulik_mol.findMetal()
    kulik_mol.ozcs = metal_indices

def get_first_shell(kulik_mol):
    """Set ``kulik_mol.ofcs`` to the ``get_fcs()`` indices minus those in ``ozcs``.

    Requires ``get_zeroth_shell`` to have been called on the molecule first.
    """
    coordination_indices = kulik_mol.get_fcs()
    kulik_mol.ofcs = find_list_different(coordination_indices, kulik_mol.ozcs)

def get_second_shell(kulik_mol):
    """Set ``kulik_mol.oscs`` to the atoms bonded to first-shell atoms.

    Atoms already listed in ``ozcs`` or ``ofcs`` are excluded, so both of those
    attributes must have been populated beforehand.
    """
    neighbours = []
    for atom_idx in kulik_mol.ofcs:
        neighbours.extend(kulik_mol.getBondedAtoms(atom_idx))
    without_metal = find_list_different(neighbours, kulik_mol.ozcs)
    kulik_mol.oscs = find_list_different(without_metal, kulik_mol.ofcs)

def get_global_shell(kulik_mol):
    """Set ``kulik_mol.ogcs`` to every atom index not in ``ozcs``/``ofcs``/``oscs``.

    Indices run over ``range(kulik_mol.natoms)``; the three inner-shell
    attributes must already be set on the molecule.
    """
    remaining = list(range(kulik_mol.natoms))
    for inner_shell in (kulik_mol.ozcs, kulik_mol.ofcs):
        remaining = find_list_different(remaining, inner_shell)
    kulik_mol.ogcs = find_list_different(remaining, kulik_mol.oscs)

# Training Set

In [6]:

# Structure names in the same order as the rows of the training dataset.
pbe_idx = [vss_pbe_structs[i] for i in vss_pbe_train_idxs]

# regVAT settings, passed positionally as (device, eps, xi, alpha).
eps = 1.
xi = 1e-3
alpha = 1.
cut = True
vat_criterion = regVAT(device, eps, xi, alpha, k=3, cut=cut)

# Fetch the whole training set as one batch.
l_tr_iter = iter(DataLoader(
    tr_l, len(tr_l), num_workers=0,
    ))
l_x, l_y = next(l_tr_iter)

nl_x = l_x.numpy()
# Normaliser looked up from the integer stored in each atom's last feature
# channel. NOTE(review): key 0 presumably marks padding rows (its tiny value
# gives a denominator of about 5) -- confirm against the featurisation code.
nf_by_type = {0: 1e-6, 1: 6, 2: 25, 3: 25, 4: 25, 5: 25, 6: 58*2, 7: 58*2, 8: 58*2, 9: 58*2}
d_x = vat_criterion(best_model, l_x, return_adv=True).numpy()
# Aggregate the perturbation per atom: sum of absolute values over the feature
# axis divided by (3*nf + 5).
d_aggr = np.zeros(shape=(d_x.shape[0], d_x.shape[1]))
for ii, _d in enumerate(d_x):
    for jj, __d in enumerate(_d):
        nf = nf_by_type[int(nl_x[ii, jj, -1])]
        d_aggr[ii, jj] =  np.sum(np.abs(__d))/(3*nf +5)

tot_mean_shell_focus = {
    "ozcs": [], "ofcs": [], "oscs": [], "ogcs": []
}
for ii in range(l_x.shape[0]):
    xyzfile = "../../vss_data/geometries/VSS-452/%s.xyz"% pbe_idx[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    get_zeroth_shell(kulik_mol=kulik_mol)
    get_first_shell(kulik_mol=kulik_mol)
    get_second_shell(kulik_mol=kulik_mol)
    get_global_shell(kulik_mol=kulik_mol)
    mean_shell_focus = {}
    for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
        inds = getattr(kulik_mol, attr)
        # A molecule may have no atoms in a shell. Record NaN explicitly rather
        # than calling np.mean on an empty selection, which yields the same
        # NaN but emits RuntimeWarnings; the nan-aware summaries below skip it.
        mean_shell_focus[attr] = np.mean(d_aggr[ii][inds]) if len(inds) > 0 else np.nan
        tot_mean_shell_focus[attr] += [mean_shell_focus[attr]]

res = {
    "mean": list(),
    "std": list(),
}
mapping = {
    "ozcs": "metal          ", 
    "ofcs": "first shell    ", 
    "oscs": "second shell   ", 
    "ogcs": "third & global ",
}
print("shell           mean    std. dev.")
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    print(mapping[attr], round(np.nanmean(tot_mean_shell_focus[attr]), 4), round(np.nanstd(tot_mean_shell_focus[attr]), 4))
    res["mean"].append(np.nanmean(tot_mean_shell_focus[attr]))
    # Note: res["std"] stores HALF the standard deviation, unlike the printout.
    res["std"].append(0.5*np.nanstd(tot_mean_shell_focus[attr]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


shell           mean    std. dev.
metal           0.0204 0.0023
first shell     0.0106 0.0047
second shell    0.0059 0.0041
third & global  0.003 0.0018


In [7]:
# Detailed look at one training-set structure: per-atom and per-shell
# aggregated perturbations (cube-root scaled) next to its coordinating atoms.
ii = 44
xyzfile = "../../vss_data/geometries/VSS-452/%s.xyz"% pbe_idx[ii]

# Try spin=4 first and fall back to spin=5. Catch Exception rather than using
# a bare except so Ctrl-C / SystemExit are not swallowed by the fallback.
try:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=4
        )
except Exception:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=5
        )
kulik_mol = mol3D()
kulik_mol.readfromxyz(xyzfile)
# Row positions of d_aggr[ii] ordered from largest to smallest value; the
# first seven are compared with get_fcs() below.
ranked_atoms = list(reversed((np.argsort(d_aggr[ii]))))
rev_max = ranked_atoms[:7]
fcs = kulik_mol.get_fcs()
fdict = {}
for _ii in range(kulik_mol.natoms):
    fdict["%d-%s"%(_ii, kulik_mol.getAtom(_ii).symbol())] = np.power(d_aggr[ii][_ii], 1./3)
get_zeroth_shell(kulik_mol=kulik_mol)
get_first_shell(kulik_mol=kulik_mol)
get_second_shell(kulik_mol=kulik_mol)
get_global_shell(kulik_mol=kulik_mol)
mean_shell_focus = {}
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    inds = getattr(kulik_mol, attr)
    mean_shell_focus[attr] = np.power(np.mean(d_aggr[ii][inds]), 1./3)
print("xyzfile: ", xyzfile)
print("catoms: ", [kulik_mol.getAtom(_ii).symbol() for _ii in fcs], fcs)
print("actual max: ", [kulik_mol.getAtom(_ii).symbol() for _ii in rev_max], rev_max)
print("diff compared to fcs: ", set(fcs).difference(set(rev_max)))
print(fcs, ranked_atoms)
print("fdict: ", fdict)
print("mean_shell_focus: ", mean_shell_focus)



c: [2.0]
fc: [2.0]
m: [4]
fm: [4]
xyzfile:  ../../vss_data/geometries/VSS-452/mn_3_cyanide-C-0-d1_cyanide-C-0-d1_cyanide-C-0-d1_formaldehyde-O-1-0_formaldehyde-O-1-0_formaldehyde-O-1-0_5.xyz
catoms:  ['Mn', 'C', 'C', 'C', 'O', 'O', 'O'] [0, 1, 3, 5, 8, 12, 16]
actual max:  ['Mn', 'O', 'O', 'O', 'N', 'N', 'N'] [0, 16, 8, 12, 2, 4, 6]
diff compared to fcs:  {1, 3, 5}
[0, 1, 3, 5, 8, 12, 16] [0, 16, 8, 12, 2, 4, 6, 3, 1, 5, 11, 15, 7, 18, 14, 17, 13, 9, 10, 24, 29, 28, 27, 26, 25, 21, 23, 22, 20, 19, 31, 30, 64, 63, 33, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32]
fdict:  {'0-Mn': 0.28146779012094003, '1-C': 0.16121287057047676, '2-N': 0.1829207558578598, '3-C': 0.16207965733014662, '4-N': 0.1827981599904229, '5-C': 0.16113867345531752, '6-N': 0.18276189787186828, '7-C': 0.16021868653937696, '8-O': 0.23366173126514606, '9-H': 0.10796187158703531, '10-H': 0.10794949804936585, '11-C': 0.16026628214805896, '12-O': 0

In [8]:
# Detailed look at a second training-set structure: per-atom and per-shell
# aggregated perturbations (cube-root scaled) next to its coordinating atoms.
ii = 249
xyzfile = "../../vss_data/geometries/VSS-452/%s.xyz"% pbe_idx[ii]

# Try spin=4 first and fall back to spin=5. Catch Exception rather than using
# a bare except so Ctrl-C / SystemExit are not swallowed by the fallback.
try:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=4
        )
except Exception:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=5
        )
kulik_mol = mol3D()
kulik_mol.readfromxyz(xyzfile)
# Row positions of d_aggr[ii] ordered from largest to smallest value; the
# first seven are compared with get_fcs() below.
ranked_atoms = list(reversed((np.argsort(d_aggr[ii]))))
rev_max = ranked_atoms[:7]
fcs = kulik_mol.get_fcs()
fdict = {}
for _ii in range(kulik_mol.natoms):
    fdict["%d-%s"%(_ii, kulik_mol.getAtom(_ii).symbol())] = np.power(d_aggr[ii][_ii], 1./3)
get_zeroth_shell(kulik_mol=kulik_mol)
get_first_shell(kulik_mol=kulik_mol)
get_second_shell(kulik_mol=kulik_mol)
get_global_shell(kulik_mol=kulik_mol)
mean_shell_focus = {}
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    inds = getattr(kulik_mol, attr)
    mean_shell_focus[attr] = np.power(np.mean(d_aggr[ii][inds]), 1./3)
print("xyzfile: ", xyzfile)
print("catoms: ", [kulik_mol.getAtom(_ii).symbol() for _ii in fcs], fcs)
print("actual max: ", [kulik_mol.getAtom(_ii).symbol() for _ii in rev_max], rev_max)
print("diff compared to fcs: ", set(fcs).difference(set(rev_max)))
print(fcs, ranked_atoms)
print("fdict: ", fdict)
print("mean_shell_focus: ", mean_shell_focus)



c: [2.0]
fc: [2.0]
m: [4]
fm: [4]
xyzfile:  ../../vss_data/geometries/VSS-452/cr_2_cs-C-0-0_cs-C-0-0_cs-C-0-0_cs-C-0-0_ome2-O-0-0_ome2-O-0-0_5.xyz
catoms:  ['Cr', 'C', 'C', 'C', 'C', 'O', 'O'] [0, 1, 3, 5, 7, 9, 18]
actual max:  ['Cr', 'S', 'S', 'S', 'S', 'O', 'O'] [0, 4, 8, 2, 6, 9, 18]
diff compared to fcs:  {1, 3, 5, 7}
[0, 1, 3, 5, 7, 9, 18] [0, 4, 8, 2, 6, 9, 18, 5, 1, 7, 3, 10, 11, 20, 19, 21, 24, 12, 15, 26, 23, 13, 16, 17, 22, 25, 14, 31, 27, 28, 29, 30, 64, 63, 33, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32]
fdict:  {'0-Cr': 0.2752165940725832, '1-C': 0.15475300176058038, '2-S': 0.21710632385324552, '3-C': 0.15471989791461865, '4-S': 0.21724888698639255, '5-C': 0.15475409581793007, '6-S': 0.21710378945965828, '7-C': 0.15472090944352565, '8-S': 0.21724470885015634, '9-O': 0.21669631611194096, '10-C': 0.15335486018609798, '11-C': 0.1533540364829818, '12-H': 0.10487779121422579, '13-H': 0.10480980936482

# Validation Set

In [9]:

# Structure names in the same order as the rows of the validation dataset.
pbe_idx = [vss_pbe_structs[i] for i in vss_pbe_val_idxs]

# regVAT settings, passed positionally as (device, eps, xi, alpha).
eps = 1.
xi = 1e-3
alpha = 1.
cut = True
vat_criterion = regVAT(device, eps, xi, alpha, k=3, cut=cut)


# Fetch the whole validation set as one batch.
l_te_iter = iter(DataLoader(
    te_l, len(te_l), num_workers=0,
    ))
l_x, l_y = next(l_te_iter)

nl_x = l_x.numpy()
# Normaliser looked up from the integer stored in each atom's last feature
# channel. NOTE(review): key 0 presumably marks padding rows (its tiny value
# gives a denominator of about 5) -- confirm against the featurisation code.
nf_by_type = {0: 1e-6, 1: 6, 2: 25, 3: 25, 4: 25, 5: 25, 6: 58*2, 7: 58*2, 8: 58*2, 9: 58*2}
d_x = vat_criterion(best_model, l_x, return_adv=True).numpy()
# Aggregate the perturbation per atom: sum of absolute values over the feature
# axis divided by (3*nf + 5).
d_aggr = np.zeros(shape=(d_x.shape[0], d_x.shape[1]))
for ii, _d in enumerate(d_x):
    for jj, __d in enumerate(_d):
        nf = nf_by_type[int(nl_x[ii, jj, -1])]
        d_aggr[ii, jj] =  np.sum(np.abs(__d))/(3*nf +5)

tot_mean_shell_focus = {
    "ozcs": [], "ofcs": [], "oscs": [], "ogcs": []
}
for ii in range(l_x.shape[0]):
    xyzfile = "../../vss_data/geometries/VSS-452/%s.xyz"% pbe_idx[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    get_zeroth_shell(kulik_mol=kulik_mol)
    get_first_shell(kulik_mol=kulik_mol)
    get_second_shell(kulik_mol=kulik_mol)
    get_global_shell(kulik_mol=kulik_mol)
    mean_shell_focus = {}
    for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
        inds = getattr(kulik_mol, attr)
        # A molecule may have no atoms in a shell. Record NaN explicitly rather
        # than calling np.mean on an empty selection, which yields the same
        # NaN but emits RuntimeWarnings; the nan-aware summaries below skip it.
        mean_shell_focus[attr] = np.mean(d_aggr[ii][inds]) if len(inds) > 0 else np.nan
        tot_mean_shell_focus[attr] += [mean_shell_focus[attr]]

res = {
    "mean": list(),
    "std": list(),
}
mapping = {
    "ozcs": "metal          ", 
    "ofcs": "first shell    ", 
    "oscs": "second shell   ", 
    "ogcs": "third & global ",
}
print("shell           mean    std. dev.")
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    print(mapping[attr], round(np.nanmean(tot_mean_shell_focus[attr]), 4), round(np.nanstd(tot_mean_shell_focus[attr]), 4))
    res["mean"].append(np.nanmean(tot_mean_shell_focus[attr]))
    # Note: res["std"] stores HALF the standard deviation, unlike the printout.
    res["std"].append(0.5*np.nanstd(tot_mean_shell_focus[attr]))

shell           mean    std. dev.
metal           0.0206 0.0023
first shell     0.0097 0.0046
second shell    0.0059 0.0045
third & global  0.0029 0.0023


In [10]:
# Detailed look at one validation-set structure: per-atom and per-shell
# aggregated perturbations (cube-root scaled) next to its coordinating atoms.
ii = 12
xyzfile = "../../vss_data/geometries/VSS-452/%s.xyz"% pbe_idx[ii]

# Try spin=4 first and fall back to spin=5. Catch Exception rather than using
# a bare except so Ctrl-C / SystemExit are not swallowed by the fallback.
try:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=4
        )
except Exception:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=5
        )
kulik_mol = mol3D()
kulik_mol.readfromxyz(xyzfile)
# Row positions of d_aggr[ii] ordered from largest to smallest value; the
# first seven are compared with get_fcs() below.
ranked_atoms = list(reversed((np.argsort(d_aggr[ii]))))
rev_max = ranked_atoms[:7]
fcs = kulik_mol.get_fcs()
fdict = {}
for _ii in range(kulik_mol.natoms):
    fdict["%d-%s"%(_ii, kulik_mol.getAtom(_ii).symbol())] = np.power(d_aggr[ii][_ii], 1./3)
get_zeroth_shell(kulik_mol=kulik_mol)
get_first_shell(kulik_mol=kulik_mol)
get_second_shell(kulik_mol=kulik_mol)
get_global_shell(kulik_mol=kulik_mol)
mean_shell_focus = {}
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    inds = getattr(kulik_mol, attr)
    mean_shell_focus[attr] = np.power(np.mean(d_aggr[ii][inds]), 1./3)
print("xyzfile: ", xyzfile)
print("catoms: ", [kulik_mol.getAtom(_ii).symbol() for _ii in fcs], fcs)
print("actual max: ", [kulik_mol.getAtom(_ii).symbol() for _ii in rev_max], rev_max)
print("diff compared to fcs: ", set(fcs).difference(set(rev_max)))
print(fcs, ranked_atoms)
print("fdict: ", fdict)
print("mean_shell_focus: ", mean_shell_focus)



c: [2.0]
fc: [2.0]
m: [4]
fm: [4]
xyzfile:  ../../vss_data/geometries/VSS-452/cr_2_phosphine-P-0-0_phosphine-P-0-0_phosphine-P-0-0_phosphine-P-0-0_phosphine-P-0-0_acetonitrile-N-1-0_5.xyz
catoms:  ['Cr', 'P', 'P', 'P', 'P', 'P', 'N'] [0, 1, 5, 9, 13, 17, 22]
actual max:  ['Cr', 'N', 'P', 'P', 'P', 'P', 'P'] [0, 22, 1, 9, 13, 17, 5]
diff compared to fcs:  set()
[0, 1, 5, 9, 13, 17, 22] [0, 22, 1, 9, 13, 17, 5, 23, 21, 16, 19, 6, 25, 24, 14, 8, 20, 26, 18, 7, 4, 2, 15, 11, 3, 12, 10, 31, 27, 28, 29, 30, 64, 63, 33, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32]
fdict:  {'0-Cr': 0.2805129701530542, '1-P': 0.1781022092481307, '2-H': 0.10735589750956613, '3-H': 0.10733491172660091, '4-H': 0.10735720445703734, '5-P': 0.17721356979039674, '6-H': 0.10754878924155117, '7-H': 0.10736796799694966, '8-H': 0.10744076194664372, '9-P': 0.17777913709002727, '10-H': 0.10731594999679807, '11-H': 0.10733785463510566, '12-H': 0.107

In [11]:
# Detailed look at a second validation-set structure: per-atom and per-shell
# aggregated perturbations (cube-root scaled) next to its coordinating atoms.
ii = 21
xyzfile = "../../vss_data/geometries/VSS-452/%s.xyz"% pbe_idx[ii]

# Try spin=4 first and fall back to spin=5. Catch Exception rather than using
# a bare except so Ctrl-C / SystemExit are not swallowed by the fallback.
try:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=4
        )
except Exception:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=5
        )
kulik_mol = mol3D()
kulik_mol.readfromxyz(xyzfile)
# Row positions of d_aggr[ii] ordered from largest to smallest value; the
# first seven are compared with get_fcs() below.
ranked_atoms = list(reversed((np.argsort(d_aggr[ii]))))
rev_max = ranked_atoms[:7]
fcs = kulik_mol.get_fcs()
fdict = {}
for _ii in range(kulik_mol.natoms):
    fdict["%d-%s"%(_ii, kulik_mol.getAtom(_ii).symbol())] = np.power(d_aggr[ii][_ii], 1./3)
get_zeroth_shell(kulik_mol=kulik_mol)
get_first_shell(kulik_mol=kulik_mol)
get_second_shell(kulik_mol=kulik_mol)
get_global_shell(kulik_mol=kulik_mol)
mean_shell_focus = {}
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    inds = getattr(kulik_mol, attr)
    mean_shell_focus[attr] = np.power(np.mean(d_aggr[ii][inds]), 1./3)
print("xyzfile: ", xyzfile)
print("catoms: ", [kulik_mol.getAtom(_ii).symbol() for _ii in fcs], fcs)
print("actual max: ", [kulik_mol.getAtom(_ii).symbol() for _ii in rev_max], rev_max)
print("diff compared to fcs: ", set(fcs).difference(set(rev_max)))
print(fcs, ranked_atoms)
print("fdict: ", fdict)
print("mean_shell_focus: ", mean_shell_focus)



c: [2.0]
fc: [2.0]
m: [4]
fm: [4]
xyzfile:  ../../vss_data/geometries/VSS-452/fe_2_ammonia-N-0-0_ammonia-N-0-0_ammonia-N-0-0_ammonia-N-0-0_ammonia-N-0-0_formaldehyde-O-1-0_5.xyz
catoms:  ['Fe', 'N', 'N', 'N', 'N', 'N', 'O'] [0, 1, 5, 9, 13, 17, 22]
actual max:  ['O', 'Fe', 'N', 'N', 'N', 'N', 'N'] [22, 0, 13, 1, 5, 17, 9]
diff compared to fcs:  set()
[0, 1, 5, 9, 13, 17, 22] [22, 0, 13, 1, 5, 17, 9, 21, 19, 18, 20, 4, 12, 8, 16, 2, 7, 10, 24, 14, 23, 11, 15, 6, 3, 31, 25, 26, 27, 28, 29, 30, 64, 63, 33, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32]
fdict:  {'0-Fe': 0.27692987905933963, '1-N': 0.22665725234372985, '2-H': 0.13485232482476311, '3-H': 0.13483666804834046, '4-H': 0.13487800083704077, '5-N': 0.22663438400681044, '6-H': 0.13483746686058545, '7-H': 0.13485139852461847, '8-H': 0.13486199974419824, '9-N': 0.22597983101358604, '10-H': 0.13484808214700608, '11-H': 0.13484084908710972, '12-H': 0.13486372439

# Test Set

In [12]:

# Structure names in the same order as the rows of the CSD-76 test dataset.
pbe_idx = list(csd_pbe_structs)

# regVAT settings, passed positionally as (device, eps, xi, alpha).
eps = 1.
xi = 1e-3
alpha = 1.
cut = True
vat_criterion = regVAT(device, eps, xi, alpha, k=3, cut=cut)


# Fetch the whole test set as one batch.
l_csd_iter = iter(DataLoader(
    csd_l, len(csd_l), num_workers=0,
    ))
l_x, l_y = next(l_csd_iter)

nl_x = l_x.numpy()
# Normaliser looked up from the integer stored in each atom's last feature
# channel. NOTE(review): key 0 presumably marks padding rows (its tiny value
# gives a denominator of about 5) -- confirm against the featurisation code.
nf_by_type = {0: 1e-6, 1: 6, 2: 25, 3: 25, 4: 25, 5: 25, 6: 58*2, 7: 58*2, 8: 58*2, 9: 58*2}
d_x = vat_criterion(best_model, l_x, return_adv=True).numpy()
# Aggregate the perturbation per atom: sum of absolute values over the feature
# axis divided by (3*nf + 5).
d_aggr = np.zeros(shape=(d_x.shape[0], d_x.shape[1]))
for ii, _d in enumerate(d_x):
    for jj, __d in enumerate(_d):
        nf = nf_by_type[int(nl_x[ii, jj, -1])]
        d_aggr[ii, jj] =  np.sum(np.abs(__d))/(3*nf +5)

tot_mean_shell_focus = {
    "ozcs": [], "ofcs": [], "oscs": [], "ogcs": []
}
for ii in range(l_x.shape[0]):
    xyzfile = "../../vss_data/geometries/CSD-76/%s.xyz"% pbe_idx[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    get_zeroth_shell(kulik_mol=kulik_mol)
    get_first_shell(kulik_mol=kulik_mol)
    get_second_shell(kulik_mol=kulik_mol)
    get_global_shell(kulik_mol=kulik_mol)
    mean_shell_focus = {}
    for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
        inds = getattr(kulik_mol, attr)
        # A molecule may have no atoms in a shell. Record NaN explicitly rather
        # than calling np.mean on an empty selection, which yields the same
        # NaN but emits RuntimeWarnings; the nan-aware summaries below skip it.
        mean_shell_focus[attr] = np.mean(d_aggr[ii][inds]) if len(inds) > 0 else np.nan
        tot_mean_shell_focus[attr] += [mean_shell_focus[attr]]

res = {
    "mean": list(),
    "std": list(),
}
mapping = {
    "ozcs": "metal          ", 
    "ofcs": "first shell    ", 
    "oscs": "second shell   ", 
    "ogcs": "third & global ",
}
print("shell           mean    std. dev.")
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    print(mapping[attr], round(np.nanmean(tot_mean_shell_focus[attr]), 4), round(np.nanstd(tot_mean_shell_focus[attr]), 4))
    res["mean"].append(np.nanmean(tot_mean_shell_focus[attr]))
    # Note: res["std"] stores HALF the standard deviation, unlike the printout.
    res["std"].append(0.5*np.nanstd(tot_mean_shell_focus[attr]))

shell           mean    std. dev.
metal           0.0192 0.0021
first shell     0.0124 0.0037
second shell    0.0065 0.0042
third & global  0.0047 0.0045


In [13]:
# Detailed look at one CSD-76 test structure: per-atom and per-shell
# aggregated perturbations (cube-root scaled) next to its coordinating atoms.
ii = 60
xyzfile = "../../vss_data/geometries/CSD-76/%s.xyz"% pbe_idx[ii]

# Try spin=4 first and fall back to spin=5. Catch Exception rather than using
# a bare except so Ctrl-C / SystemExit are not swallowed by the fallback.
try:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=4
        )
except Exception:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=5
        )
kulik_mol = mol3D()
kulik_mol.readfromxyz(xyzfile)
# Row positions of d_aggr[ii] ordered from largest to smallest value; the
# first seven are compared with get_fcs() below.
ranked_atoms = list(reversed((np.argsort(d_aggr[ii]))))
rev_max = ranked_atoms[:7]
fcs = kulik_mol.get_fcs()
fdict = {}
for _ii in range(kulik_mol.natoms):
    fdict["%d-%s"%(_ii, kulik_mol.getAtom(_ii).symbol())] = np.power(d_aggr[ii][_ii], 1./3)
get_zeroth_shell(kulik_mol=kulik_mol)
get_first_shell(kulik_mol=kulik_mol)
get_second_shell(kulik_mol=kulik_mol)
get_global_shell(kulik_mol=kulik_mol)
mean_shell_focus = {}
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    inds = getattr(kulik_mol, attr)
    mean_shell_focus[attr] = np.power(np.mean(d_aggr[ii][inds]), 1./3)
print("xyzfile: ", xyzfile)
print("catoms: ", [kulik_mol.getAtom(_ii).symbol() for _ii in fcs], fcs)
print("actual max: ", [kulik_mol.getAtom(_ii).symbol() for _ii in rev_max], rev_max)
print("diff compared to fcs: ", set(fcs).difference(set(rev_max)))
print(fcs, ranked_atoms)
print("fdict: ", fdict)
print("mean_shell_focus: ", mean_shell_focus)



c: [2.0]
fc: [2.0]
m: [4]
fm: [4]
xyzfile:  ../../vss_data/geometries/CSD-76/Mn_FARTOH_comp_0_3111.xyz
catoms:  ['Mn', 'N', 'N', 'N', 'N', 'N', 'N'] [0, 1, 2, 4, 5, 7, 8]
actual max:  ['Mn', 'N', 'N', 'N', 'N', 'N', 'N'] [0, 7, 1, 4, 9, 5, 12]
diff compared to fcs:  {8, 2}
[0, 1, 2, 4, 5, 7, 8] [0, 7, 1, 4, 9, 5, 12, 6, 2, 3, 11, 8, 10, 13, 14, 18, 15, 16, 17, 33, 25, 28, 19, 23, 21, 26, 22, 30, 24, 29, 20, 27, 31, 32, 63, 49, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 48, 34, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 64]
fdict:  {'0-Mn': 0.28130459579713507, '1-N': 0.19121088050910323, '2-N': 0.18984345727390978, '3-N': 0.189834264333315, '4-N': 0.19121020802754096, '5-N': 0.19065789742753902, '6-N': 0.19013602213554878, '7-N': 0.19143199942308847, '8-N': 0.1888544411534633, '9-N': 0.1911814023563683, '10-N': 0.18660943899052287, '11-N': 0.18964640661764476, '12-N': 0.19036052783895133, '13-C': 0.1673001476488106, '14-C': 0.16726484293922397, '15-C': 0.16722991850

In [14]:
# Detailed look at a second CSD-76 test structure: per-atom and per-shell
# aggregated perturbations (cube-root scaled) next to its coordinating atoms.
ii = 9
xyzfile = "../../vss_data/geometries/CSD-76/%s.xyz"% pbe_idx[ii]

# Try spin=4 first and fall back to spin=5. Catch Exception rather than using
# a bare except so Ctrl-C / SystemExit are not swallowed by the fallback.
try:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=4
        )
except Exception:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=5
        )
kulik_mol = mol3D()
kulik_mol.readfromxyz(xyzfile)
# Row positions of d_aggr[ii] ordered from largest to smallest value; the
# first seven are compared with get_fcs() below.
ranked_atoms = list(reversed((np.argsort(d_aggr[ii]))))
rev_max = ranked_atoms[:7]
fcs = kulik_mol.get_fcs()
fdict = {}
for _ii in range(kulik_mol.natoms):
    fdict["%d-%s"%(_ii, kulik_mol.getAtom(_ii).symbol())] = np.power(d_aggr[ii][_ii], 1./3)
get_zeroth_shell(kulik_mol=kulik_mol)
get_first_shell(kulik_mol=kulik_mol)
get_second_shell(kulik_mol=kulik_mol)
get_global_shell(kulik_mol=kulik_mol)
mean_shell_focus = {}
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    inds = getattr(kulik_mol, attr)
    mean_shell_focus[attr] = np.power(np.mean(d_aggr[ii][inds]), 1./3)
print("xyzfile: ", xyzfile)
print("catoms: ", [kulik_mol.getAtom(_ii).symbol() for _ii in fcs], fcs)
print("actual max: ", [kulik_mol.getAtom(_ii).symbol() for _ii in rev_max], rev_max)
print("diff compared to fcs: ", set(fcs).difference(set(rev_max)))
print(fcs, ranked_atoms)
print("fdict: ", fdict)
print("mean_shell_focus: ", mean_shell_focus)

xyzfile:  ../../vss_data/geometries/CSD-76/Co_DETCON_comp_0_33.xyz
catoms:  ['Co', 'N', 'N', 'N', 'N', 'N', 'N'] [0, 9, 10, 11, 12, 13, 14]
actual max:  ['Co', 'N', 'N', 'N', 'N', 'N', 'N'] [0, 10, 13, 14, 11, 12, 9]
diff compared to fcs:  set()
[0, 9, 10, 11, 12, 13, 14] [0, 10, 13, 14, 11, 12, 9, 7, 5, 3, 6, 2, 1, 4, 8, 20, 23, 31, 30, 15, 28, 26, 40, 24, 37, 27, 25, 19, 36, 39, 33, 18, 16, 34, 29, 21, 32, 17, 22, 38, 35, 63, 52, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 50, 51, 49, 48, 47, 46, 45, 44, 43, 42, 41, 64]
fdict:  {'0-Co': 0.2750111488632742, '1-C': 0.1821168248491713, '2-C': 0.18216378527520155, '3-C': 0.18222618578028796, '4-C': 0.18209998641242917, '5-C': 0.18226303889723555, '6-C': 0.18221628670036855, '7-C': 0.18227838965357798, '8-C': 0.1820943242237772, '9-N': 0.21317418263565938, '10-N': 0.2202220377460938, '11-N': 0.21399148523033068, '12-N': 0.21332932281309547, '13-N': 0.21778445106662936, '14-N': 0.21473643056015468, '15-H': 0.12699795014741558, '16-H': 0.126541

# SCAN0 model

In [15]:
# Evaluate the pickled SCAN0 DF-BP model on the training, validation and
# CSD-76 test data and report R^2 / MAE on the un-scaled targets.
X_train = np.array(vss_scan_train_X)
X_val = np.array(vss_scan_val_X)
X_test = np.array(csd_scan_X)

y_train = np.array(vss_scan_train_y)
y_val = np.array(vss_scan_val_y)
y_test = np.array(csd_scan_y)
y_scaler = scan_target_scaler

# Not used in this cell.
# NOTE(review): presumably the element order behind the integer atom-type
# codes stored in the features -- TODO confirm.
atoms  = ["X", "H", "C", "N", "O", "F", "Cr", "Mn", "Fe", "Co"]

bz = 16

# Here "te" holds the validation split; the CSD-76 test data are wrapped
# further below as data_csd / csd_l.
data_tr, data_te = numpy_to_dataset(X_train, y_train, regression=True), numpy_to_dataset(X_val, y_val, regression=True)
tr_l = SubsetDataset(data_tr, list(range(len(data_tr))))
te_l = SubsetDataset(data_te, list(range(len(data_te))))
print("sub labeled dataset length: ", len(tr_l), len(te_l))

# Mini-batch iterators over an InfiniteSampler. They are not consumed in this
# cell; they are still created, in the original order, so that the global
# state left behind by this cell is unchanged.
l_tr_iter = iter(DataLoader(tr_l, bz, num_workers=num_workers,
                            sampler=InfiniteSampler(len(tr_l))))
l_te_iter = iter(DataLoader(te_l, bz, num_workers=num_workers,
                            sampler=InfiniteSampler(len(te_l))))
# Full-batch loaders: each yields its whole split as a single batch.
te_loader = DataLoader(te_l, len(te_l), num_workers=num_workers)
tr_l_loader = DataLoader(tr_l, len(tr_l), num_workers=num_workers)


# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files from a trusted source.
# The context manager closes the file handle, which the previous
# pickle.load(open(...)) form left open.
with open('../ml_training/DF-BP/scan0_opt5/BP_model-scan.pkl', "rb") as model_file:
    best_model = pickle.load(model_file)

best_model.eval()
scan_train_preds = []
scan_train_labels = []
with torch.no_grad():
    for x, y in tr_l_loader:
        _pred = best_model(x.to(device))
        scan_train_preds.append(_pred.cpu().numpy())
        scan_train_labels.append(y.cpu().numpy())

# Each list holds a single full-batch array; inverse_transform receives the
# list and [0] selects that one batch again.
scan_train_preds = y_scaler.inverse_transform(scan_train_preds)[0]
scan_train_labels = y_scaler.inverse_transform(scan_train_labels)[0]

scan_val_preds = []
scan_val_labels = []
with torch.no_grad():
    for x, y in te_loader:
        _pred = best_model(x.to(device))
        scan_val_preds.append(_pred.cpu().numpy())
        scan_val_labels.append(y.cpu().numpy())

scan_val_preds = y_scaler.inverse_transform(scan_val_preds)[0]
scan_val_labels = y_scaler.inverse_transform(scan_val_labels)[0]

data_csd = numpy_to_dataset(X_test, y_test, regression=True)
csd_l = SubsetDataset(data_csd, list(range(len(data_csd))))
csd_loader = DataLoader(csd_l, len(csd_l), num_workers=num_workers)

scan_test_preds = []
scan_test_labels = []
with torch.no_grad():
    for x, y in csd_loader:
        _pred = best_model(x.to(device))
        scan_test_preds.append(_pred.cpu().numpy())
        scan_test_labels.append(y.cpu().numpy())

scan_test_preds = y_scaler.inverse_transform(scan_test_preds)[0]
scan_test_labels = y_scaler.inverse_transform(scan_test_labels)[0]

# Clamp predictions (not labels) to [0, 100] in place.
# NOTE(review): presumably the target is a percentage -- TODO confirm.
for arr in [scan_train_preds, scan_val_preds, scan_test_preds]:
    np.clip(arr, 0, 100, out=arr)

scan_train_r2 = r2_score(scan_train_labels, scan_train_preds)
scan_train_mae = mean_absolute_error(scan_train_labels, scan_train_preds)
scan_val_r2 = r2_score(scan_val_labels, scan_val_preds)
scan_val_mae = mean_absolute_error(scan_val_labels, scan_val_preds)
scan_test_r2 = r2_score(scan_test_labels, scan_test_preds)
scan_test_mae = mean_absolute_error(scan_test_labels, scan_test_preds)

print("Training R^2 score: {:.2f}".format(scan_train_r2))
print("Training MAE: {:.2f}".format(scan_train_mae))
print("Validation R^2 score: {:.2f}".format(scan_val_r2))
print("Validation MAE: {:.2f}".format(scan_val_mae))
print("Testing (CSD76) R^2 score: {:.2f}".format(scan_test_r2))
print("Testing (CSD76) MAE: {:.2f}".format(scan_test_mae))

sub labeled dataset length:  297 34
Training R^2 score: 0.46
Training MAE: 6.89
Validation R^2 score: 0.62
Validation MAE: 5.53
Testing (CSD76) R^2 score: 0.44
Testing (CSD76) MAE: 4.75


In [16]:

# VAT shell-focus analysis of the SCAN0 model on the VSS-452 training split:
# compute the adversarial perturbation per atom, normalise it, and average it
# over the metal / first / second / global shells of every structure.

# Structure names used to locate the xyz files; entry ii is paired with row ii
# of the batch drawn from tr_l below.
scan_idx = [vss_scan_structs[i] for i in vss_scan_train_idxs]

# regVAT settings, passed positionally as (device, eps, xi, alpha).
eps = 1.
xi = 1e-3
alpha = 1.
cut = True
vat_criterion = regVAT(device, eps, xi, alpha, k=3, cut=cut)

# One batch containing the whole training split (batch size == len(tr_l)).
l_tr_iter = iter(DataLoader(
    tr_l, len(tr_l), num_workers=0,
    ))
l_x, l_y = next(l_tr_iter)

nl_x = l_x.numpy()
# Normalisation count looked up from the integer stored in the last feature
# column of each atom. Kept under its own name so it is not confused with the
# label dict `mapping` defined further down.
# NOTE(review): presumably keyed by atom-type code (0 = padding, 1 = H,
# 2-5 = C/N/O/F, 6-9 = metals) -- TODO confirm.
nf_by_atom_type = {0: 1e-6, 1: 6, 2: 25, 3: 25, 4: 25, 5: 25, 6: 58*2, 7: 58*2, 8: 58*2, 9: 58*2}
d_x = vat_criterion(best_model, l_x, return_adv=True).numpy()

# d_aggr[ii, jj]: summed absolute perturbation of atom jj in structure ii,
# divided by (3*nf + 5).
d_aggr = np.zeros(shape=(d_x.shape[0], d_x.shape[1]))
for ii, _d in enumerate(d_x):
    for jj, __d in enumerate(_d):
        nf = nf_by_atom_type[int(nl_x[ii, jj, -1])]
        d_aggr[ii, jj] =  np.sum(np.abs(__d))/(3*nf +5)

tot_mean_shell_focus = {
    "ozcs": [], "ofcs": [], "oscs": [], "ogcs": []
}
for ii in range(l_x.shape[0]):
    xyzfile = "../../vss_data/geometries/VSS-452/%s.xyz"% scan_idx[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    # The helpers attach the index lists read back via getattr below.
    get_zeroth_shell(kulik_mol=kulik_mol)
    get_first_shell(kulik_mol=kulik_mol)
    get_second_shell(kulik_mol=kulik_mol)
    get_global_shell(kulik_mol=kulik_mol)
    mean_shell_focus = {}
    for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
        inds = getattr(kulik_mol, attr)
        shell_focus = d_aggr[ii][inds]
        # A structure with no atoms in a shell gives an empty selection.
        # np.mean would return NaN there and emit a RuntimeWarning, so NaN is
        # recorded directly; nanmean/nanstd below skip it as before.
        mean_shell_focus[attr] = np.mean(shell_focus) if np.size(shell_focus) else np.nan
        tot_mean_shell_focus[attr].append(mean_shell_focus[attr])

res = {
    "mean": list(),
    "std": list(),
}
# Row labels for the printed table.
mapping = {
    "ozcs": "metal          ", 
    "ofcs": "first shell    ", 
    "oscs": "second shell   ", 
    "ogcs": "third & global ",
}
print("shell           mean    std. dev.")
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    shell_mean = np.nanmean(tot_mean_shell_focus[attr])
    shell_std = np.nanstd(tot_mean_shell_focus[attr])
    print(mapping[attr], round(shell_mean, 4), round(shell_std, 4))
    res["mean"].append(shell_mean)
    # res stores HALF the standard deviation, while the table prints the full
    # value. NOTE(review): presumably an error-bar half-width -- TODO confirm.
    res["std"].append(0.5*shell_std)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


shell           mean    std. dev.
metal           0.0199 0.0054
first shell     0.0102 0.0083
second shell    0.0034 0.0046
third & global  0.002 0.0053


In [17]:

# VAT shell-focus analysis of the SCAN0 model on the VSS-452 validation split:
# compute the adversarial perturbation per atom, normalise it, and average it
# over the metal / first / second / global shells of every structure.

# Structure names used to locate the xyz files; entry ii is paired with row ii
# of the batch drawn from te_l below.
scan_idx = [vss_scan_structs[i] for i in vss_scan_val_idxs]

# regVAT settings, passed positionally as (device, eps, xi, alpha).
eps = 1.
xi = 1e-3
alpha = 1.
cut = True
vat_criterion = regVAT(device, eps, xi, alpha, k=3, cut=cut)


# One batch containing the whole validation split (batch size == len(te_l)).
l_te_iter = iter(DataLoader(
    te_l, len(te_l), num_workers=0,
    ))
l_x, l_y = next(l_te_iter)

nl_x = l_x.numpy()
# Normalisation count looked up from the integer stored in the last feature
# column of each atom. Kept under its own name so it is not confused with the
# label dict `mapping` defined further down.
# NOTE(review): presumably keyed by atom-type code (0 = padding, 1 = H,
# 2-5 = C/N/O/F, 6-9 = metals) -- TODO confirm.
nf_by_atom_type = {0: 1e-6, 1: 6, 2: 25, 3: 25, 4: 25, 5: 25, 6: 58*2, 7: 58*2, 8: 58*2, 9: 58*2}
d_x = vat_criterion(best_model, l_x, return_adv=True).numpy()

# d_aggr[ii, jj]: summed absolute perturbation of atom jj in structure ii,
# divided by (3*nf + 5).
d_aggr = np.zeros(shape=(d_x.shape[0], d_x.shape[1]))
for ii, _d in enumerate(d_x):
    for jj, __d in enumerate(_d):
        nf = nf_by_atom_type[int(nl_x[ii, jj, -1])]
        d_aggr[ii, jj] =  np.sum(np.abs(__d))/(3*nf +5)

tot_mean_shell_focus = {
    "ozcs": [], "ofcs": [], "oscs": [], "ogcs": []
}
for ii in range(l_x.shape[0]):
    xyzfile = "../../vss_data/geometries/VSS-452/%s.xyz"% scan_idx[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    # The helpers attach the index lists read back via getattr below.
    get_zeroth_shell(kulik_mol=kulik_mol)
    get_first_shell(kulik_mol=kulik_mol)
    get_second_shell(kulik_mol=kulik_mol)
    get_global_shell(kulik_mol=kulik_mol)
    mean_shell_focus = {}
    for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
        inds = getattr(kulik_mol, attr)
        shell_focus = d_aggr[ii][inds]
        # A structure with no atoms in a shell gives an empty selection.
        # np.mean would return NaN there and emit a RuntimeWarning, so NaN is
        # recorded directly; nanmean/nanstd below skip it as before.
        mean_shell_focus[attr] = np.mean(shell_focus) if np.size(shell_focus) else np.nan
        tot_mean_shell_focus[attr].append(mean_shell_focus[attr])

res = {
    "mean": list(),
    "std": list(),
}
# Row labels for the printed table.
mapping = {
    "ozcs": "metal          ", 
    "ofcs": "first shell    ", 
    "oscs": "second shell   ", 
    "ogcs": "third & global ",
}
print("shell           mean    std. dev.")
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    shell_mean = np.nanmean(tot_mean_shell_focus[attr])
    shell_std = np.nanstd(tot_mean_shell_focus[attr])
    print(mapping[attr], round(shell_mean, 4), round(shell_std, 4))
    res["mean"].append(shell_mean)
    # res stores HALF the standard deviation, while the table prints the full
    # value. NOTE(review): presumably an error-bar half-width -- TODO confirm.
    res["std"].append(0.5*shell_std)

shell           mean    std. dev.
metal           0.0201 0.0057
first shell     0.0099 0.0082
second shell    0.0034 0.0039
third & global  0.001 0.0011


In [18]:

# VAT shell-focus analysis of the SCAN0 model on the CSD-76 set:
# compute the adversarial perturbation per atom, normalise it, and average it
# over the metal / first / second / global shells of every structure.

# Structure names used to locate the xyz files; entry ii is paired with row ii
# of the batch drawn from csd_l below.
scan_idx = list(csd_scan_structs)

# regVAT settings, passed positionally as (device, eps, xi, alpha).
eps = 1.
xi = 1e-3
alpha = 1.
cut = True
vat_criterion = regVAT(device, eps, xi, alpha, k=3, cut=cut)


# One batch containing the whole CSD-76 set (batch size == len(csd_l)).
l_csd_iter = iter(DataLoader(
    csd_l, len(csd_l), num_workers=0,
    ))
l_x, l_y = next(l_csd_iter)

nl_x = l_x.numpy()
# Normalisation count looked up from the integer stored in the last feature
# column of each atom. Kept under its own name so it is not confused with the
# label dict `mapping` defined further down.
# NOTE(review): presumably keyed by atom-type code (0 = padding, 1 = H,
# 2-5 = C/N/O/F, 6-9 = metals) -- TODO confirm.
nf_by_atom_type = {0: 1e-6, 1: 6, 2: 25, 3: 25, 4: 25, 5: 25, 6: 58*2, 7: 58*2, 8: 58*2, 9: 58*2}
d_x = vat_criterion(best_model, l_x, return_adv=True).numpy()

# d_aggr[ii, jj]: summed absolute perturbation of atom jj in structure ii,
# divided by (3*nf + 5).
d_aggr = np.zeros(shape=(d_x.shape[0], d_x.shape[1]))
for ii, _d in enumerate(d_x):
    for jj, __d in enumerate(_d):
        nf = nf_by_atom_type[int(nl_x[ii, jj, -1])]
        d_aggr[ii, jj] =  np.sum(np.abs(__d))/(3*nf +5)

tot_mean_shell_focus = {
    "ozcs": [], "ofcs": [], "oscs": [], "ogcs": []
}
for ii in range(l_x.shape[0]):
    xyzfile = "../../vss_data/geometries/CSD-76/%s.xyz"% scan_idx[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    # The helpers attach the index lists read back via getattr below.
    get_zeroth_shell(kulik_mol=kulik_mol)
    get_first_shell(kulik_mol=kulik_mol)
    get_second_shell(kulik_mol=kulik_mol)
    get_global_shell(kulik_mol=kulik_mol)
    mean_shell_focus = {}
    for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
        inds = getattr(kulik_mol, attr)
        shell_focus = d_aggr[ii][inds]
        # A structure with no atoms in a shell gives an empty selection.
        # np.mean would return NaN there and emit a RuntimeWarning, so NaN is
        # recorded directly; nanmean/nanstd below skip it as before.
        mean_shell_focus[attr] = np.mean(shell_focus) if np.size(shell_focus) else np.nan
        tot_mean_shell_focus[attr].append(mean_shell_focus[attr])

res = {
    "mean": list(),
    "std": list(),
}
# Row labels for the printed table.
mapping = {
    "ozcs": "metal          ", 
    "ofcs": "first shell    ", 
    "oscs": "second shell   ", 
    "ogcs": "third & global ",
}
print("shell           mean    std. dev.")
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    shell_mean = np.nanmean(tot_mean_shell_focus[attr])
    shell_std = np.nanstd(tot_mean_shell_focus[attr])
    print(mapping[attr], round(shell_mean, 4), round(shell_std, 4))
    res["mean"].append(shell_mean)
    # res stores HALF the standard deviation, while the table prints the full
    # value. NOTE(review): presumably an error-bar half-width -- TODO confirm.
    res["std"].append(0.5*shell_std)

shell           mean    std. dev.
metal           0.0183 0.006
first shell     0.0146 0.0093
second shell    0.0038 0.0068
third & global  0.0013 0.0025
