In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import sklearn.preprocessing

import torch

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr

##### load for plotting (with plotly)

In [2]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
import plotly
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import os

# Path of the orca binary configured for plotly's orca-based static export.
# The default is the original author's conda environment; set the
# ORCA_EXECUTABLE environment variable to use a local installation instead of
# editing this cell.
plotly.io.orca.config.executable = os.environ.get(
    "ORCA_EXECUTABLE",
    '/Users/chenruduan/opt/anaconda3/envs/mols_newplotly/bin/orca',
)
init_notebook_mode(connected=True)

# Shared figure styling: every figure below starts from this layout
# (black Helvetica text, boxed axes with inward ticks, no grid, white background).
glob_layout = go.Layout(
    font=dict(family='Helvetica', size=24, color='black'),
    margin=dict(l=100, r=10, t=10, b=100),
    xaxis=dict(showgrid=False,  zeroline=False, ticks="inside", showline=True,
               tickwidth=1.5, linewidth=1.5, ticklen=10, linecolor='black',
               mirror="allticks", color="black"),
    yaxis=dict(showgrid=False,  zeroline=False, ticks="inside", showline=True,
               tickwidth=1.5, linewidth=1.5, ticklen=10, linecolor='black',
               mirror="allticks", color="black"),
    legend_orientation="v",
    # alpha must lie in [0, 1]; the previous value of 100 was out of range
    paper_bgcolor='rgba(255,255,255,1)',
    plot_bgcolor='white',
)

# Named opaque colours available to the plotting cells.
blue = "rgba(0, 0, 255, 1)"
red = "rgba(255, 0, 0, 1)"
green = "rgba(0, 196, 64, 1)"
gray = "rgba(140, 140, 140, 1)"

##### load for ML parts 

In [3]:
from dfa_recommender.net import GatedNetwork, MySoftplus, TiledMultiLayerNN, MLP, finalMLP, ElementalGate
from torch.utils.data import DataLoader
from dfa_recommender.dataset import SubsetDataset
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ExponentialLR
from dfa_recommender.sampler import InfiniteSampler
from dfa_recommender.vat import regVAT
from dfa_recommender.ml_utils import numpy_to_dataset
import copy

### Load models and data files

In [4]:
# Execution and reproducibility settings shared by the cells below.
SEED = 0

torch.set_num_threads(4)   # number of CPU threads torch may use
torch.manual_seed(SEED)    # seed torch's random number generator
np.random.seed(SEED)       # seed numpy's legacy global random number generator

num_workers = 0            # passed as num_workers to every DataLoader below
device = torch.device('cpu')

In [5]:
from pkg_resources import resource_filename, Requirement
# Resolve the data directory of the installed dfa_recommender distribution.
# Every file loaded in this notebook is addressed as `basepath + "/<relative path>"`;
# because this string already ends with "/", the joined paths contain a
# double slash (visible in the printed xyz path further down).
basepath = resource_filename(Requirement.parse("dfa_recommender"), "/dfa_recommender/data/")

In [6]:
# NOTE: pickle.load can execute arbitrary code; only load the files shipped with the package.
# The files are opened in `with` blocks so the handles are closed once they have been read.
with open(basepath +  "/X_vss452.pickle", "rb") as fh:
    X_org = pickle.load(fh) ## features
df_org = pd.read_csv(basepath + "/VSS-452.csv") ## csv file that stores the computed vert SSE values at different methods
with open(basepath +  "/abs-reg-y_scalers.pkl", "rb") as fh:
    # indexed by functional name (y_scalers[f]); per the original note the entries are
    # sklearn.preprocessing.StandardScaler objects created on the stats of training data
    y_scalers = pickle.load(fh)

# Row indices of the train/test split: "train" == 1 marks training rows and
# "train" == 0 marks test rows; any other value is left out of both lists.
train_flags = df_org["train"].values
tr_inds = [ii for ii, val in enumerate(train_flags) if val == 1]
te_inds = [ii for ii, val in enumerate(train_flags) if val == 0]
len(tr_inds), len(te_inds)


Trying to unpickle estimator StandardScaler from version 0.24.2 when using version 0.22.1. This might lead to breaking code or invalid results. Use at your own risk.



(300, 152)

In [7]:
bz = 16              # mini-batch size of the sampler-driven loaders
f = "blyp_hfx_50"    # functional whose pre-trained model is analysed

# Complex names and features for the two splits.
name_tr = df_org['name'].values[tr_inds]
name_te = df_org['name'].values[te_inds]
X_tr = X_org[tr_inds]
X_te = X_org[te_inds]

# Target: absolute value of the "delta.<f>.vertsse" column, transformed with
# the stored scaler for this functional.
y_scaler = y_scalers[f]
y_t = np.abs(df_org["delta.%s.vertsse"%f].values)
y_t = y_scaler.transform(y_t.reshape(-1, 1)).reshape(-1, )
y_tr = y_t[tr_inds]
y_te = y_t[te_inds]

# Wrap the numpy arrays as regression datasets and expose each one in full
# through a SubsetDataset.
data_tr = numpy_to_dataset(X_tr, y_tr, regression=True)
data_te = numpy_to_dataset(X_te, y_te, regression=True)
tr_l = SubsetDataset(data_tr, list(range(len(data_tr))))
te_l = SubsetDataset(data_te, list(range(len(data_te))))

# Mini-batch iterators of size `bz` driven by InfiniteSampler
# (presumably yields indices without end -- TODO confirm in dfa_recommender.sampler).
l_tr_iter = iter(DataLoader(
    tr_l, bz, num_workers=num_workers, sampler=InfiniteSampler(len(tr_l)),
))
l_te_iter = iter(DataLoader(
    te_l, bz, num_workers=num_workers, sampler=InfiniteSampler(len(te_l)),
))

# Loaders whose batch size equals the dataset size: one batch holds the whole split.
te_loader = DataLoader(te_l, len(te_l), num_workers=num_workers)
tr_l_loader = DataLoader(tr_l, len(tr_l), num_workers=num_workers)

# Load the pre-trained model for functional `f`.
# NOTE: pickle.load can execute arbitrary code; only load the files shipped with the package.
with open(basepath + "/models-trends/mergedG10-abs-reg-%s.pkl"%f, "rb") as fh:
    best_model = pickle.load(fh)
best_model.eval()

# Collect predictions and labels for the test set without tracking gradients.
preds = []
labels = []
with torch.no_grad():
    for x, y in te_loader:
        _pred = best_model(x.to(device))
        preds.append(_pred.cpu().numpy())
        labels.append(y.cpu().numpy())

# Undo the target scaling. All batches are concatenated, so the result stays
# complete even if te_loader is changed to yield more than one batch.
y_t = y_scaler.inverse_transform(np.concatenate(labels).reshape(-1, 1)).reshape(-1, )
y_hat = y_scaler.inverse_transform(np.concatenate(preds).reshape(-1, 1)).reshape(-1, )

# Score only the entries whose reference value is not NaN.
non_nan_inds = np.where(~np.isnan(y_t))[0]
y_t_super = np.copy(y_t)
_y_t = y_t[non_nan_inds]
_y_hat = y_hat[non_nan_inds]
mae = mean_absolute_error(_y_t, _y_hat)  # (y_true, y_pred) order; the value is symmetric
scaled_mae = mae/(np.max(_y_t) - np.min(_y_t))  # MAE relative to the range of the reference values
R2 = r2_score(_y_t, _y_hat)
rval = pearsonr(_y_t, _y_hat)[0]
print(f, "mae: ", round(mae, 5), "scaled mae: ", round(scaled_mae, 5), "R2: ", round(R2, 4), "r val: ", round(rval, 4))

blyp_hfx_50 mae:  2.25792 scaled mae:  0.06365 R2:  0.7502 r val:  0.8681


### Adversarial attack map

In [8]:
import nglview
from dfa_recommender.df_class import get_molecule
from molSimplify.Classes.mol3D import mol3D

def find_list_different(A, B):
    """Return the distinct items of ``A`` that do not occur in ``B``.

    Both arguments are converted to sets, so duplicates are dropped and the
    order of the returned list is not guaranteed.
    """
    only_in_a = set(A) - set(B)
    return list(only_in_a)

def get_zeroth_shell(kulik_mol):
    """Store the result of ``kulik_mol.findMetal()`` on the molecule as ``ozcs``.

    The molecule is modified in place; nothing is returned.
    """
    metal_inds = kulik_mol.findMetal()
    kulik_mol.ozcs = metal_inds

def get_first_shell(kulik_mol):
    """Store ``get_fcs()`` minus the ``ozcs`` indices on the molecule as ``ofcs``.

    Requires ``kulik_mol.ozcs`` to be set already (see ``get_zeroth_shell``).
    The molecule is modified in place; nothing is returned.
    """
    coordination_inds = kulik_mol.get_fcs()
    kulik_mol.ofcs = find_list_different(coordination_inds, kulik_mol.ozcs)

def get_second_shell(kulik_mol):
    """Store the atoms bonded to the ``ofcs`` atoms, minus ``ozcs`` and ``ofcs``, as ``oscs``.

    Requires ``kulik_mol.ozcs`` and ``kulik_mol.ofcs`` to be set already.
    The molecule is modified in place; nothing is returned.
    """
    neighbours = []
    for first_idx in kulik_mol.ofcs:
        neighbours.extend(kulik_mol.getBondedAtoms(first_idx))
    beyond_metal = find_list_different(neighbours, kulik_mol.ozcs)
    kulik_mol.oscs = find_list_different(beyond_metal, kulik_mol.ofcs)

def get_global_shell(kulik_mol):
    """Store every atom index not in ``ozcs``, ``ofcs`` or ``oscs`` as ``ogcs``.

    Requires the three inner shells to be set on ``kulik_mol`` already.
    The molecule is modified in place; nothing is returned.
    """
    remaining = list(range(kulik_mol.natoms))
    for inner_shell in (kulik_mol.ozcs, kulik_mol.ofcs, kulik_mol.oscs):
        remaining = find_list_different(remaining, inner_shell)
    kulik_mol.ogcs = remaining



##### attention with blyp/50%

In [88]:
# Settings handed to regVAT. Their exact meaning is defined in
# dfa_recommender.vat (presumably the usual VAT perturbation size, probe scale
# and loss weight -- TODO confirm).
eps, xi, alpha = 1., 1e-2, 1.
cut = True
vat_criterion = regVAT(device, eps, xi, alpha, k=3, cut=cut)

In [89]:
# Pull the complete training set as one batch. No sampler or shuffle is given,
# so rows follow the order of tr_l.
full_tr_loader = DataLoader(tr_l, len(tr_l), num_workers=0)
l_tr_iter = iter(full_tr_loader)
l_x, l_y = next(l_tr_iter)
l_x.shape

torch.Size([300, 65, 182])

In [90]:
nl_x = l_x.numpy()
# Normaliser lookup, keyed by the integer stored in the last feature column of each atom row.
mapping = {0: 1e-6, 1: 6, 2: 25, 3: 25, 4: 25, 5: 25, 6: 58*2, 7: 58*2, 8: 58*2, 9: 58*2}
# Output of the VAT criterion called with return_adv=True, one value per input feature.
d_x = vat_criterion(best_model, l_x, return_adv=True).numpy()

# Collapse the feature axis: per atom, sum of absolute values divided by (3*nf + 5).
n_complexes, n_atom_rows = d_x.shape[0], d_x.shape[1]
d_aggr = np.zeros(shape=(n_complexes, n_atom_rows))
for ii in range(n_complexes):
    for jj in range(n_atom_rows):
        nf = mapping[int(nl_x[ii, jj, -1])]
        d_aggr[ii, jj] = np.abs(d_x[ii, jj]).sum()/(3*nf +5)
    

In [91]:
# For every training complex, average d_aggr over the atoms of each shell
# and collect one value per complex and shell.
shell_attrs = ["ozcs", "ofcs", "oscs", "ogcs"]
tot_mean_shell_focus = {attr: [] for attr in shell_attrs}
for ii in range(l_x.shape[0]):
    xyzfile = basepath + "/optgeos/optgeo_%s.xyz"%name_tr[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    # Order matters: each assignment relies on the shells set before it.
    for assign_shell in (get_zeroth_shell, get_first_shell, get_second_shell, get_global_shell):
        assign_shell(kulik_mol=kulik_mol)
    mean_shell_focus = {}
    for attr in shell_attrs:
        inds = getattr(kulik_mol, attr)
        # An empty shell gives NaN here; the summary cell uses nan-aware statistics.
        mean_shell_focus[attr] = np.mean(d_aggr[ii][inds])
        tot_mean_shell_focus[attr].append(mean_shell_focus[attr])

In [92]:
# Summarise the per-complex shell averages: print a table and keep the numbers in `res`.
res = {"mean": list(), "std": list()}
# Row labels of the printed table (padded to a fixed width).
# NOTE(review): this rebinds `mapping`, which held the normaliser lookup earlier.
mapping = {
    "ozcs": "metal          ", 
    "ofcs": "first shell    ", 
    "oscs": "second shell   ", 
    "ogcs": "third & global ",
}
print("shell           mean    std. dev.")
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    shell_mean = np.nanmean(tot_mean_shell_focus[attr])
    shell_std = np.nanstd(tot_mean_shell_focus[attr])
    print(mapping[attr], round(shell_mean, 4), round(shell_std, 4))
    res["mean"].append(shell_mean)
    # NOTE(review): half of the printed standard deviation is stored -- confirm this is intended.
    res["std"].append(0.5*shell_std)

shell           mean    std. dev.
metal           0.0244 0.0047
first shell     0.0129 0.0084
second shell    0.0077 0.0071
third & global  0.0032 0.0023


##### model focus in blyp family

In [109]:
# Grouped bar chart of the stored per-shell focus ("mean" with "std" error bars)
# for every entry of focus_blyp_family.json.
with open(basepath + "/focus_blyp_family.json") as fh:  # close the handle after reading
    res_all = json.load(fh)
keys = ["metal", "1<sup>st</sup> coord. sphere", "2<sup>nd</sup> coord. sphere", "global"]
# One colour per entry of res_all; more than five entries raise IndexError below.
colors = ["rgba(0, 0, 255, 0.6)", "rgba(255, 0, 0, 0.6)",  "rgba(0, 240, 64, 0.6)", "rgba(255, 165, 0, 0.6)", "rgba(140, 140, 140, 0.6)"]
# Legend labels for the keys of res_all.
dfa_mapping = {
    "blyp": "BLYP",
    "b3lyp": "B3LYP",
    "blyp/50%": "BLYP:50%",
    "b2gpplyp": "B2GP-PLYP",
    "untrained": "untrained",
}
data = []
# NOTE(review): the loop rebinds the notebook-level names `ii` and `f`.
for ii, f in enumerate(res_all):
    trace0 = go.Bar(
        name=dfa_mapping[f],
        x=keys, 
        y=res_all[f]["mean"],
        marker=dict(color=colors[ii]),
        error_y=dict(type='data', array=res_all[f]["std"])
    )
    data += [trace0,]
layout = go.Layout()
layout.update(glob_layout)
layout["yaxis"].update({'title': "model focus", "range": [0, 0.03], "tickvals": [0, 0.01, 0.02, 0.03]})
layout.legend.update(x=.725, y=0.98, bgcolor="rgba(0,0,0,0)")
# barmode is the layout setting for bar traces; the previous boxmode only affects box plots
layout.update(width=950, height=400, barmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# NOTE(review): machine-specific output path; adjust it before running elsewhere.
fig.write_image("../../../../Dropbox (MIT)/CD-DFARecFull/Figures/F3//blyp_family.pdf")

##### example complexes

In [106]:
# Inspect one complex: rank its atom rows by aggregated perturbation and
# compare the top entries with the coordinating atoms reported by get_fcs().
ii = -1  # last entry of the training split
xyzfile = basepath + "/optgeos/optgeo_%s.xyz"%name_tr[ii]
# Build the molecule for the viewer cell, retrying with spin=5 if spin=4 fails.
# `except Exception` (not a bare except) lets KeyboardInterrupt and SystemExit propagate.
try:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=4
        )
except Exception:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=5
        )
kulik_mol = mol3D()
kulik_mol.readfromxyz(xyzfile)
# Row indices of d_aggr[ii] sorted from largest to smallest value. The ranking
# spans every row of d_aggr[ii], which can exceed natoms (see the printed list).
ranked = list(reversed((np.argsort(d_aggr[ii]))))
rev_max = ranked[:7]
print(rev_max)
fcs = kulik_mol.get_fcs()
# "<index>-<symbol>" -> cube root of the aggregated value of that atom.
fdict = {}
for _ii in range(kulik_mol.natoms):
    fdict["%d-%s"%(_ii, kulik_mol.getAtom(_ii).symbol())] = np.power(d_aggr[ii][_ii], 1./3)
get_zeroth_shell(kulik_mol=kulik_mol)
get_first_shell(kulik_mol=kulik_mol)
get_second_shell(kulik_mol=kulik_mol)
get_global_shell(kulik_mol=kulik_mol)
# Cube root of the mean aggregated value of each shell.
mean_shell_focus = {}
print("natoms: ", kulik_mol.natoms)
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    inds = getattr(kulik_mol, attr)
    mean_shell_focus[attr] = np.power(np.mean(d_aggr[ii][inds]), 1./3)
print("xyzfile: ", xyzfile)
print("catoms: ", [kulik_mol.getAtom(_ii).symbol() for _ii in fcs], fcs)
print("actual max: ", [kulik_mol.getAtom(_ii).symbol() for _ii in rev_max], rev_max)
print("diff compared to fcs: ", set(fcs).difference(set(rev_max)))
print(fcs, ranked)
print("fdict: ", fdict)
print("mean_shell_focus: ", mean_shell_focus)

[0, 16, 13, 1, 7, 10, 4]
natoms:  19
xyzfile:  /Users/chenruduan/src/dfa_recommender/dfa_recommender/data//optgeos/optgeo_mn_3_water-O-0-0_water-O-0-0_water-O-0-0_water-O-0-0_thiocyanate-S-0-d1_thiocyanate-S-0-d1_5.xyz
catoms:  ['Mn', 'O', 'O', 'O', 'O', 'S', 'S'] [0, 1, 4, 7, 10, 13, 16]
actual max:  ['Mn', 'S', 'S', 'O', 'O', 'O', 'O'] [0, 16, 13, 1, 7, 10, 4]
diff compared to fcs:  set()
[0, 1, 4, 7, 10, 13, 16] [0, 16, 13, 1, 7, 10, 4, 17, 14, 18, 15, 12, 6, 11, 5, 2, 3, 8, 9, 24, 29, 28, 27, 26, 25, 20, 23, 22, 21, 19, 31, 30, 64, 63, 33, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32]
fdict:  {'0-Mn': 0.3093280100415076, '1-O': 0.2373133939766853, '2-H': 0.08737556988059822, '3-H': 0.08733506141740821, '4-O': 0.23142964883950423, '5-H': 0.08803140598912224, '6-H': 0.08831499622270939, '7-O': 0.23682430387140058, '8-H': 0.08718530055845645, '9-H': 0.08589393490181285, '10-O': 0.23480166782463832, '11-H': 0.088

In [107]:
# 3D view of the example complex with numbered atom labels.
view = nglview.show_psi4(mol)
cartoon_rep = {"type": "cartoon", "params": {
    "sele": "protein", "color": "residueindex"
}}
ball_stick_rep = {"type": "ball+stick", "params": {
    "sele": "hetero"
}}
label_rep = {"type": "labels", "params": {
    "labelType": "number"
}}
view.representations = [cartoon_rep, ball_stick_rep, label_rep]
view

NGLWidget()

In [108]:
### r_vadv map: signed, cube-root-compressed d_x values of complex `ii`, one row per atom
N_FEAT = 58  # number of leading feature columns shown in the map
# Number of leading feature columns kept (unmasked) for each element.
mask_dict = {"H": 6, "C": 25, "N": 25, "O": 25, "F": 25, "S": 38, "P": 38, "Cl": 38, "Fe": 58, "Co": 58, "Cr": 58, "Mn": 58}
n_at = kulik_mol.natoms

# The scaling extrema do not depend on the atom or column, so compute them once.
# They are taken over all rows of d_x[ii], not only the first n_at.
d_max = np.max(d_x[ii, :, :N_FEAT])
d_min = np.min(d_x[ii, :, :N_FEAT])
d_slice = d_x[ii, :n_at, :N_FEAT]

# Positive entries map to (d/max)^(1/3) and negative entries to -(d/min)^(1/3).
# Exact zeros stay 0, which also avoids 0/0 when the minimum is 0.
dd = np.zeros(shape=(n_at, N_FEAT))
pos = d_slice > 0
neg = d_slice < 0
dd[pos] = np.power(d_slice[pos]/d_max, 1./3)
dd[neg] = -1*np.power(d_slice[neg]/d_min, 1./3)

# Keep only the leading columns listed for each atom's element.
mask = np.zeros(shape=(n_at, N_FEAT))
for ir in range(n_at):
    mask[ir, :mask_dict[kulik_mol.getAtom(ir).symbol()]] = 1
fig = px.imshow(dd*mask, 
                color_continuous_scale='RdBu_r',
                x=list(range(N_FEAT)),
                y=[kulik_mol.getAtom(x).symbol() + "%d"%x for x in range(kulik_mol.natoms)]
                )
fig.layout.update(width=900, height=350)  # boxmode dropped: it only applies to box traces
fig.show()
# fig.write_image("../../../../Dropbox (MIT)/CD-DFARecFull/Figures/F3//VAA_map_2.pdf")

In [95]:
# Element-based statistics over the data set: for every atom of every training
# complex, collect the leading mask_dict[element] columns of d_x, grouped by element.
focus_dict = {ele: list() for ele in mask_dict}
for ii in range(d_x.shape[0]):
    xyzfile = basepath + "/optgeos/optgeo_%s.xyz"%name_tr[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    for ir in range(kulik_mol.natoms):
        ele = kulik_mol.getAtom(ir).symbol()
        n_cols = mask_dict[ele]
        focus_dict[ele].append(d_x[ii, ir, :n_cols])
# Convert each element's list of rows into one array.
for ele in list(focus_dict):
    focus_dict[ele] = np.array(focus_dict[ele])

In [111]:
# Per-feature mean and standard deviation over all Co atoms in the training set.
co_focus = focus_dict["Co"]
np.mean(co_focus, axis=0), np.std(co_focus, axis=0)

(array([ 0.01239153,  0.01078557,  0.00825875,  0.0074953 ,  0.00935364,
         0.01792737,  0.00767558,  0.01036122, -0.00314943,  0.0073944 ,
        -0.00635407, -0.00395897, -0.01114259, -0.01310528, -0.01687975,
        -0.00206605, -0.00528158, -0.00858818, -0.00600836, -0.00292405,
        -0.00958739, -0.0097436 , -0.00963641, -0.00925456,  0.00116541,
        -0.00649754, -0.00255184,  0.0049761 , -0.02039176,  0.00111139,
        -0.02078774,  0.0113743 , -0.02632923, -0.00757205, -0.01140696,
        -0.03837543,  0.00678409, -0.00051557, -0.01551171, -0.00169606,
        -0.00607951, -0.0139345 , -0.005543  , -0.00732826, -0.00542829,
        -0.00653464, -0.00587791,  0.00066166, -0.01301356,  0.0015952 ,
         0.00154864,  0.00795291,  0.00439882, -0.00594464,  0.01192662,
        -0.00397677,  0.00509077, -0.01340892], dtype=float32),
 array([0.08942254, 0.07492004, 0.0542485 , 0.05869279, 0.06391342,
        0.12624356, 0.05169532, 0.07588047, 0.04422353, 0.0481091