In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import sklearn.preprocessing

import torch

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr

##### load for plotting (with plotly)

In [2]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
import plotly
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import os

# Location of the orca binary used by plotly for static image export.
# The original notebook hard-coded one user's local path; it is kept as the
# fallback so existing behaviour is unchanged, but it can now be overridden
# without editing the notebook by setting PLOTLY_ORCA_EXECUTABLE.
_DEFAULT_ORCA_EXECUTABLE = '/Users/chenruduan/opt/anaconda3/envs/mols_newplotly/bin/orca'
plotly.io.orca.config.executable = os.environ.get(
    "PLOTLY_ORCA_EXECUTABLE", _DEFAULT_ORCA_EXECUTABLE
)
init_notebook_mode(connected=True)

# Both axes share exactly the same styling, so it is defined once and copied.
_axis_style = dict(
    showgrid=False, zeroline=False, ticks="inside", showline=True,
    tickwidth=1.5, linewidth=1.5, ticklen=10, linecolor='black',
    mirror="allticks", color="black",
)

# Layout template that later figures merge in via `layout.update(glob_layout)`.
glob_layout = go.Layout(
    font=dict(family='Helvetica', size=24, color='black'),
    margin=dict(l=100, r=10, t=10, b=100),
    xaxis=dict(_axis_style),
    yaxis=dict(_axis_style),
    legend_orientation="v",
    # NOTE(review): the alpha component is written as 100 rather than a value
    # in [0, 1]; presumably it is treated as fully opaque -- confirm.
    paper_bgcolor='rgba(255,255,255,100)',
    plot_bgcolor='white',
)

# Named colours for traces.
blue = "rgba(0, 0, 255, 1)"
red = "rgba(255, 0, 0, 1)"
green = "rgba(0, 196, 64, 1)"
gray = "rgba(140, 140, 140, 1)"

##### load for ML parts 

In [3]:
from dfa_recommender.net import GatedNetwork, MySoftplus, TiledMultiLayerNN, MLP, finalMLP, ElementalGate
from torch.utils.data import DataLoader
from dfa_recommender.dataset import SubsetDataset
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ExponentialLR
from dfa_recommender.sampler import InfiniteSampler
from dfa_recommender.vat import regVAT
from dfa_recommender.ml_utils import numpy_to_dataset
import copy

### Load models and data files

In [4]:
# Reproducibility: seed both random number generators used in this notebook.
np.random.seed(0)
torch.manual_seed(0)

# Execution settings: limit torch to 4 threads, run on CPU, and load data in
# the main process (num_workers = 0 is passed to every DataLoader below).
torch.set_num_threads(4)
device = torch.device('cpu')
num_workers = 0

In [5]:
from pkg_resources import resource_filename, Requirement
# Resolve the data directory shipped inside the installed dfa_recommender
# distribution; every file path below is built by appending to `basepath`.
_dfa_requirement = Requirement.parse("dfa_recommender")
basepath = resource_filename(_dfa_requirement, "/dfa_recommender/data/")

In [6]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    NOTE: pickle can execute arbitrary code on load; this is only used on the
    data files found under `basepath`.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


X_org = _load_pickle(basepath + "/X.pickle")  ## features
df_org = pd.read_csv(basepath + "/labeled_res.csv")  ## csv file that stores the computed vert SSE values at different methods
y_scalers = _load_pickle(basepath + "/abs-reg-y_scalers.pkl")  ## sklearn.preprocessing.StandardScaler object created on the stats of training data
inds = _load_pickle(basepath + "/final_inds.pkl")  ## random train/test partition that the models have been trained on
super_tr_inds, super_te_inds = inds["train"], inds["test"]
# Last expression of the cell: display the sizes of the two index arrays.
super_tr_inds.shape, super_te_inds.shape


Trying to unpickle estimator StandardScaler from version 0.24.2 when using version 0.23.2. This might lead to breaking code or invalid results. Use at your own risk.



((300,), (152,))

In [7]:
# ---- configuration -------------------------------------------------------
bz = 16                 # mini-batch size for the infinite iterators below
f = "blyp_hfx_50"       # key selecting the label column, the scaler and the model file

# ---- train / test split ----------------------------------------------------
tr_inds, te_inds = super_tr_inds, super_te_inds
name_tr, name_te = df_org['name'].values[tr_inds],  df_org['name'].values[te_inds]
X_tr, X_te = X_org[tr_inds], X_org[te_inds]

# Labels: absolute value of the selected column, standardised with the
# scaler stored for this key.
y_scaler = y_scalers[f]
y_t = np.abs(df_org["delta.%s.vertsse"%f].values)
y_t = y_scaler.transform(y_t.reshape(-1, 1)).reshape(-1, )
y_tr, y_te = y_t[tr_inds], y_t[te_inds]

# ---- datasets and loaders --------------------------------------------------
data_tr, data_te = numpy_to_dataset(X_tr, y_tr, regression=True), numpy_to_dataset(X_te, y_te, regression=True)
tr_l = SubsetDataset(data_tr, list(range(len(data_tr))))
te_l = SubsetDataset(data_te, list(range(len(data_te))))
# print("sub labeled dataset length: ", len(tr_l), len(te_l))

l_tr_iter = iter(DataLoader(tr_l, bz, num_workers=num_workers,
                            sampler=InfiniteSampler(len(tr_l))))
l_te_iter = iter(DataLoader(te_l, bz, num_workers=num_workers,
                            sampler=InfiniteSampler(len(te_l))))
# Loaders whose batch size equals the dataset length (one batch per pass).
te_loader = DataLoader(te_l, len(te_l), num_workers=num_workers)
tr_l_loader = DataLoader(tr_l, len(tr_l), num_workers=num_workers)

# ---- load the trained model ------------------------------------------------
# NOTE: pickle can execute arbitrary code on load; only use on trusted files.
# The context manager closes the file handle (the original left it open).
with open(basepath + "/models-trends/mergedG10-abs-reg-%s.pkl"%f, "rb") as model_file:
    best_model = pickle.load(model_file)
best_model.eval()

# ---- predict on the test set -----------------------------------------------
preds = []
labels = []
with torch.no_grad():
    for x, y in te_loader:
        _pred = best_model(x.to(device))
        preds.append(_pred.cpu().numpy())
        labels.append(y.cpu().numpy())

# Concatenate every collected batch instead of reading only element 0, so
# the metrics stay correct if the loader ever yields more than one batch.
# Predictions and labels are mapped back to the unscaled label space.
y_t = y_scaler.inverse_transform(np.concatenate(labels, axis=0).reshape(-1, 1)).reshape(-1, )
y_hat = y_scaler.inverse_transform(np.concatenate(preds, axis=0).reshape(-1, 1)).reshape(-1, )

# ---- metrics (entries with NaN labels are excluded) ------------------------
non_nan_inds = np.where(~np.isnan(y_t))[0]
y_t_super = np.copy(y_t)
_y_t = y_t[non_nan_inds]
_y_hat = y_hat[non_nan_inds]
# Arguments are (y_true, y_pred), matching the r2_score call below; the MAE
# value itself is symmetric in its arguments.
mae = mean_absolute_error(_y_t, _y_hat)
scaled_mae = mae/(np.max(_y_t) - np.min(_y_t))  # MAE relative to the label range
R2 = r2_score(_y_t, _y_hat)
rval = pearsonr(_y_t, _y_hat)[0]
print(f, "mae: ", round(mae, 5), "scaled mae: ", round(scaled_mae, 5), "R2: ", round(R2, 4), "r val: ", round(rval, 4))

blyp_hfx_50 mae:  2.25792 scaled mae:  0.06365 R2:  0.7502 r val:  0.8681


### Adversarial attack map

In [8]:
import nglview
from dfa_recommender.df_class import get_molecule
from molSimplify.Classes.mol3D import mol3D

def find_list_different(A, B):
    """Return the distinct elements of `A` that do not occur in `B`, as a list.

    Both inputs are converted to sets, so duplicates are dropped and the
    order of the returned list is not tied to the order of `A`.
    """
    kept = set(A) - set(B)
    return list(kept)

def get_zeroth_shell(kulik_mol):
    """Store the result of `kulik_mol.findMetal()` on the molecule as `ozcs`.

    The molecule is modified in place; nothing is returned.
    """
    metal_indices = kulik_mol.findMetal()
    kulik_mol.ozcs = metal_indices

def get_first_shell(kulik_mol):
    """Set `kulik_mol.ofcs` to the entries of `get_fcs()` that are not in `ozcs`.

    Requires `kulik_mol.ozcs` to be set already (see `get_zeroth_shell`).
    The molecule is modified in place; nothing is returned.
    """
    coordination_indices = kulik_mol.get_fcs()
    metal_indices = kulik_mol.ozcs
    kulik_mol.ofcs = find_list_different(coordination_indices, metal_indices)

def get_second_shell(kulik_mol):
    """Set `kulik_mol.oscs` to the atoms bonded to `ofcs` atoms, minus `ozcs` and `ofcs`.

    Requires `kulik_mol.ozcs` and `kulik_mol.ofcs` to be set already.
    The molecule is modified in place; nothing is returned.
    """
    # Gather everything bonded to any first-shell atom (duplicates allowed;
    # they disappear in the set differences below).
    neighbours = [
        bonded
        for first_shell_atom in kulik_mol.ofcs
        for bonded in kulik_mol.getBondedAtoms(first_shell_atom)
    ]
    not_metal = find_list_different(neighbours, kulik_mol.ozcs)
    kulik_mol.oscs = find_list_different(not_metal, kulik_mol.ofcs)

def get_global_shell(kulik_mol):
    """Set `kulik_mol.ogcs` to every atom index not in `ozcs`, `ofcs` or `oscs`.

    Starts from `range(kulik_mol.natoms)` and removes the three inner shells
    in turn. Requires those three attributes to be set already.
    The molecule is modified in place; nothing is returned.
    """
    remaining = list(range(kulik_mol.natoms))
    for inner_shell in (kulik_mol.ozcs, kulik_mol.ofcs):
        remaining = find_list_different(remaining, inner_shell)
    kulik_mol.ogcs = find_list_different(remaining, kulik_mol.oscs)



##### attention with blyp/50%

In [9]:
# Settings handed to regVAT. Their exact meaning is defined in
# dfa_recommender.vat -- presumably the usual VAT perturbation size (eps),
# finite-difference step (xi) and loss weight (alpha); TODO confirm.
eps, xi, alpha = 1., 1e-3, 1.
cut = True

# Criterion object used below to obtain the adversarial output for a batch.
vat_criterion = regVAT(device, eps, xi, alpha, k=3, cut=cut)

In [10]:
# Pull the whole training set as a single batch: the batch size equals the
# dataset length, so the first item from the iterator holds every sample.
_full_train_loader = DataLoader(tr_l, len(tr_l), num_workers=0)
l_tr_iter = iter(_full_train_loader)
l_x, l_y = next(l_tr_iter)
# Last expression of the cell: display the shape of the feature tensor.
l_x.shape

torch.Size([300, 65, 182])

In [11]:
nl_x = l_x.numpy()

# Lookup from the integer stored in the last feature column of each row to
# the value `nf` used in the normalisation below.
# NOTE(review): the meaning of the keys and of the 6 / 25 / 58*2 values is
# not visible here -- presumably per-group feature counts; confirm.
mapping = {0: 1e-6, 1: 6, 2: 25, 3: 25, 4: 25, 5: 25, 6: 58*2, 7: 58*2, 8: 58*2, 9: 58*2}

# Output of the VAT criterion for the full training batch, as a numpy array.
d_x = vat_criterion(best_model, l_x, return_adv=True).numpy()

# Reduce the last axis to one scalar per (sample, row): the sum of absolute
# values divided by (3*nf + 5). A plain mean over the last axis
# (np.mean(np.abs(d_x), axis=-1)) was the un-normalised alternative.
n_samples, n_rows = d_x.shape[0], d_x.shape[1]
d_aggr = np.zeros(shape=(n_samples, n_rows))
for ii in range(n_samples):
    for jj in range(n_rows):
        nf = mapping[int(nl_x[ii, jj, -1])]
        abs_total = np.sum(np.abs(d_x[ii, jj]))
        d_aggr[ii, jj] = abs_total/(3*nf +5)
    

In [12]:
# For every training sample, average d_aggr over the atom indices of each
# shell and collect the per-sample averages by shell.
tot_mean_shell_focus = {
    "ozcs": [], "ofcs": [], "oscs": [], "ogcs": []
}
for ii in range(l_x.shape[0]):
    xyzfile = basepath + "/optgeos/optgeo_%s.xyz"%name_tr[ii]
    kulik_mol = mol3D()
    kulik_mol.readfromxyz(xyzfile)
    # The shell helpers must run in this order: each one reads the
    # attributes written by the previous ones.
    get_zeroth_shell(kulik_mol=kulik_mol)
    get_first_shell(kulik_mol=kulik_mol)
    get_second_shell(kulik_mol=kulik_mol)
    get_global_shell(kulik_mol=kulik_mol)
    mean_shell_focus = {}
    for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
        inds = getattr(kulik_mol, attr)
        if len(inds) > 0:
            mean_shell_focus[attr] = np.mean(d_aggr[ii][inds])
        else:
            # A shell can be empty for a given complex. np.mean of an empty
            # selection also yields NaN but emits "Mean of empty slice" and
            # "invalid value" warnings; record NaN explicitly instead. The
            # NaN entries are skipped later by np.nanmean / np.nanstd.
            mean_shell_focus[attr] = np.nan
        tot_mean_shell_focus[attr].append(mean_shell_focus[attr])


Mean of empty slice.


invalid value encountered in double_scalars



In [13]:
# Per-shell summary statistics, in the order ozcs, ofcs, oscs, ogcs.
res = {"mean": [], "std": []}

# Fixed-width row labels for the printed table.
mapping = {
    "ozcs": "metal          ", 
    "ofcs": "first shell    ", 
    "oscs": "second shell   ", 
    "ogcs": "third & global ",
}

print("shell           mean    std. dev.")
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    # NaN-aware statistics: NaN entries in the per-sample lists are ignored.
    shell_mean = np.nanmean(tot_mean_shell_focus[attr])
    shell_std = np.nanstd(tot_mean_shell_focus[attr])
    print(mapping[attr], round(shell_mean, 4), round(shell_std, 4))
    res["mean"].append(shell_mean)
    # NOTE(review): the stored value is half of the printed standard
    # deviation -- presumably intended as a half-width for error bars; confirm.
    res["std"].append(0.5*shell_std)

shell           mean    std. dev.
metal           0.0239 0.0058
first shell     0.0126 0.0085
second shell    0.0076 0.0071
third & global  0.0031 0.0022


##### model focus in blyp family

In [14]:
# Load the stored per-key focus statistics; the context manager closes the
# file handle (the original left it open).
with open(basepath + "/focus_blyp_family.json") as focus_file:
    res_all = json.load(focus_file)

keys = ["metal", "1st sphere", "2nd sphere", "global"]

# One bar trace per top-level key in the JSON file, with the stored "std"
# values as error bars. A comprehension is used so the iteration variable
# does not overwrite the notebook-level `f` set in the evaluation cell.
data = [
    go.Bar(
        name=functional,
        x=keys,
        y=res_all[functional]["mean"],
        opacity=1,
        error_y=dict(type='data', array=res_all[functional]["std"])
    )
    for functional in res_all
]

layout = go.Layout()
layout.update(glob_layout)
layout["yaxis"].update({'title': "model focus", })
# `barmode` is the layout option that controls how bar traces sharing an x
# position are arranged; the original set `boxmode`, which only applies to
# box traces.
layout.update(width=900, height=500, barmode='group')
fig = dict(data=data, layout=layout)
iplot(fig)

##### example complexes

In [15]:
# Index (into the training set) of the complex inspected in this cell.
ii = 113
xyzfile = basepath + "/optgeos/optgeo_%s.xyz"%name_tr[ii]

# Build the molecule with spin=4 and retry with spin=5 if that fails.
# `except Exception` replaces the original bare `except:` so that
# KeyboardInterrupt / SystemExit are no longer swallowed by the retry.
try:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=4
        )
except Exception:
    mol, _ = get_molecule(
        xyzfile=xyzfile, 
        charge=2, spin=5
        )

kulik_mol = mol3D()
kulik_mol.readfromxyz(xyzfile)

# Indices of this complex sorted by decreasing d_aggr; computed once and
# reused for both the top-7 list and the full printout below.
focus_ranking = list(reversed((np.argsort(d_aggr[ii]))))
rev_max = focus_ranking[:7]
fcs = kulik_mol.get_fcs()

# Cube root of d_aggr per atom, keyed by "<index>-<element symbol>".
fdict = {}
for _ii in range(kulik_mol.natoms):
    fdict["%d-%s"%(_ii, kulik_mol.getAtom(_ii).symbol())] = np.power(d_aggr[ii][_ii], 1./3)

# The shell helpers must run in this order: each one reads the attributes
# written by the previous ones.
get_zeroth_shell(kulik_mol=kulik_mol)
get_first_shell(kulik_mol=kulik_mol)
get_second_shell(kulik_mol=kulik_mol)
get_global_shell(kulik_mol=kulik_mol)

# Cube root of the mean d_aggr over each shell.
mean_shell_focus = {}
for attr in ["ozcs", "ofcs", "oscs", "ogcs"]:
    inds = getattr(kulik_mol, attr)
    mean_shell_focus[attr] = np.power(np.mean(d_aggr[ii][inds]), 1./3)

print("xyzfile: ", xyzfile)
print("catoms: ", [kulik_mol.getAtom(_ii).symbol() for _ii in fcs], fcs)
print("actual max: ", [kulik_mol.getAtom(_ii).symbol() for _ii in rev_max], rev_max)
print("diff compared to fcs: ", set(fcs).difference(set(rev_max)))
print(fcs, focus_ranking)
print("fdict: ", fdict)
print("mean_shell_focus: ", mean_shell_focus)



c: [2.0]
fc: [2.0]
m: [4]
fm: [4]
xyzfile:  /Users/duanchenru/Packages/dfa_recommender/dfa_recommender/data//optgeos/optgeo_cr_3_formaldehyde-O-1-0_formaldehyde-O-1-0_formaldehyde-O-1-0_carbonyl-C-0-0_formaldehyde-O-1-0_carbonyl-C-0-0_4.xyz
catoms:  ['Cr', 'O', 'O', 'O', 'C', 'O', 'C'] [0, 2, 6, 10, 13, 16, 19]
actual max:  ['O', 'O', 'O', 'O', 'Cr', 'O', 'O'] [14, 6, 20, 16, 0, 2, 10]
diff compared to fcs:  {19, 13}
[0, 2, 6, 10, 13, 16, 19] [14, 6, 20, 16, 0, 2, 10, 13, 19, 1, 5, 9, 15, 11, 8, 12, 3, 4, 7, 17, 18, 25, 29, 28, 27, 26, 21, 24, 23, 22, 31, 30, 64, 63, 33, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 32]
fdict:  {'0-Cr': 0.2869639828658928, '1-C': 0.1946729340190818, '2-O': 0.2844666467746047, '3-H': 0.11708791137809774, '4-H': 0.11681454831124312, '5-C': 0.19312697662108494, '6-O': 0.29103725342420533, '7-H': 0.11618757687473329, '8-H': 0.11726944013213049, '9-C': 0.1917684838459078, '10-O': 0.2675

In [16]:
# Build an nglview widget for the molecule created in the previous cell.
view = nglview.show_psi4(mol)

# Representation specs, applied in this order.
_cartoon_rep = {
    "type": "cartoon",
    "params": {"sele": "protein", "color": "residueindex"},
}
_ball_stick_rep = {
    "type": "ball+stick",
    "params": {"sele": "hetero"},
}
_label_rep = {
    "type": "labels",
    "params": {"labelType": "number"},
}
view.representations = [_cartoon_rep, _ball_stick_rep, _label_rep]

# Last expression of the cell: render the widget.
view

NGLWidget()

In [None]:
# Show the Python type of the `view` object (this cell has no execution count).
type(view)