In [1]:
import copy
import json
import os
import pickle

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import r2_score, mean_absolute_error

##### load for plotting (with plotly)

In [2]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
import plotly
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
# Location of the orca executable configured on plotly.io.orca (presumably what
# the fig.write_image calls further down rely on -- confirm for your plotly version).
# The PLOTLY_ORCA_EXECUTABLE environment variable overrides it per machine; the
# fallback is the path that used to be hard-coded here.
plotly.io.orca.config.executable = os.environ.get(
    "PLOTLY_ORCA_EXECUTABLE",
    '/Users/chenruduan/opt/anaconda3/envs/mols_newplotly/bin/orca',
)
init_notebook_mode(connected=True)

# Axis style shared by x and y: no grid or zero line, inward ticks, black frame
# mirrored on all sides. Two independent dicts are built so that a later
# in-place update of one axis cannot affect the other.
def _axis_style():
    """Return a fresh dict with the common axis settings used by glob_layout."""
    return dict(showgrid=False,  zeroline=False, ticks="inside", showline=True,
                tickwidth=1.5, linewidth=1.5, ticklen=10, linecolor='black',
                mirror="allticks", color="black")

# Base layout that every figure in this notebook merges in via layout.update(glob_layout).
glob_layout = go.Layout(
    font=dict(family='Helvetica', size=24, color='black'),
    margin=dict(l=100, r=10, t=10, b=100),
    xaxis=_axis_style(),
    yaxis=_axis_style(),
    legend_orientation="v",
    # NOTE(review): alpha is 100 here while the colours below use 1 -- confirm intended.
    paper_bgcolor='rgba(255,255,255,100)',
    plot_bgcolor='white',
)

# Named colours reused by the plotting cells.
blue = "rgba(0, 0, 255, 1)"
red = "rgba(255, 0, 0, 1)"
green = "rgba(0, 196, 64, 1)"
gray = "rgba(140, 140, 140, 1)"

##### load for ML parts 

In [3]:
from dfa_recommender.net import GatedNetwork, MySoftplus, TiledMultiLayerNN, MLP, finalMLP, ElementalGate
from dfa_recommender.dataset import SubsetDataset
from dfa_recommender.sampler import InfiniteSampler
from dfa_recommender.ml_utils import numpy_to_dataset
import torch
from torch.utils.data import DataLoader

##### DFAs that we considered

In [4]:
# Non-DFA columns that identify a complex.
base_keys = ["name", "path", "metal"]

# Base DFAs, in the order used throughout the notebook.
functionals = [
    "bp86", "blyp", "pbe",
    "tpss", "scan", "m06-l", "mn15-l",
    "b3p86", "b3pw91", "b3lyp",
    "tpssh", "scan0", "m06", "m06-2x",
    "wb97x", "LRC-wPBEh",
    "b2gpplyp", "pbe0-dh", "dsd-blyp-d3bj", "dsd-pbeb95-d3bj", "dsd-pbep86-d3bj",
]

# Variants named "<parent>_hfx_<percent>" (hfx presumably = Hartree-Fock exchange
# percentage -- confirm). "m06-l_hfx_20" is deliberately left out to reproduce the
# original list exactly; the reason for its absence is not visible in this notebook.
_hfx_skipped = {("m06-l", 20)}
functionals += [
    "%s_hfx_%d" % (parent, percent)
    for parent in ("blyp", "pbe", "scan", "m06-l", "mn15-l")
    for percent in (10, 20, 30, 40, 50)
    if (parent, percent) not in _hfx_skipped
]

# Independent copy, so later cells can rebind/filter `functionals` without losing the full list.
all_functionals = list(functionals)

### Predict the vertical spin-splitting energy on the CSD set

In [5]:
# Fixed seeds for both RNGs used in this notebook, so reruns give the same numbers.
np.random.seed(0)
torch.manual_seed(0)
# CPU-only inference with 4 intra-op threads.
torch.set_num_threads(4)
# 0 workers: the DataLoader built later loads data in the main process.
num_workers = 0
device = torch.device("cpu")

##### set path for relevant data files: the previously trained models are tested directly on the out-of-distribution CSD complexes, which have more diverse ligands and connectivities

In [6]:
from pkg_resources import resource_filename, Requirement
# Filesystem path of the "dfa_recommender/data/" resource directory of the installed
# dfa_recommender distribution. Later cells build file paths from it by plain string
# concatenation (basepath + "/<file>").
basepath = resource_filename(Requirement.parse("dfa_recommender"), "/dfa_recommender/data/")

In [7]:
# NOTE: pickle.load can execute arbitrary code from the file -- only load files you trust.
# Files are opened in `with` blocks so the handles are closed right after loading.
with open(basepath + "/X_csd.pickle", "rb") as fin:
    X_csd = pickle.load(fin)  ## features
df_csd = pd.read_csv(basepath + "/CSD-76.csv")  ## csv file that stores the computed vert SSE values at different methods (CSD data)
df_org = pd.read_csv(basepath + "/VSS-452.csv")  ## csv file that stores the computed vert SSE values at different methods (self-assembled complexes, used in training)
with open(basepath + "/abs-reg-y_scalers.pkl", "rb") as fin:
    y_scalers = pickle.load(fin)  ## sklearn.preprocessing.StandardScaler object created on the stats of training data


Trying to unpickle estimator StandardScaler from version 0.24.2 when using version 0.22.1. This might lead to breaking code or invalid results. Use at your own risk.



##### predict |DFA - DLPNO-CCSD(T)| vertical spin splitting

In [8]:
import sklearn.preprocessing

# Evaluate, for every DFA, the stored model on the CSD complexes and collect the
# true and predicted |delta.<DFA>.vertsse| values (absolute values are taken below).
#
# res_csd  : dict of columns -> later turned into a DataFrame
#            ("<DFA>.y_t", "<DFA>.y_hat", plus "name" and "dlpno.vertsse").
# mae_dict : DFA -> MAE over the complexes with a non-NaN reference value.

# Per-complex columns do not depend on the DFA, so they are set once, not per iteration.
res_csd = {
    "name": df_csd["name"].values,
    "dlpno.vertsse": df_csd["dlpno-CCSD_T.vertsse"].values,
}
mae_dict = {}
for f in all_functionals:
    y_t = np.abs(df_csd["delta.%s.vertsse" % f].values)
    _y = np.abs(df_org["delta.%s.vertsse" % f].values).reshape(-1, 1)

    # The target scaler is refit on the |delta| values of df_org. The previous code
    # also looked up y_scalers[f] but overwrote it immediately, so that lookup was
    # dead and has been dropped; the scaler actually used is unchanged.
    # NOTE(review): confirm the refit scaler (not the pickled one) is the intended one.
    y_scaler = sklearn.preprocessing.StandardScaler()
    y_scaler.fit(_y)
    y_t = y_scaler.transform(y_t.reshape(-1, 1)).reshape(-1, )

    data_te = numpy_to_dataset(X_csd, y_t, regression=True)
    te_l = SubsetDataset(data_te, list(range(len(data_te))))
    # Batch size == dataset size: the whole test set is served as one batch.
    te_loader = DataLoader(te_l, len(te_l), num_workers=num_workers)

    # NOTE: pickle.load can execute arbitrary code -- only load trusted model files.
    with open(basepath + "/models-trends/mergedG10-abs-reg-%s.pkl" % f, "rb") as fin:
        best_model = pickle.load(fin)
    best_model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for x, y in te_loader:
            _pred = best_model(x.to(device))
            preds.append(_pred.cpu().numpy())
            labels.append(y.cpu().numpy())

    # Back to the original units; concatenate is a no-op for the single batch above
    # but stays correct if the batch size is ever reduced.
    y_t = y_scaler.inverse_transform(np.concatenate(labels).reshape(-1, 1)).reshape(-1, )
    y_hat = y_scaler.inverse_transform(np.concatenate(preds).reshape(-1, 1)).reshape(-1, )

    # Metrics only over complexes that have a reference value for this DFA.
    non_nan_inds = np.where(~np.isnan(y_t))[0]
    _y_t = y_t[non_nan_inds]
    _y_hat = y_hat[non_nan_inds]
    mae = mean_absolute_error(_y_t, _y_hat)
    scaled_mae = mae/(np.max(_y_t) - np.min(_y_t))
    R2 = r2_score(_y_t, _y_hat)
    rval = pearsonr(_y_t, _y_hat)[0]
    print(f, "mae: ", round(mae, 5), "scaled mae: ", round(scaled_mae, 5), "R2: ", round(R2, 4), "r val: ", round(rval, 4))

    mae_dict[f] = float(mae)
    res_csd[f + ".y_t"] = np.abs(y_t)
    res_csd[f + ".y_hat"] = np.abs(y_hat)

bp86 mae:  5.10618 scaled mae:  0.10821 R2:  0.3338 r val:  0.7022
blyp mae:  5.17807 scaled mae:  0.10589 R2:  0.3762 r val:  0.7557
pbe mae:  5.65872 scaled mae:  0.12107 R2:  -0.0429 r val:  0.5629
tpss mae:  5.03308 scaled mae:  0.11058 R2:  0.2663 r val:  0.6756
scan mae:  5.20967 scaled mae:  0.14784 R2:  0.0169 r val:  0.6809
m06-l mae:  7.89871 scaled mae:  0.18344 R2:  -23.2812 r val:  0.1056
mn15-l mae:  3.99876 scaled mae:  0.13287 R2:  -0.2121 r val:  0.2777
b3p86 mae:  5.53022 scaled mae:  0.19485 R2:  -4.4774 r val:  0.4422
b3pw91 mae:  7.59097 scaled mae:  0.28091 R2:  -13.5836 r val:  0.3186
b3lyp mae:  3.76789 scaled mae:  0.12289 R2:  0.2608 r val:  0.7207
tpssh mae:  5.36817 scaled mae:  0.13869 R2:  -0.089 r val:  0.5801
scan0 mae:  4.04293 scaled mae:  0.12544 R2:  0.0898 r val:  0.626
m06 mae:  3.29969 scaled mae:  0.13703 R2:  -0.2063 r val:  0.2964
m06-2x mae:  2.67681 scaled mae:  0.09655 R2:  0.5972 r val:  0.8092
wb97x mae:  6.29522 scaled mae:  0.22941 R2:  

In [62]:
# Violin plot of the per-DFA MAE values stored in ./csd_mae_reg_mergedG10.json
# (keys are used as hover text, values as the plotted distribution).
# NOTE(review): this JSON is read from the working directory; no visible cell writes it.
with open("./csd_mae_reg_mergedG10.json", "r") as fin:
    mae_dict = json.load(fin)
data=go.Violin(
    y=list(mae_dict.values()),
    box_visible=True,
    line_color='black',
    meanline_visible=True, 
    fillcolor='lightseagreen',
    pointpos=-1.5,
    points="all",
    opacity=0.6,
    x0='TL models',
    text=list(mae_dict.keys()),
    )
layout = go.Layout()
layout.update(glob_layout)
layout["xaxis"].update({'range': [-0.6, 0.4]})
layout["yaxis"].update({'title': "MAE (kcal/mol)", "range": [1.5, 8]})
layout.update(width=500, height=500, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Static export; the target directory is relative to the notebook's working directory.
fig.write_image("../../../../Dropbox (MIT)/CD-DFARecFull/Figures/F4/TL_rec_MAE.pdf")

### Analyze

##### sort based on ML predicted |DFA - DLPNO-CCSD(T)| vertical spin splitting to select DFAs

In [9]:
# One row per CSD complex; columns are the keys of res_csd.
df_res = pd.DataFrame(res_csd)

In [27]:
# DFAs to leave out of the selection (none by default).
removed = []
# Filter while keeping the order of all_functionals. A set round-trip would make the
# order depend on Python's string-hash seed, which changes between kernel sessions
# and would alter tie-breaking in cells that iterate over `functionals`.
functionals = [f for f in all_functionals if f not in removed]

# Number of best-ranked DFAs kept per complex in df_sel_top.
topx = 6


def _rank_dfas_for_row(row, dfas, missing_err=1000):
    """Rank DFAs for one complex by true and by predicted absolute error.

    Parameters
    ----------
    row : row of df_res providing "<DFA>.y_t" and "<DFA>.y_hat" entries.
    dfas : list of DFA names to rank.
    missing_err : value substituted (for both the true and the predicted error)
        when "<DFA>.y_t" is NaN, which pushes that DFA to the end of both rankings.

    Returns
    -------
    (fs_true, fs_hat, rank)
        fs_true : DFA names sorted by increasing true error.
        fs_hat  : DFA names sorted by increasing predicted error.
        rank    : position of the true error of fs_hat[0] in the sorted true errors
                  (0 means the recommended DFA is also the truly best one).

    Ties are broken by DFA name because (error, name) tuples are sorted.
    """
    has_ref = [not np.isnan(row[f + ".y_t"]) for f in dfas]
    true_errs = [row[f + ".y_t"] if ok else missing_err for f, ok in zip(dfas, has_ref)]
    pred_errs = [row[f + ".y_hat"] if ok else missing_err for f, ok in zip(dfas, has_ref)]
    fs_true = [name for _, name in sorted(zip(true_errs, dfas))]
    fs_hat = [name for _, name in sorted(zip(pred_errs, dfas))]
    rank = sorted(true_errs).index(row[fs_hat[0] + ".y_t"])
    return fs_true, fs_hat, rank


_sel_columns = ("errs_t", "errs_hat", "best_fs", "lower_bound", "true_best_fs", "ranks")
_sel = {col: [] for col in _sel_columns}      # top-1 recommendation per complex
_sel_top = {col: [] for col in _sel_columns}  # top-`topx` recommendations per complex
for _, row in df_res.iterrows():
    fs_true, fs, rank = _rank_dfas_for_row(row, functionals)

    _sel["errs_t"].append(row[fs[0] + ".y_t"])
    _sel["errs_hat"].append(row[fs[0] + ".y_hat"])
    _sel["best_fs"].append(fs[0])
    _sel["lower_bound"].append(row[fs_true[0] + ".y_t"])
    _sel["true_best_fs"].append(fs_true[0])
    _sel["ranks"].append(rank)

    _sel_top["errs_t"].append([round(row[f + ".y_t"], 2) for f in fs[:topx]])
    _sel_top["errs_hat"].append([round(row[f + ".y_hat"], 2) for f in fs[:topx]])
    _sel_top["best_fs"].append(fs[:topx])
    _sel_top["lower_bound"].append([round(row[f + ".y_t"], 2) for f in fs_true[:topx]])
    _sel_top["true_best_fs"].append(fs_true[:topx])
    _sel_top["ranks"].append(rank)

# Per-complex columns shared by both result frames (aligned by row position).
_shared = {"name": df_res["name"].values,
           "dlpno.vertsse": df_res["dlpno.vertsse"].values}

# df_sel: one recommended DFA per complex, sorted by its true absolute error.
df_sel = pd.DataFrame.from_dict({**_sel, **_shared})
df_sel = df_sel.sort_values('errs_t')

# df_sel_top: list-valued columns holding the `topx` best DFAs per complex.
df_sel_top = pd.DataFrame.from_dict({**_sel_top, **_shared})
df_sel_top = df_sel_top.sort_values('errs_t')


##### absolute error distribution

In [28]:
# Distribution plot (plotly figure_factory distplot, 0.5-wide bins) of the true
# absolute error of the DFA recommended for each complex.
hist_data = [df_sel['errs_t'].values]
group_labels = [""]
colors = ['black', blue, green, red]  # only one group is plotted here
fig = ff.create_distplot(
    hist_data,
    group_labels,
    show_hist=True,
    colors=colors,
    bin_size=0.5,
)

layout = go.Layout(showlegend=False, width=550, height=500)
layout.legend.update(x=.5, y=1, bgcolor="rgba(0,0,0,0)")
layout.update(glob_layout)
layout.xaxis.update(title="abs. err. (kcal/mol)")
layout.yaxis.update(title="frequency")
fig.layout.update(layout)
fig.show()

# Mean absolute error of the recommended DFA (displayed as the cell output).
np.mean(df_sel['errs_t'])

3.0807380323347293

##### DFA ranks

In [71]:
# Bar chart: percentage of complexes whose recommended DFA has true rank ii,
# overlaid with a blue step curve of the cumulative percentage on a right-hand axis.
# NOTE(review): 48 rank slots are hard-coded, while the functional list visible in
# this notebook has 45 entries -- confirm which number is intended.
n_ranks = 48
xs = list(range(n_ranks))
y = [len(df_sel[df_sel["ranks"] == ii])*100./len(df_sel) for ii in xs]
# ys[ii] is the cumulative percentage over ranks strictly below ii.
ys = [np.sum(y[:ii])*100./np.sum(y) for ii in xs]

data = [go.Bar(x=list(xs), y=y, name='all',
               marker_color='rgba(0, 0, 0, 0.5)', showlegend=False)]
step_style = dict(mode='lines', yaxis="y2",
                  line=dict(color='blue', width=2, dash='solid'),
                  showlegend=False)
for left in range(n_ranks - 1):
    right = left + 1
    # vertical riser at x = left, then horizontal tread up to x = right
    data.append(go.Scatter(x=[xs[left], xs[left]], y=[ys[left], ys[right]], **step_style))
    data.append(go.Scatter(x=[xs[left], xs[right]], y=[ys[right], ys[right]], **step_style))

layout = go.Layout()
layout.update(glob_layout)
layout.legend.update(x=1, y=1, bgcolor="rgba(0,0,0,0)")
layout.xaxis.update(title="DFA rank")
layout.yaxis.update(title="percentage", mirror=False)
# Secondary y axis (right side) for the cumulative curve.
layout.update(yaxis2=dict(
    title="cumulative percentage",
    titlefont=dict(color="black"),
    tickfont=dict(color="black"),
    anchor="free",
    overlaying="y",
    side="right",
    position=1,
    range=[0, 100],
    showgrid=True,
    zeroline=True,
    ticks="inside",
    showline=True,
    tickwidth=3,
    linewidth=3,
    ticklen=10,
))
layout.update(width=600, height=500, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# fig.write_image("../../../../Dropbox (MIT)/CD-DFARecFull/Figures/F4/percentage.pdf")

##### likelihood of top 5 DFAs

In [36]:
# Reference data read from ./vss452_rec_top5.json; later cells index its "DFA" and
# "percentage" entries by position. The `with` block closes the file after loading.
with open("./vss452_rec_top5.json") as fin:
    vss_top5 = json.load(fin)

In [69]:
### ----top-5----
from collections import Counter  # unused in this cell; kept in case other cells rely on the name

cutoff = 15  # number of DFAs shown on the x axis

# How often each DFA appears among the recommended ("best_fs") and among the truly
# best ("true_best_fs") entries of df_sel_top.
top5_bfs = {f: 0 for f in functionals}
true_top5_bfs = {f: 0 for f in functionals}
for _, row in df_sel_top.iterrows():
    for f in row["best_fs"]:
        top5_bfs[f] += 1
    for f in row["true_best_fs"]:
        true_top5_bfs[f] += 1

# Order DFAs by recommended count, ties broken by the true count. The previous code
# did this by adding 0.01 * true count to the recommended count, which also leaked
# into the plotted bar heights and the normalisation; sorting on a tuple keeps the
# counts exact.
bfs = dict(sorted(top5_bfs.items(),
                  key=lambda item: (item[1], true_top5_bfs[item[0]]),
                  reverse=True))
shown = list(bfs.keys())[:cutoff]
use_bfs_true = {}
vss_rec = {}
for f in shown:
    use_bfs_true[f] = true_top5_bfs[f]
    ind = vss_top5["DFA"].index(f)  # raises ValueError if f is missing from the JSON
    vss_rec[f] = vss_top5["percentage"][ind]

# Counts -> percentages. 600 = 100 * 6, matching the six DFAs kept per row in
# df_sel_top (topx = 6 where df_sel_top is built).
# NOTE(review): the axis title says "top 5" while six DFAs per complex are counted -- confirm.
to_percent = 600./np.sum(list(bfs.values()))
data = [go.Bar(x=shown,
               y=np.array(list(bfs.values()))[:cutoff]*to_percent, name='rec', marker_color='rgba(0, 0, 255, 0.6)'),
        go.Bar(x=list(use_bfs_true.keys()),
               y=np.array(list(use_bfs_true.values()))*to_percent, name='true', marker_color='rgba(0, 240, 64, 0.6)'),
        # x now lists exactly the DFAs that have a y value (previously all DFAs were
        # passed as x against only `cutoff` y values).
        # NOTE(review): this trace shares the legend name 'true' with the bar above -- confirm.
        go.Scatter(x=list(vss_rec.keys()),
                   y=np.array(list(vss_rec.values())), name='true', marker_color='rgba(255, 0, 0, 0.6)',
                   mode="markers+lines",
                   marker=dict(size=10),
                   line=dict(width=1.5, dash="dash"))
       ]
layout = go.Layout()
layout.update(glob_layout)
layout["xaxis"].update({"range": [-0.5, 14.5]})
layout["yaxis"].update({'title': "likelihood of top 5", "range": [0, 65]})
layout.update(width=1000, height=500, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
fig.write_image("../../../../Dropbox (MIT)/CD-DFARecFull/Figures/F4/top5.pdf")

In [35]:
# Scratch lookup: position of the DFA currently bound to `f` within vss_top5["DFA"].
# NOTE(review): `f` is not defined in this cell -- it is whatever an earlier loop left
# behind, so the result depends on which cells ran before and in what order.
vss_top5["DFA"].index(f)

20

In [14]:
#---Spearman rank correlation between the recommender's DFA ordering and the ground-truth ordering---
TL_ranked_DFAs = list(bfs)
# Ground-truth ordering: DFAs sorted by how often they are truly among the best.
bfs_true = dict(sorted(true_top5_bfs.items(), key=lambda kv: kv[1], reverse=True))
# Position of every DFA in the recommender ordering (names are unique dict keys).
tl_position = {name: pos for pos, name in enumerate(TL_ranked_DFAs)}
l1 = list(range(len(bfs_true)))
l2 = [tl_position[name] for name in bfs_true]
spearmanr(l1, l2)

SpearmanrResult(correlation=0.8823451910408432, pvalue=1.144141383269634e-15)

In [26]:
# Hard-coded percentage differences per top-k bucket; where the individual numbers
# come from is not shown in this notebook.
diff_dict = {
    "top-1": 17.76 - 15.78,
    "top-3": 40.13 - 35.52,
    "top-5": 66.44 - 61.84,
    "top-10": 84.86 - 85.52,
    "top-20": 96.36 - 98.68,
}

# The bars show the negated differences.
data = [
    go.Bar(
        x=list(diff_dict),
        y=[-value for value in diff_dict.values()],
        marker=dict(color="rgba(140, 140, 140, 0.6)"),
    ),
]

layout = go.Layout()
layout.update(glob_layout)
# NOTE(review): tick value 4 lies outside the [-5, 3] axis range.
layout.yaxis.update(
    title="%CSD-76 - %VSS-452",
    range=[-5, 3],
    tickvals=[-4, -2, 0, 2, 4],
)
layout.update(width=500, height=500, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# fig.write_image("../../../../Dropbox (MIT)/CD-DFARecFull/Figures/F4/percentage_comparison.pdf")