In [1]:
import numpy as np
import pandas as pd
import json
import pickle
from sklearn.metrics import r2_score, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
import copy

##### load for plotting (with plotly)

In [2]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
import plotly
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
# Path of the orca executable configured for plotly.
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
plotly.io.orca.config.executable = '/Users/chenruduan/opt/anaconda3/envs/mols_newplotly/bin/orca'
init_notebook_mode(connected=True)

# Styling shared by the x and y axes: no grid or zero line, visible axis line,
# inward ticks mirrored on all sides, black everywhere.
_axis_style = dict(
    showgrid=False, zeroline=False, ticks="inside", showline=True,
    tickwidth=1.5, linewidth=1.5, ticklen=10, linecolor='black',
    mirror="allticks", color="black",
)

# Base layout that the plotting cells merge into their own layout.
glob_layout = go.Layout(
    font=dict(family='Helvetica', size=24, color='black'),
    margin=dict(l=100, r=10, t=10, b=100),
    xaxis=dict(_axis_style),
    yaxis=dict(_axis_style),
    legend_orientation="v",
    paper_bgcolor='rgba(255,255,255,100)',
    plot_bgcolor='white',
)

# Named colours reused by the figures.
blue, red = "rgba(0, 0, 255, 1)", "rgba(255, 0, 0, 1)"
green, gray = "rgba(0, 196, 64, 1)", "rgba(140, 140, 140, 1)"

##### load for ML parts 

In [3]:
from dfa_recommender.net import GatedNetwork, MySoftplus, TiledMultiLayerNN, MLP, finalMLP, ElementalGate
from dfa_recommender.dataset import SubsetDataset
from dfa_recommender.sampler import InfiniteSampler
from dfa_recommender.ml_utils import numpy_to_dataset
import torch
from torch.utils.data import DataLoader

##### DFAs that we considered

In [4]:
# Columns that identify a complex rather than a functional.
base_keys = ["name", "path", "metal"]

# Parent functionals, in the order used throughout the notebook.
functionals = [
    "bp86", "blyp", "pbe",
    "tpss", "scan", "m06-l", "mn15-l",
    "b3p86", "b3pw91", "b3lyp",
    "tpssh", "scan0", "m06", "m06-2x",
    "wb97x", "LRC-wPBEh",
    "b2gpplyp", "pbe0-dh", "dsd-blyp-d3bj", "dsd-pbeb95-d3bj", "dsd-pbep86-d3bj",
]

# Variants named "<parent>_hfx_<percent>".
# NOTE(review): m06-l has no 20% entry in the original list; the gap is kept as-is.
_hfx_percentages = {
    "blyp": (10, 20, 30, 40, 50),
    "pbe": (10, 20, 30, 40, 50),
    "scan": (10, 20, 30, 40, 50),
    "m06-l": (10, 30, 40, 50),
    "mn15-l": (10, 20, 30, 40, 50),
}
functionals += [
    "%s_hfx_%d" % (parent, pct)
    for parent, pcts in _hfx_percentages.items()
    for pct in pcts
]

# Independent copy, because `functionals` is rebound later in the notebook.
all_functionals = copy.deepcopy(functionals)

### Predict the vertical spin-splitting energy on the 452 set

In [5]:
# Runtime configuration: CPU only, 4 torch threads, fixed seeds for torch and numpy.
torch.set_num_threads(4)
_seed = 0
torch.manual_seed(_seed)
np.random.seed(_seed)
device, num_workers = torch.device('cpu'), 0

##### set path for relevant data files: a random train/test split, where the model is tested on the in-distribution test data

In [6]:
from pkg_resources import resource_filename, Requirement
# Data directory of the installed dfa_recommender distribution; the feature pickle,
# the CSV, the y-scalers and the trained models are all read from below this path.
basepath = resource_filename(Requirement.parse("dfa_recommender"), "/dfa_recommender/data/")

In [7]:
# NOTE: pickle.load can execute arbitrary code -- only load the files shipped with the package.
# Files are opened with context managers so the handles are closed right after loading.
with open(basepath + "/X_vss452.pickle", "rb") as fin:
    X_org = pickle.load(fin)  ## features
df_org = pd.read_csv(basepath + "/VSS-452.csv")  ## csv file that stores the computed vert SSE values at different methods
with open(basepath + "/abs-reg-y_scalers.pkl", "rb") as fin:
    y_scalers = pickle.load(fin)  ## sklearn.preprocessing.StandardScaler object created on the stats of training data

# Row positions of the train (flag == 1) and test (flag == 0) complexes;
# rows with any other flag value belong to neither split.
train_flags = df_org["train"].values
tr_inds = [ii for ii, val in enumerate(train_flags) if val == 1]
te_inds = [ii for ii, val in enumerate(train_flags) if val == 0]
len(tr_inds), len(te_inds)


Trying to unpickle estimator StandardScaler from version 0.24.2 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations



(300, 152)

In [60]:
# Five-DFA shortlist -- only use when interpreting why certain DFAs are
# selected at certain ligand fields.
fbest5 = ["mn15-l_hfx_50", "blyp_hfx_50", "m06-l_hfx_40", "scan_hfx_40", "pbe_hfx_30"]

In [64]:
# Pearson r between each shortlisted DFA's "delta" column and the DLPNO-CCSD(T)
# reference, using only the rows where the delta value is present.
for f in fbest5:
    delta_col = "delta.%s.vertsse" % f
    _df = df_org.dropna(subset=[delta_col])
    r_value = pearsonr(_df[delta_col].values, _df["dlpno-CCSD_T.vertsse"].values)[0]
    print(f, r_value)

mn15-l_hfx_50 -0.287844709468351
blyp_hfx_50 -0.4548702516281156
m06-l_hfx_40 -0.1967919200661769
scan_hfx_40 -0.2766586962527391
pbe_hfx_30 -0.3292976120220115


In [66]:
# Scatter of the m06-l_hfx_40 "delta" column against the DLPNO-CCSD(T) value for all rows of df_org.
# NOTE(review): this column is passed through np.abs() before model training elsewhere in
# the notebook, but is plotted here as-is while the y axis is titled "absolute error" and
# limited to [0, 20] -- confirm whether np.abs(...) was intended.
trace = go.Scatter(
    y=df_org["delta.m06-l_hfx_40.vertsse"].values,
    x=df_org["dlpno-CCSD_T.vertsse"].values,
    mode="markers",
    marker=dict(size=6)
)
data = [trace]
layout = go.Layout()
# Start from the shared notebook layout, then override per-figure settings.
layout.update(glob_layout)
layout.legend.update(x=1, y=1, bgcolor="rgba(0,0,0,0)")
layout["yaxis"].update({'title': "absolute error (kcal/mol)",  "range": [0, 20],})
layout["xaxis"].update({'title': "DLPNO DEH–L (kcal/mol)", "range": [-100, -10], "tickvals": [-100, -90, -80, -70, -60, -50, -40, -30, -20, -10]})
layout.update(width=600, height=550, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

##### predict |DFA - DLPNO-CCSD(T)| vertical spin splitting

In [10]:
# Evaluate the pre-trained model of every functional on the held-out test complexes.
#   mae_dict[f]            -> test MAE in original (unscaled) units
#   res_all[f + ".y_t"]    -> true |DFA - DLPNO-CCSD(T)| per test complex
#   res_all[f + ".y_hat"]  -> predicted |DFA - DLPNO-CCSD(T)| per test complex
# Only the test split is needed, so no training datasets or loaders are built.

# Quantities that do not depend on the functional are computed once, outside the loop.
X_te = X_org[te_inds]
res_all = {
    "name": df_org["name"].values[te_inds],
    "metal": df_org["metal"].values[te_inds],
    "dlpno.vertsse": df_org["dlpno-CCSD_T.vertsse"].values[te_inds],
}
mae_dict = {}
for f in all_functionals:
    # Labels are absolute deviations, standardised with the scaler stored for this functional.
    y_scaler = y_scalers[f]
    y_abs = np.abs(df_org["delta.%s.vertsse" % f].values)
    y_scaled = y_scaler.transform(y_abs.reshape(-1, 1)).reshape(-1, )
    y_te = y_scaled[te_inds]

    data_te = numpy_to_dataset(X_te, y_te, regression=True)
    te_l = SubsetDataset(data_te, list(range(len(data_te))))
    # One batch holding the whole test set, in dataset order.
    te_loader = DataLoader(te_l, len(te_l), num_workers=num_workers)

    # NOTE: pickle.load can execute arbitrary code -- only load model files shipped with the package.
    with open(basepath + "/models-trends/mergedG10-abs-reg-%s.pkl" % f, "rb") as fin:
        best_model = pickle.load(fin)
    best_model.eval()

    preds, labels = [], []
    with torch.no_grad():
        for x, y in te_loader:
            preds.append(best_model(x.to(device)).cpu().numpy())
            labels.append(y.cpu().numpy())

    # Back to original units.
    y_t = y_scaler.inverse_transform(np.concatenate(labels).reshape(-1, 1)).reshape(-1, )
    y_hat = y_scaler.inverse_transform(np.concatenate(preds).reshape(-1, 1)).reshape(-1, )

    # Complexes without a reference label (NaN) are excluded from the MAE only;
    # they stay in res_all so every column keeps one entry per test complex.
    non_nan_inds = np.where(~np.isnan(y_t))[0]
    _y_t = y_t[non_nan_inds]
    _y_hat = y_hat[non_nan_inds]
    mae = mean_absolute_error(_y_t, _y_hat)

    mae_dict[f] = np.float64(mae)
    res_all[f + ".y_t"] = np.abs(y_t)
    res_all[f + ".y_hat"] = np.abs(y_hat)

### TL model errors

In [10]:
# Prefer the cached per-DFA MAEs stored next to the notebook; if that file is absent,
# keep the `mae_dict` computed by the evaluation cell instead of failing.
try:
    with open("mae_reg_mergedG10.json", "r") as fin:
        mae_dict = json.load(fin)
except FileNotFoundError:
    print("mae_reg_mergedG10.json not found; using the MAEs computed in this session")

# Violin of the test MAEs, one point per DFA, with the DFA name as hover text.
data=go.Violin(
    y=list(mae_dict.values()),
    box_visible=True,
    line_color='black',
    meanline_visible=True, 
    fillcolor='lightseagreen',
    pointpos=-1.5,
    points="all",
    opacity=0.6,
    x0='TLM',
    text=list(mae_dict.keys()),
    )
layout = go.Layout()
# Start from the shared notebook layout, then override per-figure settings.
layout.update(glob_layout)
layout["xaxis"].update({'range': [-0.6, 0.4]})
layout["yaxis"].update({'title': "MAE (kcal/mol)", "range": [1.5, 4]})
layout.update(width=500, height=500, boxmode='group')
fig = dict(data=data, layout=layout)
iplot(fig)

In [11]:
def _display_name(dfa):
    """Turn an internal DFA key into its axis label.

    "<parent>_hfx_<pct>" becomes "<parent>:<pct>%"; the label is upper-cased and,
    unless it contains "91", every "W" is replaced by a lower-case omega.
    """
    if "_hfx_" in dfa:
        pieces = dfa.split("_")
        dfa = pieces[0] + ":%s%s" % (pieces[-1], "%")
    label = dfa.upper()
    if "91" in dfa:
        return label
    return label.replace("W", "\u03C9")


x, y = [], []
# Order the DFAs from lowest to highest MAE.
mae_dict = dict(sorted(mae_dict.items(), key=lambda kv: kv[1]))
dfa_names = []
for f in list(mae_dict.keys()):
    dfa_names.append(_display_name(f))

mae_bars = go.Bar(x=dfa_names, y=list(mae_dict.values()), name='all')
data = [mae_bars]
layout = go.Layout()
layout.update(glob_layout)
layout["xaxis"].update({'title': "DFA"})
layout["yaxis"].update({'title': "MAE (kcal/mol)", })
layout.update(width=1600, height=500, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/SI/SI_Figures/TLMAEs/F1.pdf")

In [12]:
# Box plot (with mean marker and outlier points) of the "4.deltaE" column of df_org.
# NOTE(review): `text` is the list of DFA names from mae_dict, which has one entry per DFA
# rather than one per row of df_org, and the y-axis title says "MAE" -- both look like
# leftovers from the violin cell; confirm the intended hover text and axis label.
data=go.Box(
    y=list(df_org["4.deltaE"].values),
    line_color='black',
    fillcolor='lightseagreen',
    boxpoints='outliers',
    opacity=0.6,
    text=list(mae_dict.keys()),
    boxmean=True,
    # notched=True,
    )
layout = go.Layout()
# Start from the shared notebook layout, then override per-figure settings.
layout.update(glob_layout)
layout["xaxis"].update({'range': [-0.75, 0.75]})
layout["yaxis"].update({'title': "MAE (kcal/mol)", "range": [0, 6]})
layout.update(width=500, height=500, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/SI/SI_Figures/BoxErrTop5/F1.pdf")

### Analyze

##### sort based on ML predicted |DFA - DLPNO-CCSD(T)| vertical spin splitting to select DFAs

In [13]:
# One row per test complex; columns are the entries collected in res_all.
df_res = pd.DataFrame(res_all)
len(df_res)

152

In [37]:
# Error assigned to a DFA that has no reference label for a complex, so that it
# sorts behind every DFA that does have one.
MISSING_LABEL_ERROR = 1000


def _rank_dfas(row, dfas):
    """Rank the candidate DFAs for one test complex.

    Parameters
    ----------
    row : row of df_res, with "<dfa>.y_t" (true) and "<dfa>.y_hat" (predicted)
        absolute errors for every entry of `dfas`.
    dfas : list of DFA keys to choose from.

    Returns
    -------
    fs_hat : DFAs sorted by predicted error (the recommender's ordering).
    fs_true : DFAs sorted by true error (the best achievable ordering).
    rank : position of the recommended DFA's true error among the sorted true errors.
    """
    has_label = [not np.isnan(row[f + ".y_t"]) for f in dfas]
    res_true = [row[f + ".y_t"] if ok else MISSING_LABEL_ERROR
                for f, ok in zip(dfas, has_label)]
    # Predictions of unlabeled DFAs are masked too, so they can never be recommended
    # ahead of a DFA whose true error is known.
    res_hat = [row[f + ".y_hat"] if ok else MISSING_LABEL_ERROR
               for f, ok in zip(dfas, has_label)]
    fs_true = [x for _, x in sorted(zip(res_true, dfas))]
    fs_hat = [x for _, x in sorted(zip(res_hat, dfas))]

    picked_err = row[fs_hat[0] + ".y_t"]
    if np.isnan(picked_err):
        # Only reachable when no candidate has a label; look up the sentinel instead
        # of NaN, which list.index() can never find.
        picked_err = MISSING_LABEL_ERROR
    rank = sorted(res_true).index(picked_err)
    return fs_hat, fs_true, rank


removed = []  # DFAs to drop from the full candidate pool
# True  -> choose among the five-DFA shortlist (only use when interpreting why
#          certain DFAs are selected at certain ligand fields).
# False -> choose among every DFA in all_functionals except those in `removed`.
RESTRICT_TO_SHORTLIST = True
if RESTRICT_TO_SHORTLIST:
    functionals = [
        "mn15-l_hfx_50",
        "blyp_hfx_50",
        "m06-l_hfx_40",
        "scan_hfx_40",
        "pbe_hfx_30",
    ]
else:
    functionals = [f for f in all_functionals if f not in removed]

# Number of leading DFAs kept per complex in df_sel_top (never more than the pool size).
topx = min(4 + 1, len(functionals))

# A single pass over the test complexes fills both tables.
top1_records, topx_records = [], []
for _, row in df_res.iterrows():
    fs, fs_true, rank = _rank_dfas(row, functionals)
    top1_records.append({
        "errs_t": row[fs[0] + ".y_t"],
        "errs_hat": row[fs[0] + ".y_hat"],
        "best_fs": fs[0],
        "lower_bound": row[fs_true[0] + ".y_t"],
        "true_best_fs": fs_true[0],
        "ranks": rank,
        "name": row["name"],
        "dlpno.vertsse": row["dlpno.vertsse"],
    })
    topx_records.append({
        "errs_t": [round(row[fs[ii] + ".y_t"], 2) for ii in range(topx)],
        "errs_hat": [round(row[fs[ii] + ".y_hat"], 2) for ii in range(topx)],
        "best_fs": [fs[ii] for ii in range(topx)],
        "lower_bound": [round(row[fs_true[ii] + ".y_t"], 2) for ii in range(topx)],
        "true_best_fs": [fs_true[ii] for ii in range(topx)],
        "ranks": rank,
        "name": row["name"],
        "dlpno.vertsse": row["dlpno.vertsse"],
        "metal": row["metal"],
    })

# df_sel: the single recommended DFA per complex, sorted by its true error.
df_sel = pd.DataFrame(
    top1_records,
    columns=["errs_t", "errs_hat", "best_fs", "lower_bound", "true_best_fs",
             "ranks", "name", "dlpno.vertsse"],
).sort_values('errs_t')

# df_sel_top: the top-`topx` DFAs per complex; list-valued columns, sorted by the error lists.
df_sel_top = pd.DataFrame(
    topx_records,
    columns=["errs_t", "errs_hat", "best_fs", "lower_bound", "true_best_fs",
             "ranks", "name", "dlpno.vertsse", "metal"],
).sort_values('errs_t')


In [41]:
# pd.set_option('display.max_rows', None)
# df_sel_top

##### absolute error distribution

In [15]:
# Distribution plot of the recommender's realized absolute error (df_sel['errs_t']):
# histogram with 0.25-wide bins plus a rug, drawn with plotly's figure factory.
hist_data = [df_sel['errs_t'].values]
group_labels = [""]
colors = ['gray', blue, green, red]
fig = ff.create_distplot(hist_data, group_labels, show_hist=True, colors=colors, bin_size=0.25, show_rug=True)
layout = go.Layout()
layout.legend.update(x=.5, y=1, bgcolor="rgba(0,0,0,0)")
layout.update(showlegend=False)
layout.update(width=550, height=500)
# The shared layout is merged after the settings above and before the axis overrides below.
layout.update(glob_layout)
layout["xaxis"].update({'title': "abs. err. (kcal/mol)", "range": [0, 10]})
layout["yaxis"].update({'title': "normalized frequency", "range": [0, 0.5]})
fig.layout.update(layout)
fig.show()

# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/Figures/F2/ErrDistr.pdf")
# Mean realized absolute error over the rows of df_sel (cell output).
np.mean(df_sel['errs_t'])

2.1270691410234

In [16]:
# Percentage of rows of df_sel per error bin, plus a cumulative step curve on a second axis.
# Bin edges: 0, 0.5, ..., 10.
err_ranges = np.linspace(0, 10, 21)
y = []
# Percentage of rows whose error falls in [edge_i, edge_{i+1}).
for ii, err in enumerate(err_ranges[:-1]):
    y += [len(df_sel[(df_sel["errs_t"] >= err) & (df_sel["errs_t"] < err_ranges[ii+1])])*100./len(df_sel)]
# Overflow bin: everything at or above the last edge.
y += [len(df_sel[(df_sel["errs_t"] >= err_ranges[-1])])*100./len(df_sel)]
data = [go.Bar(x=err_ranges,
               y=y, name='all', marker_color='rgba(0, 0, 0, 0.5)', showlegend=False),]
xs=err_ranges
# ys[i] is the cumulative percentage of all bins strictly before bin i.
ys=[np.sum(y[:ii])*100./np.sum(y) for ii, _ in enumerate(xs)]
# Draw the cumulative curve as a staircase: one vertical and one horizontal segment per bin.
for ii in range(xs.shape[0]-1):
    data += [go.Scatter(x=[xs[ii], xs[ii]], y=[ys[ii], ys[ii+1]], mode='lines', yaxis="y2", line=dict(color='blue', width=2, dash='solid'), showlegend=False)]
    data += [go.Scatter(x=[xs[ii], xs[ii+1]], y=[ys[ii+1], ys[ii+1]], mode='lines', yaxis="y2", line=dict(color='blue', width=2, dash='solid'), showlegend=False)]
layout = go.Layout()
layout.update(glob_layout)
layout.legend.update(x=1, y=1, bgcolor="rgba(0,0,0,0)")
layout["xaxis"].update({'title': "recommender error (kcal/mol)"})
layout["yaxis"].update({'title': "percentage", "mirror": False})
# Second y axis on the right-hand side for the cumulative curve (traces with yaxis="y2").
layout.update({"yaxis2": dict(
    title="cumulative percentage",
    titlefont=dict(color="black"),
    tickfont=dict(color="black"),
    anchor="free",
    overlaying="y",
    side="right",
    position=1,
    range=[0, 100],
    showgrid=True,  
    zeroline=True, 
    ticks="inside", 
    showline=True,
    tickwidth=3, 
    linewidth=3, 
    ticklen=10,
#     mirror=True,
)},)
layout.update(width=600, height=500, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/Figures/F2/ErrBar.pdf")

In [17]:
# (number of rows of df_sel with errs_t below 3, total number of rows)
int((df_sel['errs_t'] < 3).sum()), len(df_sel)

(117, 152)

In [86]:
# Fraction of rows of df_sel with errs_t below 3, computed from the data instead of
# hard-coding counts copied from an earlier output (those go stale on re-runs).
len(df_sel[df_sel['errs_t'] < 3]) / len(df_sel)

0.7697368421052632

In [18]:
# Display the per-complex selection table (sorted by errs_t in the cell that builds it).
df_sel

Unnamed: 0,errs_t,errs_hat,best_fs,lower_bound,true_best_fs,ranks,name,dlpno.vertsse
101,0.011120,4.754551,m06-2x,0.011120,m06-2x,0,fe_3_acetonitrile-N-1-0_acetonitrile-N-1-0_ace...,-58.564294
142,0.023864,0.483150,mn15-l_hfx_40,0.023864,mn15-l_hfx_40,0,mn_2_pyridine-N-3-0_pyridine-N-3-0_pyridine-N-...,-82.512136
128,0.052006,0.661040,mn15-l_hfx_50,0.052006,mn15-l_hfx_50,0,mn_2_furan-O-0-0_furan-O-0-0_furan-O-0-0_furan...,-92.709356
76,0.086723,0.381381,mn15-l_hfx_50,0.086723,mn15-l_hfx_50,0,fe_2_formaldehyde-O-1-0_formaldehyde-O-1-0_for...,-47.819742
95,0.122908,0.458692,pbe0-dh,0.122908,pbe0-dh,0,fe_2_phosphine-P-0-0_phosphine-P-0-0_phosphine...,-44.352643
...,...,...,...,...,...,...,...,...
3,6.974671,0.833956,m06-l_hfx_40,0.000174,pbe_hfx_20,25,co_2_acetonitrile-N-1-0_acetonitrile-N-1-0_ace...,-29.716134
129,7.279782,1.374167,mn15-l_hfx_50,0.388184,scan_hfx_30,19,mn_2_furan-O-0-0_furan-O-0-0_furan-O-0-0_furan...,-102.228776
51,7.338565,5.455674,m06,0.734757,mn15-l_hfx_20,9,cr_3_pyridine-N-3-0_pyridine-N-3-0_pyridine-N-...,-40.151677
37,7.803490,1.247773,blyp_hfx_40,0.049328,b3p86,17,cr_2_misc-C-4-0_misc-C-4-0_misc-C-4-0_acetonit...,-53.701043


In [19]:
# Pearson correlation (coefficient, p-value) between the realized error of the
# recommended DFA and the DLPNO-CCSD(T) value of each complex.
pearsonr(df_sel["errs_t"], df_sel["dlpno.vertsse"])

(0.0015649160984797972, 0.9847339494428788)

In [29]:
# Scatter of the recommended DFA's realized error against the DLPNO-CCSD(T) value, one marker per row of df_sel.
trace = go.Scatter(
    y=df_sel["errs_t"].values,
    x=df_sel["dlpno.vertsse"],
    mode="markers",
    marker=dict(size=6)
)
data = [trace]
layout = go.Layout()
# Start from the shared notebook layout, then override per-figure settings.
layout.update(glob_layout)
layout.legend.update(x=1, y=1, bgcolor="rgba(0,0,0,0)")
layout["yaxis"].update({'title': "absolute error (kcal/mol)",  "range": [0, 10],})
layout["xaxis"].update({'title': "DLPNO DEH–L (kcal/mol)", "range": [-100, -10], "tickvals": [-100, -90, -80, -70, -60, -50, -40, -30, -20, -10]})
layout.update(width=600, height=550, boxmode='group')
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/SI/SI_Figures/ErrsVsDE/ErrsVsDE.pdf")

##### DFA ranks

In [30]:
# Percentage of rows of df_sel per value of the "ranks" column (0 = the recommended DFA
# is also the truly best one), plus a cumulative step curve on a second axis.
# NOTE(review): 48 (and 47 below) are hard-coded rank counts; they are not derived from
# len(functionals) -- confirm they cover the largest rank that can occur.
y = []
for ii in range(48):
    y += [len(df_sel[df_sel["ranks"] == ii])*100./len(df_sel)]
data = [go.Bar(x=list(range(48)),
               y=y, name='all', marker_color='rgba(0, 0, 0, 0.5)', showlegend=False),]
xs=list(range(48))
# ys[i] is the cumulative percentage of all ranks strictly below i.
ys=[np.sum(y[:ii])*100./np.sum(y) for ii in xs]
# Staircase: one vertical and one horizontal segment per rank.
for ii in range(47):
    data += [go.Scatter(x=[xs[ii], xs[ii]], y=[ys[ii], ys[ii+1]], mode='lines', yaxis="y2", line=dict(color='blue', width=2, dash='solid'), showlegend=False)]
    data += [go.Scatter(x=[xs[ii], xs[ii+1]], y=[ys[ii+1], ys[ii+1]], mode='lines', yaxis="y2", line=dict(color='blue', width=2, dash='solid'), showlegend=False)]
layout = go.Layout()
layout.update(glob_layout)
layout.legend.update(x=1, y=1, bgcolor="rgba(0,0,0,0)")
layout["xaxis"].update({'title': "DFA rank"})
layout["yaxis"].update({'title': "percentage", "mirror": False})
# Second y axis on the right-hand side for the cumulative curve (traces with yaxis="y2").
layout.update({"yaxis2": dict(
    title="cumulative percentage",
    titlefont=dict(color="black"),
    tickfont=dict(color="black"),
    anchor="free",
    overlaying="y",
    side="right",
    position=1,
    range=[0, 100],
    showgrid=True,  
    zeroline=True, 
    ticks="inside", 
    showline=True,
    tickwidth=3, 
    linewidth=3, 
    ticklen=10,
#     mirror=True,
)},)
layout.update(width=600, height=500, boxmode='group')
fig = dict(data=data, layout=layout)
iplot(fig)

##### likelihood of top 5 DFAs

In [31]:
### ----top-5----
# Count how often each DFA appears among the recommended top-x DFAs ("rec") and among
# the truly best top-x DFAs ("true") over all rows of df_sel_top, then plot both.
cutoff = 15  # maximum number of DFAs shown on the x axis
top5_bfs = {}
for f in functionals:
    top5_bfs[f] = 0
for _, row in df_sel_top.iterrows():
    for f in row["best_fs"]:
        top5_bfs[f] += 1
true_top5_bfs = {}
for f in functionals:
    true_top5_bfs[f] = 0
for _, row in df_sel_top.iterrows():
    for f in row["true_best_fs"]:
        true_top5_bfs[f] += 1

# Add a small multiple of the true count so that DFAs with equal recommended counts
# are ordered by their true counts in the sort below.
# NOTE(review): this offset stays in the plotted "rec" values and in the normalisation sum.
for f in top5_bfs:
    top5_bfs[f] += true_top5_bfs[f]*0.01

# NOTE(review): Counter is imported but not used in this cell.
from collections import Counter
bfs = dict(sorted(top5_bfs.items(), key=lambda item: item[1], reverse=True))
# Re-key the true counts in the same (recommended) order so both bar series share the x order.
use_bfs_true = {}
for f in list(bfs.keys()):
    use_bfs_true[f] = true_top5_bfs[f]
# NOTE(review): counts are scaled by 600 / sum(bfs values); the origin of the factor 600
# is not visible here -- confirm the intended normalisation.
data = [go.Bar(x=list(bfs.keys())[:cutoff],
               y=np.array(list(bfs.values()))[:cutoff]*600./np.sum(list(bfs.values())), name='rec', marker_color='rgba(0, 0, 255, 0.5)'),
        go.Bar(x=list(use_bfs_true.keys())[:cutoff],
               y=np.array(list(use_bfs_true.values()))[:cutoff]*600./np.sum(list(bfs.values())), name='true', marker_color='rgba(0, 255, 0, 0.5)'),
       ]
layout = go.Layout()
layout.update(glob_layout)
layout["xaxis"].update({'title': "DFA"})
layout["yaxis"].update({'title': "likelihood of top 5", })
layout.update(width=1000, height=500, boxmode='group')
fig = dict(data=data, layout=layout)
iplot(fig)

In [32]:
#---Calculate rank ordering of the recommender selected DFAs and the ground truth---
# Spearman correlation between the DFA ordering by true top-x counts and the
# ordering by recommended top-x counts.
TL_ranked_DFAs = list(bfs.keys())
bfs_true = dict(sorted(true_top5_bfs.items(), key=lambda kv: kv[1], reverse=True))
# l1: positions 0..n-1 in the true ordering; l2: position of the same DFA in the recommended ordering.
l1 = list(range(len(bfs_true)))
l2 = [TL_ranked_DFAs.index(f) for f in bfs_true]
spearmanr(l1, l2)

SpearmanrResult(correlation=0.9478260869565216, pvalue=5.694351597600022e-23)

##### with specific metal

In [33]:
### ----top-5----
# Same top-x counting as for the full test set, restricted to the rows of df_sel_top
# whose "metal" column equals "mn".
cutoff = 15  # maximum number of DFAs shown on the x axis
top5_bfs = {}
_df_sel_top = df_sel_top[df_sel_top["metal"] == "mn"]
for f in functionals:
    top5_bfs[f] = 0
for _, row in _df_sel_top.iterrows():
    for f in row["best_fs"]:
        top5_bfs[f] += 1
true_top5_bfs = {}
for f in functionals:
    true_top5_bfs[f] = 0
for _, row in _df_sel_top.iterrows():
    for f in row["true_best_fs"]:
        true_top5_bfs[f] += 1
        
# Add a small multiple of the true count so that DFAs with equal recommended counts
# are ordered by their true counts in the sort below.
# NOTE(review): this offset stays in the plotted "rec" values and in the normalisation sum.
for f in top5_bfs:
    top5_bfs[f] += true_top5_bfs[f]*0.01

# NOTE(review): Counter is imported but not used in this cell.
from collections import Counter
bfs = dict(sorted(top5_bfs.items(), key=lambda item: item[1], reverse=True))
# True counts for the first `cutoff` DFAs, in the recommended order.
use_bfs_true = {}
for f in list(bfs.keys())[:cutoff]:
    use_bfs_true[f] = true_top5_bfs[f]
# NOTE(review): counts are scaled by 600 / sum(bfs values); the origin of the factor 600
# is not visible here -- confirm the intended normalisation.
data = [go.Bar(x=list(bfs.keys())[:cutoff],
               y=np.array(list(bfs.values()))[:cutoff]*600./np.sum(list(bfs.values())), name='rec', marker_color='rgba(0, 0, 255, 0.5)'),
        go.Bar(x=list(use_bfs_true.keys()),
               y=np.array(list(use_bfs_true.values()))*600./np.sum(list(bfs.values())), name='true', marker_color='rgba(0, 255, 0, 0.5)'),
       ]
layout = go.Layout()
layout.update(glob_layout)
layout["xaxis"].update({'title': "DFA"})
layout["yaxis"].update({'title': "likelihood of top 5", })
layout.update(width=1000, height=500, boxmode='group')
fig = dict(data=data, layout=layout)
iplot(fig)
# Number of rows in the metal-specific subset (cell output).
len(_df_sel_top)

38

In [34]:
#---Calculate rank ordering of the recommender selected DFAs and the ground truth---
# `bfs` and `true_top5_bfs` are whatever the most recently run top-5 cell produced.
TL_ranked_DFAs = list(bfs.keys())
_true_sorted = sorted(true_top5_bfs.items(), key=lambda pair: pair[1], reverse=True)
bfs_true = dict(_true_sorted)
l1, l2 = [], []
for position, f in enumerate(bfs_true.keys()):
    l1.append(position)                 # position in the true ordering
    l2.append(TL_ranked_DFAs.index(f))  # position in the recommended ordering
spearmanr(l1, l2)

SpearmanrResult(correlation=0.903820816864295, pvalue=1.8735458097164872e-17)

### Analyze selected DFAs

Note: before running this section, first run the recommender with only the five DFAs!

In [38]:
def _hfx_percent(dfa_name):
    """Return the percentage encoded in a "<parent>_hfx_<percent>" DFA key.

    Keys without an "_hfx_" part get NaN instead of crashing int(), so this cell
    also runs when the recommender was run on the full DFA pool.
    """
    if "_hfx_" not in dfa_name:
        return np.nan
    return int(dfa_name.split("_")[-1])


# assign() returns a new frame, so re-running this cell does not mutate the table
# created by the recommender cell in place.
df_sel = df_sel.assign(
    hfx=[_hfx_percent(x) for x in df_sel["best_fs"].values]
).sort_values(by=['hfx'])

In [40]:
# Realized error of the recommended DFA against the DLPNO-CCSD(T) value, coloured by the
# recommended DFA; marker size follows "lower_bound" (the error of the truly best DFA)
# and the complex name is shown on hover.
fig = px.scatter(df_sel, x="dlpno.vertsse", y="errs_t", color="best_fs",
                 size='lower_bound', hover_data=['name'])
fig.layout.update(glob_layout)
fig.layout.xaxis.update(dict(range=[-105, -5]))
fig.layout.yaxis.update(dict(range=[0, 15]))
fig.layout.update(width=1000, height=500, boxmode='group')

iplot(fig)

In [41]:
# Histogram (with marginal rug) of the DLPNO-CCSD(T) values of the test complexes,
# split and coloured by the DFA recommended for each complex; one fixed colour per DFA.
fig = px.histogram(df_sel, x="dlpno.vertsse", color="best_fs", marginal="rug", nbins=10, text_auto=False, opacity=0.75,
                   color_discrete_map={"pbe_hfx_30": "rgba(0, 0, 255, 0.7)", 
                                       "scan_hfx_40": "rgba(255, 0, 0, 0.7)", 
                                       "m06-l_hfx_40": "rgba(0, 235, 64, 0.7)", 
                                       "mn15-l_hfx_50":  "rgba(255, 165, 0, 0.7)",
                                       "blyp_hfx_50": "rgba(140, 140, 140, 0.7)",})
fig.layout.update(glob_layout)
fig.layout.xaxis.update(dict(range=[-100, -10], tickvals=[-100, -90, -80, -70, -60, -50, -40, -30, -20, -10]))
fig.layout.yaxis.update(dict(range=[0, 40]))
fig.layout.update(width=900, height=500, boxmode='group')

iplot(fig)
# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/Figures/F3.5/hist.pdf")

In [42]:
# Attach the raw columns of the test rows of df_org to the selection table, matching on
# the complex name; the left join keeps every row of df_sel.
_is_test = df_org["train"] == 0
df_test = pd.merge(df_sel, df_org[_is_test], on="name", how="left")
len(df_test)

152

In [43]:
def _bin_center(value):
    """Map a value to the centre of its 10-wide bin.

    Upper bin edges are -100, -90, ..., -10; a value below an edge gets that
    edge minus 5 (so anything below -100 gets -105). Values that are below none
    of the edges (including NaN) get -5.
    """
    for upper_edge in range(-100, 0, 10):
        if value < upper_edge:
            return upper_edge - 5
    return -5


category = [_bin_center(row["dlpno.vertsse"]) for _, row in df_test.iterrows()]
df_test["category"] = category

In [46]:
def _nan_mean_std(values):
    """Mean and population std of the non-NaN entries of `values`.

    Returns (nan, nan) when nothing is left, which is what np.nanmean / np.nanstd
    give for such input, but without their "Mean of empty slice" and
    "Degrees of freedom <= 0" RuntimeWarnings.
    """
    values = np.asarray(values, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan, np.nan
    return np.mean(values), np.std(values)


# One record per (bin, DFA): error statistics of that DFA over all complexes of the bin,
# the same statistics over the complexes for which it was the recommended DFA, and how
# often it was recommended inside the bin.
data = list()
for cate in np.unique(df_test["category"]):
    _df = df_test[df_test["category"] == cate]
    for f in ["pbe_hfx_30", "scan_hfx_40", "m06-l_hfx_40", "mn15-l_hfx_50",  "blyp_hfx_50"]:
        delta_col = "delta.%s.vertsse" % f
        __df = _df[_df["best_fs"] == f]  # complexes of this bin where f was recommended
        abs_mae, abs_std = _nan_mean_std(np.abs(_df[delta_col].values))
        sel_abs_mae, sel_abs_std = _nan_mean_std(np.abs(__df[delta_col].values))
        d = dict(
            cate = cate,
            f = f,
            # mae/std keep np.mean/np.std, so they are NaN if any value in the bin is NaN.
            mae = np.mean(_df[delta_col].values),
            std = np.std(_df[delta_col].values) + 0.5,
            abs_mae = abs_mae,
            abs_std = abs_std + 0.5,
            sel_abs_mae = sel_abs_mae,
            sel_abs_std = sel_abs_std + 0.5,
            # Marker size: selection fraction raised to 1.4, plus a small offset so it is never zero.
            size = np.power(len(__df)/(len(_df)), 1.4) + 0.01,
            # Percentage of the bin's complexes for which f was recommended.
            bar = len(__df)/(len(_df)) * 100,
            const_size = 25,
        )
        data.append(d)
df_new = pd.DataFrame(data)
# df_new = df_new.dropna()
# df_new


Mean of empty slice


Degrees of freedom <= 0 for slice.



In [47]:
# Bars of the "bar" column (percentage of complexes per bin for which each DFA was
# recommended), coloured by DFA.
# NOTE(review): m06-l_hfx_40 uses green 225 here but 235 in the other figures of this
# section -- confirm which shade is intended.
fig = px.bar(df_new, x="cate", y="bar", color="f", opacity=0.75,
                   color_discrete_map={"pbe_hfx_30": "rgba(0, 0, 255, 0.7)", 
                                       "scan_hfx_40": "rgba(255, 0, 0, 0.7)", 
                                       "m06-l_hfx_40": "rgba(0, 225, 64, 0.7)", 
                                       "mn15-l_hfx_50":  "rgba(255, 165, 0, 0.7)",
                                       "blyp_hfx_50": "rgba(140, 140, 140, 0.7)",})
fig.layout.update(glob_layout)
fig.layout.xaxis.update(dict(range=[-100, -10], tickvals=[-100, -90, -80, -70, -60, -50, -40, -30, -20, -10]))
fig.layout.yaxis.update(dict(range=[0, 100]))
fig.layout.update(width=900, height=400, boxmode='group')

iplot(fig)
# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/Figures/F3.5/bar.pdf")

In [50]:
# Per-bin mean absolute deviation of each DFA ("abs_mae"), coloured by DFA; marker size
# follows the "size" column, which grows with how often the DFA was recommended in the bin.
fig = px.scatter(df_new, x="cate", y="abs_mae", color="f",
                 size='size', opacity=0.75, size_max=40,
                 color_discrete_map={"pbe_hfx_30": "rgba(0, 0, 255, 0.7)", 
                                       "scan_hfx_40": "rgba(255, 0, 0, 0.7)", 
                                       "m06-l_hfx_40": "rgba(0, 235, 64, 0.7)", 
                                       "mn15-l_hfx_50":  "rgba(255, 165, 0, 0.7)",
                                       "blyp_hfx_50": "rgba(140, 140, 140, 0.8)",})
fig.layout.update(glob_layout)
fig.layout.xaxis.update(dict(range=[-100, -10], tickvals=[-100, -90, -80, -70, -60, -50, -40, -30, -20, -10]))
fig.layout.yaxis.update(dict(range=[0, 15], tickvals=[0, 5, 10, 15]))
fig.layout.update(width=900, height=400, boxmode='group')
iplot(fig)
# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/Figures/F3.5/scatter.pdf")

In [51]:
# Same per-bin "abs_mae" scatter as the size-encoded version, but with a constant marker
# size (the "const_size" column) for every point.
fig = px.scatter(df_new, x="cate", y="abs_mae", color="f",
                 size="const_size", opacity=0.75, size_max=10,
                 color_discrete_map={"pbe_hfx_30": "rgba(0, 0, 255, 0.7)", 
                                       "scan_hfx_40": "rgba(255, 0, 0, 0.7)", 
                                       "m06-l_hfx_40": "rgba(0, 235, 64, 0.7)", 
                                       "mn15-l_hfx_50":  "rgba(255, 165, 0, 0.7)",
                                       "blyp_hfx_50": "rgba(140, 140, 140, 0.8)",})
fig.layout.update(glob_layout)
fig.layout.xaxis.update(dict(range=[-100, -10], tickvals=[-100, -90, -80, -70, -60, -50, -40, -30, -20, -10]))
fig.layout.yaxis.update(dict(range=[0, 15], tickvals=[0, 5, 10, 15]))
fig.layout.update(width=900, height=400, boxmode='group')
iplot(fig)
# fig.write_image("/Users/chenruduan/Dropbox (MIT)/CD-DFARecFull/Figures/F3.5/scatter_not_sizes.pdf")

In [53]:
# Running mean of each shortlisted DFA's "delta" column along the DLPNO-CCSD(T) axis.
df_cp = df_test.sort_values(by=["dlpno-CCSD_T.vertsse"])
funcs = ["mn15-l_hfx_50", "blyp_hfx_50", "m06-l_hfx_40", "scan_hfx_40", "pbe_hfx_30"]
keys = ["delta.%s.vertsse"%f for f in funcs]
colors = ["black", "blue", "orange", "green", "red", "gray"]
df_cp = df_cp.dropna(subset=keys)

# Number of neighbouring complexes (sorted by reference value) averaged per point;
# capped at the number of rows so the 'valid' convolution stays well defined.
window = max(1, min(10, len(df_cp)))
kernel = np.ones(window)

# A 'valid' running mean has len(df_cp) - window + 1 points, so x is averaged with the
# same window. Passing the raw x values would pair every smoothed y with the x of the
# first complex in its window and leave the last window - 1 x values unused.
x_smooth = np.convolve(df_cp["dlpno-CCSD_T.vertsse"].values, kernel, 'valid')/window

data = []
for ii, f in enumerate(funcs):
    data += [
        go.Scatter(
            x=x_smooth,
            y=np.convolve(df_cp["delta.%s.vertsse"%f].values, kernel, 'valid')/window,
            marker=dict(color=colors[ii]),
            name=f,
        ),
    ]
layout = go.Layout()
layout.update(glob_layout)
layout["xaxis"].update({'title': "DLPNO-CCSD(T) vertSSE"})
layout["yaxis"].update({'title': "Delta vertSSE", })
# layout.legend.update(x=0, y=0, bgcolor="rgba(0,0,0,0)")
layout.update(width=1000, height=500)
fig = dict(data=data, layout=layout)
iplot(fig)
len(df_cp)

152