In [1]:
from ocpmodels.datasets import LmdbDataset as LD
import matplotlib.pyplot as plt
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Lookup tables stored as pickles next to this notebook.
# NOTE: pickle.load can execute arbitrary code -- only point these at trusted files.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)

# sid (a random number, keyed as "random<sid>") -> OC20 system (adslab) metadata
mappings = _read_pickle("oc20_data_mapping.pkl")
# OC20 reference energies
ref_energies = _read_pickle("oc20_ref.pkl")
# adslab sid -> slab sid
adslab_slab_mappings = _read_pickle("mapping_adslab_slab.pkl")

In [4]:
# IS2RE validation splits with their DFT energies; the first CSV column is used
# as the dataframe index for every split.
df_val_id, df_ood_ads, df_ood_cat, df_ood_both = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "is2re_val_sets/is2re_val_id.csv",
        "is2re_val_sets/is2re_val_ood_ads.csv",
        "is2re_val_sets/is2re_val_ood_cat.csv",
        "is2re_val_sets/is2re_val_ood_both.csv",
    )
)

In [5]:
# Surface-anomaly tables, one per validation split. Each is read with its first
# CSV column as the index; the cells below use their "sid" and
# "surface_anomalies" columns.
df_val_id_anom, df_ood_ads_anom, df_ood_cat_anom, df_ood_both_anom = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "is2re_val_sets_anom/is2re_val_id_surf_anom.csv",
        "is2re_val_sets_anom/is2re_val_ood_ads_surf_anom.csv",
        "is2re_val_sets_anom/is2re_val_ood_cat_surf_anom.csv",
        "is2re_val_sets_anom/is2re_val_ood_both_surf_anom.csv",
    )
)

In [6]:
# Copy the surface-anomaly value of every system into its validation split,
# in the row order of the split. The anomaly tables identify systems as
# "random<sid>", whereas the validation tables hold the bare sid.
for df, df_anom in [(df_val_id, df_val_id_anom), (df_ood_ads, df_ood_ads_anom),
                    (df_ood_cat, df_ood_cat_anom), (df_ood_both, df_ood_both_anom)]:
    # Build the sid -> anomaly lookup once per split instead of scanning the
    # whole anomaly table for every row (quadratic in the split size).
    # setdefault keeps the first entry when a sid is listed more than once,
    # which is what a first-match scan would return.
    anomaly_by_sid = {}
    for anom_sid, anomaly in zip(df_anom["sid"].values,
                                 df_anom["surface_anomalies"].values):
        anomaly_by_sid.setdefault(anom_sid, anomaly)

    # A sid that is absent from the anomaly table raises KeyError here, so a
    # mismatch between the two tables is never silently skipped.
    sorted_surf_anom = [anomaly_by_sid["random" + str(sid)]
                        for sid in tqdm(df["sid"].values)]
    # Plain list assignment is positional, so it does not depend on df's index.
    df["surface_anomalies"] = sorted_surf_anom

100%|██████████| 24943/24943 [00:12<00:00, 2063.72it/s]
100%|██████████| 24961/24961 [00:12<00:00, 2043.86it/s]
100%|██████████| 24963/24963 [00:12<00:00, 2034.06it/s]
100%|██████████| 24987/24987 [00:12<00:00, 2010.58it/s]


In [7]:
# Tag every validation system with the "class" entry of its OC20 mapping.
# `mappings` is keyed by "random<sid>".
for df in [df_val_id, df_ood_cat, df_ood_ads, df_ood_both]:
    material_classes = np.array(
        [mappings["random" + str(raw_sid)]["class"] for raw_sid in tqdm(df["sid"].values)]
    )
    # add system information to the dataframe
    df["cat_class"] = material_classes

100%|██████████| 24943/24943 [00:00<00:00, 341754.35it/s]
100%|██████████| 24963/24963 [00:00<00:00, 369018.69it/s]
100%|██████████| 24961/24961 [00:00<00:00, 376628.36it/s]
100%|██████████| 24987/24987 [00:00<00:00, 382836.69it/s]


In [8]:
# Val-ID split: load the predictions archive of the run named in the path.
# The path is absolute and machine-specific; np.load raises FileNotFoundError
# when the file is not present on the current machine.
pred_path = (
    "/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/ocp/results/2024-04-18-02-54-56-is2re-goc-id-oc20-only/s2ef_predictions.npz"
)
val_predictions = np.load(pred_path)

FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/ocp/results/2024-04-18-02-54-56-is2re-goc-id-oc20-only/s2ef_predictions.npz'

In [None]:
# Collect the predicted energy of every Val-ID system, in dataframe row order.
#
# Read both arrays once up front: indexing the object returned by np.load for
# an .npz archive loads the member again on every access, so doing it inside
# the loop repeats that work for each row.
pred_ids = val_predictions["ids"]  # assumes a 1-D array of string ids -- TODO confirm
pred_energies = val_predictions["energy"]

# id -> position of its first occurrence, replacing a full np.where scan per row.
first_position = {}
for position, pred_id in enumerate(pred_ids.tolist()):
    first_position.setdefault(pred_id, position)

# Iterate over the column values (positional) rather than df["sid"][i]: the
# frame was read with index_col=0, so [i] is a label lookup and only matches
# row order when the index happens to be 0..n-1.
# A sid without a prediction raises KeyError.
val_id_preds_goc = [
    pred_energies[first_position[str(sid)]][0]
    for sid in tqdm(df_val_id["sid"].values)
]

In [None]:
# Store the predictions next to the DFT adsorption energies and report the
# mean absolute error for the Val-ID split.
df_val_id["direct_ads_energy_pred"] = val_id_preds_goc
val_id_ads_mae = (
    df_val_id["direct_ads_energy_pred"]
    .sub(df_val_id["dft_adsorption_energy"])
    .abs()
    .mean()
)
val_id_ads_mae

In [None]:
# Val-OOD-Ads split: load the predictions archive of the run named in the path.
# The path is absolute and machine-specific; np.load raises FileNotFoundError
# when the file is not present on the current machine.
pred_path = (
    "/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/ocp/results/2024-04-18-03-09-52-is2re-goc-ood-ads-oc20-only/s2ef_predictions.npz"
)
val_predictions = np.load(pred_path)

In [None]:
# Collect the predicted energy of every Val-OOD-Ads system, in dataframe row order.
#
# Read both arrays once up front: indexing the object returned by np.load for
# an .npz archive loads the member again on every access, so doing it inside
# the loop repeats that work for each row.
pred_ids = val_predictions["ids"]  # assumes a 1-D array of string ids -- TODO confirm
pred_energies = val_predictions["energy"]

# id -> position of its first occurrence, replacing a full np.where scan per row.
first_position = {}
for position, pred_id in enumerate(pred_ids.tolist()):
    first_position.setdefault(pred_id, position)

# Iterate over the column values (positional) rather than df["sid"][i]: the
# frame was read with index_col=0, so [i] is a label lookup and only matches
# row order when the index happens to be 0..n-1.
# A sid without a prediction raises KeyError.
val_ood_ads_preds_goc = [
    pred_energies[first_position[str(sid)]][0]
    for sid in tqdm(df_ood_ads["sid"].values)
]

In [None]:
# Store the predictions next to the DFT adsorption energies and report the
# mean absolute error for the Val-OOD-Ads split.
df_ood_ads["direct_ads_energy_pred"] = val_ood_ads_preds_goc
val_ood_ads_ads_mae = (
    df_ood_ads["direct_ads_energy_pred"]
    .sub(df_ood_ads["dft_adsorption_energy"])
    .abs()
    .mean()
)
val_ood_ads_ads_mae

In [None]:
# Val-OOD-Cat split: load the predictions archive of the run named in the path.
# The path is absolute and machine-specific; np.load raises FileNotFoundError
# when the file is not present on the current machine.
pred_path = (
    "/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/ocp/results/2024-04-18-03-39-44-is2re-goc-ood-cat-oc20-only/s2ef_predictions.npz"
)
val_predictions = np.load(pred_path)

In [None]:
# Collect the predicted energy of every Val-OOD-Cat system, in dataframe row order.
#
# Read both arrays once up front: indexing the object returned by np.load for
# an .npz archive loads the member again on every access, so doing it inside
# the loop repeats that work for each row.
pred_ids = val_predictions["ids"]  # assumes a 1-D array of string ids -- TODO confirm
pred_energies = val_predictions["energy"]

# id -> position of its first occurrence, replacing a full np.where scan per row.
first_position = {}
for position, pred_id in enumerate(pred_ids.tolist()):
    first_position.setdefault(pred_id, position)

# Iterate over the column values (positional) rather than df["sid"][i]: the
# frame was read with index_col=0, so [i] is a label lookup and only matches
# row order when the index happens to be 0..n-1.
# A sid without a prediction raises KeyError.
val_ood_cat_preds_goc = [
    pred_energies[first_position[str(sid)]][0]
    for sid in tqdm(df_ood_cat["sid"].values)
]

In [None]:
# Store the predictions next to the DFT adsorption energies and report the
# mean absolute error for the Val-OOD-Cat split.
df_ood_cat["direct_ads_energy_pred"] = val_ood_cat_preds_goc
val_ood_cat_ads_mae = (
    df_ood_cat["direct_ads_energy_pred"]
    .sub(df_ood_cat["dft_adsorption_energy"])
    .abs()
    .mean()
)
val_ood_cat_ads_mae

In [None]:
# Val-OOD-Both split: load the predictions archive of the run named in the path.
# The path is absolute and machine-specific; np.load raises FileNotFoundError
# when the file is not present on the current machine.
pred_path = (
    "/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/ocp/results/2024-04-18-03-54-40-is2re-goc-ood-both-oc20-only/s2ef_predictions.npz"
)
val_predictions = np.load(pred_path)

In [None]:
# Collect the predicted energy of every Val-OOD-Both system, in dataframe row order.
#
# Read both arrays once up front: indexing the object returned by np.load for
# an .npz archive loads the member again on every access, so doing it inside
# the loop repeats that work for each row.
pred_ids = val_predictions["ids"]  # assumes a 1-D array of string ids -- TODO confirm
pred_energies = val_predictions["energy"]

# id -> position of its first occurrence, replacing a full np.where scan per row.
first_position = {}
for position, pred_id in enumerate(pred_ids.tolist()):
    first_position.setdefault(pred_id, position)

# Iterate over the column values (positional) rather than df["sid"][i]: the
# frame was read with index_col=0, so [i] is a label lookup and only matches
# row order when the index happens to be 0..n-1.
# A sid without a prediction raises KeyError.
val_ood_both_preds_goc = [
    pred_energies[first_position[str(sid)]][0]
    for sid in tqdm(df_ood_both["sid"].values)
]

In [None]:
# Store the predictions next to the DFT adsorption energies and report the
# mean absolute error for the Val-OOD-Both split.
df_ood_both["direct_ads_energy_pred"] = val_ood_both_preds_goc
val_ood_both_ads_mae = (
    df_ood_both["direct_ads_energy_pred"]
    .sub(df_ood_both["dft_adsorption_energy"])
    .abs()
    .mean()
)
val_ood_both_ads_mae

In [None]:
# Display labels of the four validation splits, and the per-split MAEs listed
# in the same order so the two lists can be paired position by position.
names = [
    "Val-ID",
    "Val-OOD-Ads",
    "Val-OOD-Cat",
    "Val-OOD-Both",
]
val_ads_maes = [
    val_id_ads_mae,
    val_ood_ads_ads_mae,
    val_ood_cat_ads_mae,
    val_ood_both_ads_mae,
]


In [None]:
# One-row summary table: one column per validation split holding its MAE,
# with the single row labelled "original".
data = {split_name: [split_mae] for split_name, split_mae in zip(names, val_ads_maes)}

df = pd.DataFrame(data, index=["original"])
df