In [1]:
import sys
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
sys.path.append("..")
from evaluation.generated_dataset import load_all_from_config
from evaluation.statistical_evaluator import StatisticalEvaluator
from evaluation.novelty import NoveltyFilter

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


In [2]:
# Table row label -> tuple handed to load_all_from_config; the same tuple is
# later used as the key into the loaded collection.
datasets = {
    # Disabled because loading this entry fails:
    # "WyFormer": ("WyckoffTransformer", "DiffCSP++10k", "CHGNet_free"),
    "FlowMM": ("FlowMM", "CHGNet_fix"),
    "MiAD": ("MiAD", "CHGNet_free"),
    "DiffCSP++": ("DiffCSP++", "CHGNet_fix_release"),
    "DiffCSP": ("DiffCSP", "CHGNet_fix"),
    "SymmCD": ("SymmCD", "CHGNet_fix"),

    # MP-20 splits.
    "MP-20-train": ("split", "train"),
    "MP-20-test": ("split", "test"),
    "MP-20-val": ("split", "val"),
}

In [3]:
# Load every dataset listed above in a single call; the result is indexed
# below by the same tuples that are passed in here.
all_datasets = load_all_from_config(
    datasets=datasets.values(),
    dataset_name="mp_20",
)

In [4]:
# All statistical metrics below are computed against the test split.
test_evaluator = StatisticalEvaluator(
    all_datasets[("split", "test")].data
)

In [5]:
# One row per dataset label, one column per metric; cells start out as NaN
# and are filled in as each dataset is evaluated.
metric_columns = ["P1 (%)", "Space Group chi^2", "N atoms EMD"]
table = pd.DataFrame(index=datasets.keys(), columns=metric_columns)

for label, config_key in tqdm(datasets.items()):
    structures = all_datasets[config_key].data
    row_metrics = {
        # Percentage of entries whose `group` equals 1.
        "P1 (%)": 100 * (structures.group == 1).mean(),
        "Space Group chi^2": test_evaluator.get_sg_chi2(structures),
        "N atoms EMD": test_evaluator.get_num_atoms_emd(structures),
    }
    for column, value in row_metrics.items():
        table.loc[label, column] = value

  0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# Show the metrics computed over every structure of each dataset.
table

Unnamed: 0,P1 (%),Space Group chi^2,N atoms EMD
FlowMM,45.035105,11.47218,0.95952
MiAD,17.386955,3.368287,1.461373
DiffCSP++,2.3,0.198108,8.936747
DiffCSP,32.8,6.327654,0.20432
SymmCD,1.962422,0.053979,8.288385
MP-20-train,1.732017,0.025769,0.125059
MP-20-test,1.812956,0.0,0.0
MP-20-val,1.624848,0.03528,0.076691


In [7]:
# Novelty is judged against train and val stacked together.
# verify_integrity=True makes concat raise if the two splits share an index
# label.
reference_frames = [
    all_datasets[("split", split_name)].data
    for split_name in ("train", "val")
]
novelty_reference = pd.concat(reference_frames, axis=0, verify_integrity=True)
novelty_filter = NoveltyFilter(novelty_reference)

In [8]:
# Same three metrics as the previous table, restricted to the structures that
# the novelty filter marks as novel. Each annotated dataset is also exported
# as a compressed pickle.
table = pd.DataFrame(
    index=datasets.keys(), columns=[
        "P1 (%)",
        "Space Group chi^2",
        "N atoms EMD"])

# DataFrame.to_pickle does not create missing parent directories. Create the
# export target up front so a fresh checkout does not fail at the end of the
# first iteration.
export_dir = Path("data_export")
export_dir.mkdir(parents=True, exist_ok=True)

for name, transformations in tqdm(datasets.items()):
    dataset = all_datasets[transformations].data
    # NOTE: these two columns are written onto the frame held by all_datasets,
    # so they stay visible to any later cell that reads it.
    dataset["is_novel"] = dataset.apply(novelty_filter.is_novel, axis=1)
    dataset["n_atoms"] = dataset.structure.map(len)
    novel = dataset[dataset["is_novel"]]
    # With no novel structures none of the metrics is defined, so the row is
    # left as NaN instead of evaluating statistics on an empty frame.
    if len(novel) > 0:
        table.loc[name, "P1 (%)"] = 100 * (novel.group == 1).mean()
        table.loc[name, "Space Group chi^2"] = test_evaluator.get_sg_chi2(novel)
        table.loc[name, "N atoms EMD"] = test_evaluator.get_num_atoms_emd(novel)
    # The full annotated dataset is exported, not only the novel rows.
    # NOTE(review): the file name has no separator between the label and
    # "chgnet" (e.g. "FlowMMchgnet.pkl.gz"); kept as-is in case existing
    # consumers rely on it. Confirm before renaming.
    dataset.drop(columns="cdvae_crystal").to_pickle(export_dir / (name + "chgnet.pkl.gz"))

  0%|          | 0/8 [00:00<?, ?it/s]

In [9]:
# Show the metrics restricted to novel structures; rows left as NaN had no
# novel structures.
table

Unnamed: 0,P1 (%),Space Group chi^2,N atoms EMD
FlowMM,50.111607,14.337726,1.435006
MiAD,20.139697,4.568208,1.675489
DiffCSP++,2.566964,0.254336,9.141479
DiffCSP,36.525612,7.927858,0.697292
SymmCD,2.206573,0.079547,8.22792
MP-20-train,,,
MP-20-test,1.802699,1e-05,0.002511
MP-20-val,,,
