In [1]:
import sys
import pandas as pd
from tqdm.auto import tqdm
sys.path.append("..")
from evaluation.generated_dataset import load_all_from_config
from evaluation.statistical_evaluator import StatisticalEvaluator

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


In [2]:
# Models to evaluate.
# Key:   label used for the model's row in the results table.
# Value: tuple of names identifying that model's samples; it is passed to
#        load_all_from_config and then used to index the returned collection.
# NOTE(review): in two-element tuples the second name looks like a
# post-processing step applied to the first - confirm against the config.
# Single-name entries keep the trailing comma so that they remain tuples.
datasets = {
    "WyFormer": ("WyckoffTransformer", "DiffCSP++"),
    "FlowMM": ("FlowMM",),
    "MiAD": ("MiAD", "CHGNet_free"),
    "DiffCSP++": ("DiffCSP++",),
    "DiffCSP": ("DiffCSP",),
    "SymmCD": ("SymmCD",),
}

In [3]:
# Load every model's samples plus the three reference splits in a single call.
# The result is indexed below by the same tuples that are requested here.
all_datasets = load_all_from_config(
    datasets=[
        *datasets.values(),
        ("split", "train"),
        ("split", "val"),
        ("split", "test"),
    ],
    dataset_name="mp_20",
)

In [4]:
# Reference evaluator built from the test split's data; every model in the
# summary table is scored against it.
test_evaluator = StatisticalEvaluator(
    all_datasets[("split", "test")].data
)

In [5]:
# Per-model summary statistics, each computed against the test-split evaluator.
#
# The rows are collected first and the frame is built once at the end. The
# previous version pre-allocated an empty frame (all columns `object` dtype)
# and filled it cell by cell, which kept the metrics as Python objects and
# repeated every column label twice. Building from records lets pandas infer
# numeric dtypes and keeps each label in exactly one place.
metric_columns = ["P1 (%)", "Space Group chi^2", "N atoms EMD"]

metric_rows = []
for name, transformations in tqdm(datasets.items()):
    dataset = all_datasets[transformations].data
    metric_rows.append({
        # Share of samples whose `group` equals 1 (space group P1), in percent.
        "P1 (%)": 100 * (dataset.group == 1).mean(),
        "Space Group chi^2": test_evaluator.get_sg_chi2(dataset),
        "N atoms EMD": test_evaluator.get_num_atoms_emd(dataset),
    })

# `columns=` pins the column order; the index follows the insertion order of
# `datasets`, which is also the order the rows were appended in.
table = pd.DataFrame(metric_rows, index=list(datasets), columns=metric_columns)

  0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
# Show the per-model summary (P1 share, space-group chi^2, atom-count EMD)
# as this cell's rich output.
table

Unnamed: 0,P1 (%),Space Group chi^2,N atoms EMD
WyFormer,1.4,0.163555,13.934188
FlowMM,38.09762,11.132666,8.599533
MiAD,17.386955,3.368287,11.970726
DiffCSP++,1.64,0.041517,14.175788
DiffCSP,28.35,5.463993,10.132488
SymmCD,1.973615,0.046386,13.490067
