In [1]:
import pandas as pd
from evaluation.generated_dataset import GeneratedDataset, load_all_from_config

In [5]:
# Load all datasets via load_all_from_config(). The cells below use the result
# as a mapping keyed by tuples such as ('split', 'train').
all_datasets = load_all_from_config()

In [6]:
# Compute fingerprints on every loaded dataset. The mapping key is not needed
# here, so iterate over the values only.
for dataset in all_datasets.values():
    dataset.compute_wyckoff_fingerprints()

In [7]:
# Fingerprints of the train and validation splits form the reference that
# novelty is measured against. verify_integrity=True makes pd.concat raise if
# the two splits share any index label.
reference_splits = ('train', 'val')
novelty_reference = pd.concat(
    [all_datasets[('split', split)].data.fingerprint for split in reference_splits],
    verify_integrity=True,
)
# Frozen set for fast, immutable membership tests in the cells below.
novelty_reference_set = frozenset(novelty_reference)

In [8]:
# Boolean mask over the test split: True where the fingerprint is already
# present in the train/val reference set.
test_fingerprints = all_datasets[('split', 'test')].data.fingerprint
test_not_novel = test_fingerprints.isin(novelty_reference_set)

In [9]:
# First row of the test split, used as a spot-check sample for the
# fingerprint lookup in the next cell.
s = all_datasets[('split', 'test')].data.iloc[0]

In [10]:
# Training-split rows whose fingerprint equals that of the sample `s`.
train_data = all_datasets[('split', 'train')].data
matches_sample = train_data.fingerprint == s.fingerprint
cands = train_data[matches_sample]

In [11]:
# Load corrected CHGNet e_hull values for this dataset from the given CSV.
# NOTE(review): presumably this adds the `corrected_chgnet_ehull` column that
# the metrics cell checks for — confirm against GeneratedDataset.
all_datasets[('WyckoffTransformer', 'CHGNet_fix')].load_corrected_chgnet_ehull(
    "generated/WyckoffTransformer_mp_20.ehull.csv.gz")

In [12]:
def _novelty_metrics(data, reference_fingerprints, max_samples):
    """Compute uniqueness / novelty / stability fractions for one dataset.

    If ``data`` has a ``corrected_chgnet_ehull`` column, only rows with a
    non-null value in it are considered. The first ``max_samples`` rows of what
    remains form the sample, and every returned fraction uses the sample size
    as its denominator.

    Args:
        data: DataFrame with a ``fingerprint`` column and, optionally, a
            ``corrected_chgnet_ehull`` column.
        reference_fingerprints: collection of fingerprints; a structure counts
            as novel when its fingerprint is not in it.
        max_samples: maximum number of rows to evaluate.

    Returns:
        dict mapping result-column name to value. Contains "Unique" and
        "& Novel", plus "S.U.N. 0" and "S.U.N. 0.08" when e_hull values are
        available. Empty when the sample has no rows, so the caller leaves the
        corresponding cells as NaN instead of dividing by zero.
    """
    has_e_hull = "corrected_chgnet_ehull" in data.columns
    if has_e_hull:
        data = data.loc[data.corrected_chgnet_ehull.notnull()]
    sample = data[:max_samples]
    n_sample = len(sample)
    if n_sample == 0:
        return {}

    unique = sample.drop_duplicates(subset="fingerprint")
    novel = unique.loc[~unique.fingerprint.isin(reference_fingerprints)]
    metrics = {
        "Unique": len(unique) / n_sample,
        "& Novel": len(novel) / n_sample,
    }
    if has_e_hull:
        # Counted among unique *and* novel structures only.
        metrics["S.U.N. 0"] = (novel.corrected_chgnet_ehull <= 0).sum() / n_sample
        metrics["S.U.N. 0.08"] = (novel.corrected_chgnet_ehull < 0.08).sum() / n_sample
    return metrics


# One row per dataset key. tupleize_cols=False keeps the tuple keys as plain
# labels rather than turning them into a MultiIndex.
results = pd.DataFrame(index=pd.Index(all_datasets.keys(), tupleize_cols=False),
    columns=["Unique", "& Novel", "S.U.N. 0", "S.U.N. 0.08"])
sample_size = 995
for transformations, dataset in all_datasets.items():
    # Datasets without fingerprints cannot be evaluated; their row stays NaN.
    if "fingerprint" not in dataset.data.columns:
        continue
    metrics = _novelty_metrics(dataset.data, novelty_reference_set, sample_size)
    for column, value in metrics.items():
        # The key is wrapped in a list so .loc treats the tuple as a single
        # row label rather than as a (row, column) indexer.
        results.loc[[transformations], column] = value
results

Unnamed: 0,Unique,& Novel,S.U.N. 0,S.U.N. 0.08
"(WyckoffTransformer,)",0.99799,0.884422,,
"(WyckoffTransformer, CHGNet_fix)",0.99799,0.878392,0.127638,0.364824
"(WyckoffTransformer, CHGNet_free)",0.990955,0.865327,,
"(WyckoffTransformer, CHGNet_fix_release)",0.99799,0.878392,,
"(WyckoffTransformer, DiffCSP++)",0.99799,0.877387,,
"(WyckoffTransformer, DiffCSP++, CHGNet_free)",0.99799,0.876382,0.125628,0.364824
"(CrystalFormer,)",0.99598,0.739698,,
"(CrystalFormer, CHGNet_fix_release)",0.995968,0.731855,0.193548,0.362903
"(DiffCSP,)",0.977889,0.853266,,
"(DiffCSP, CHGNet_fix)",0.975879,0.849246,0.177889,0.547739
