In [1]:
# Export plan: dataset name -> transformation chains to export.
# Every chain is a tuple of step names. In the export loop it is passed as
# `transformations` to GeneratedDataset and is also used, step by step, as the
# sub-directory path of the exported file.
export_config = {
    "mp_20": (
        # Raw generator output.
        ("WyckoffTransformer",),
        # DiffCSP++10k branch, with and without CHGNet_free / DFT steps.
        ("WyckoffTransformer", "DiffCSP++10k"),
        ("WyckoffTransformer", "DiffCSP++10k", "CHGNet_free", "DFT"),
        ("WyckoffTransformer", "DiffCSP++10k", "CHGNet_free", "DFT-GGA-relax-1"),
        # CrySPR branch.
        ("WyckoffTransformer", "CrySPR", "CHGNet_fix"),
        ("WyckoffTransformer", "CrySPR", "CHGNet_fix", "DFT"),
        # DiffCSP++ branch.
        ("WyckoffTransformer", "DiffCSP++"),
        ("WyckoffTransformer", "DiffCSP++", "DFT"),
    ),
    "mpts_52": (
        ("WyckoffTransformer",),
        ("WyckoffTransformer", "CrySPR", "CHGNet_fix"),
    ),
}

In [2]:
import sys
# Put the directory two levels above the working directory on the module
# search path so the project-local import below resolves.
# NOTE(review): presumably "../.." is the repository root — confirm; the path
# is relative, so it depends on where the kernel was started.
sys.path.append("../..")
from evaluation.generated_dataset import GeneratedDataset

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


In [3]:
from pathlib import Path
# Root directory for all exported files; relative, so it is resolved against
# the kernel's current working directory.
# NOTE(review): "WyFomer" may be a typo for "WyFormer" — confirm before
# renaming, because changing it moves the export location.
export_path = Path("WyFomer generated datasets")
from monty.json import MontyEncoder
# Single encoder instance shared by every to_json call.
encoder = MontyEncoder()
def to_json(obj):
    """Turn one table cell into the text that is written to the CSV export.

    Args:
        obj: The cell value.

    Returns:
        ``obj`` itself when it is already a string; otherwise the result of
        ``encoder.encode`` applied to the value, with a ``frozenset`` first
        converted to a ``tuple``.
    """
    # Strings are passed through untouched, so they are not wrapped in an
    # extra layer of JSON quoting.
    if isinstance(obj, str):
        return obj
    # NOTE(review): the frozenset -> tuple conversion presumably exists because
    # the encoder cannot serialise sets directly — confirm.
    payload = tuple(obj) if isinstance(obj, frozenset) else obj
    return encoder.encode(payload)
# Create the export root (and any missing parents) up front; an already
# existing directory is not an error.
export_path.mkdir(parents=True, exist_ok=True)

In [4]:
from tqdm.auto import tqdm
from scripts.cache_generated_datasets import compute_fields_and_cache
# Columns that are never written to the exported CSV files. Columns from this
# list that a particular dataset does not have are silently skipped.
EXPORT_EXCLUDED_COLUMNS = [
    "cdvae_crystal", "fingerprint", "composition", "naive_validity",
    "spacegroup_number", "density"]
# Renames applied when the name of the last transformation step contains "CHGNet".
CHGNET_COLUMN_RENAMES = {
    "energy_per_atom": "chgnet_energy_per_atom",
    "corrected_chgnet_ehull": "chgnet_e_above_hull_corrected",
}
# Renames applied when the name of the last transformation step contains "DFT".
DFT_COLUMN_RENAMES = {
    "e_above_hull_corrected": "dft_e_above_hull_corrected",
    "e_uncorrected": "dft_e_uncorrected",
    "e_corrected": "dft_e_corrected",
}

# Export every configured (dataset, transformation chain) pair to
# <export_path>/<dataset>/<step 1>/.../<step n>/data.csv.gz
for dataset, transformation_tuples in tqdm(export_config.items(), desc="datasets"):
    for these_transformations in tqdm(transformation_tuples, desc=dataset):
        dataset_path = export_path.joinpath(
            dataset, *these_transformations, "data.csv.gz")
        dataset_path.parent.mkdir(parents=True, exist_ok=True)

        # Prefer the cached, already processed dataset; rebuild it from the raw
        # transformation outputs only when no cache file exists.
        try:
            dataset_processed = GeneratedDataset.from_cache(
                transformations=these_transformations,
                dataset=dataset)
        except FileNotFoundError:
            dataset_raw = GeneratedDataset.from_transformations(
                transformations=these_transformations,
                dataset=dataset)
            dataset_processed = compute_fields_and_cache(dataset_raw)

        # The last step of the chain decides which columns get a
        # method-specific prefix; "CHGNet" takes precedence over "DFT".
        last_step = these_transformations[-1]
        if "CHGNet" in last_step:
            column_renames = CHGNET_COLUMN_RENAMES
        elif "DFT" in last_step:
            column_renames = DFT_COLUMN_RENAMES
        else:
            column_renames = {}

        # rename/drop return new frames, so the dataset object loaded above is
        # left unmodified. Renaming happens before dropping, as the exclusion
        # list refers to the final column names.
        export_frame = (
            dataset_processed.data
            .rename(columns=column_renames)
            .drop(columns=EXPORT_EXCLUDED_COLUMNS, errors="ignore")
        )
        export_frame.map(to_json).to_csv(dataset_path, index_label="material_id")
        print(f"Exported {dataset_path}")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

Exported WyFomer generated datasets/mp_20/WyckoffTransformer/data.csv.gz
Exported WyFomer generated datasets/mp_20/WyckoffTransformer/DiffCSP++10k/data.csv.gz
Exported WyFomer generated datasets/mp_20/WyckoffTransformer/DiffCSP++10k/CHGNet_free/DFT/data.csv.gz
Exported WyFomer generated datasets/mp_20/WyckoffTransformer/DiffCSP++10k/CHGNet_free/DFT-GGA-relax-1/data.csv.gz
Exported WyFomer generated datasets/mp_20/WyckoffTransformer/CrySPR/CHGNet_fix/data.csv.gz
Exported WyFomer generated datasets/mp_20/WyckoffTransformer/CrySPR/CHGNet_fix/DFT/data.csv.gz
Exported WyFomer generated datasets/mp_20/WyckoffTransformer/DiffCSP++/data.csv.gz
Exported WyFomer generated datasets/mp_20/WyckoffTransformer/DiffCSP++/DFT/data.csv.gz


  0%|          | 0/2 [00:00<?, ?it/s]

Exported WyFomer generated datasets/mpts_52/WyckoffTransformer/data.csv.gz
Exported WyFomer generated datasets/mpts_52/WyckoffTransformer/CrySPR/CHGNet_fix/data.csv.gz
