In [1]:
# Transformation pipelines to export: export group -> dataset -> pipelines.
# Every pipeline is a tuple of stage names; longer pipelines are built from
# the shared prefixes below so that their lineage is explicit.
_wyformer = ("WyckoffTransformer",)
_wyformer_diffcsp = _wyformer + ("DiffCSP++",)
_wyformer_diffcsp_10k = _wyformer + ("DiffCSP++10k",)
_wyformer_diffcsp_10k_chgnet = _wyformer_diffcsp_10k + ("CHGNet_free",)
_wyformer_cryspr_chgnet = _wyformer + ("CrySPR", "CHGNet_fix")

export_configs = {
    "WyFomer generated datasets": {
        "mp_20": (
            _wyformer,
            _wyformer_diffcsp_10k,
            _wyformer_diffcsp_10k_chgnet + ("DFT",),
            _wyformer_diffcsp_10k_chgnet + ("DFT-GGA-relax-1",),
            _wyformer_cryspr_chgnet,
            _wyformer_cryspr_chgnet + ("DFT",),
            _wyformer_diffcsp,
            _wyformer_diffcsp + ("DFT",),
        ),
        "mpts_52": (
            _wyformer,
            _wyformer_cryspr_chgnet,
        ),
    },
}

In [2]:
import sys
sys.path.append("../..")
from evaluation.generated_dataset import GeneratedDataset, DATA_KEYS

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


In [3]:
from pathlib import Path
from omegaconf import OmegaConf
# Load the dataset registry; it is iterated below as
# (dataset name, nested dataset config) pairs.
all_data = OmegaConf.load("../../generated/datasets.yaml")
# Drop the "FlowMM" entry of mp_20 so that it is not part of the export.
del all_data['mp_20']["FlowMM"]
# These datasets are not used and might contain errors, so they are removed
# from the export entirely.
del all_data["carbon_24"]
del all_data["perov_5"]
from collections import defaultdict
# dataset name -> set of transformation tuples to export; populated by
# flatten_config.
export_configs["generated_public"] = defaultdict(set)
def flatten_config(dataset, config, prefix=()):
    """Register the export targets found in ``config`` and rewrite it in place.

    The nested mapping is walked recursively. Keys listed in ``DATA_KEYS``
    mark a data entry; every other key is treated as one more transformation
    level and is appended to ``prefix`` for the recursive call.

    For each data entry:

    * ``tuple(prefix)`` is added to
      ``export_configs["generated_public"][dataset]``;
    * if the entry has a ``"path"``, it is redirected to ``data.csv.gz`` in
      the same directory and ``"storage_type"`` is set to ``"monty"``;
    * the ``"cache_key"`` and ``"storage_key"`` fields are removed.

    When a level holds both ``"structures"`` and ``"wyckoffs"``, the
    ``"wyckoffs"`` entry is deleted so the same data is not exported twice.

    Args:
        dataset: Name of the dataset the configuration belongs to.
        config: Mutable nested mapping to process; it is modified in place.
        prefix: Transformation names leading to ``config``. Any sequence is
            accepted; the default is an immutable empty tuple, so no mutable
            object is shared between calls.

    Relies on the module-level names ``DATA_KEYS`` and ``export_configs``.
    """
    for key, value in config.items():
        if key not in DATA_KEYS:
            # Not a data entry: descend one transformation level deeper.
            flatten_config(dataset, value, (*prefix, key))
            continue
        export_configs["generated_public"][dataset].add(tuple(prefix))
        if "path" in value:
            value["path"] = str(Path(value["path"]).parent / "data.csv.gz")
            value["storage_type"] = "monty"
        for dropped_key in ("cache_key", "storage_key"):
            if dropped_key in value:
                del value[dropped_key]
    # Deleting after the loop avoids mutating the mapping while iterating it.
    if "structures" in config and "wyckoffs" in config:
        # No need to export two times
        del config["wyckoffs"]
    
# Process every remaining dataset: flatten_config records its export tuples in
# export_configs["generated_public"] and rewrites the config entries in place.
for dataset_name, dataset_config in all_data.items():
    flatten_config(dataset_name, dataset_config)

In [4]:
from pathlib import Path
from monty.json import MontyEncoder
# Module-level MontyEncoder instance; to_json uses it to encode non-string values.
encoder = MontyEncoder()
def to_json(obj):
    """Return the text form of ``obj`` used for one exported cell.

    Strings are returned unchanged. A ``frozenset`` is first converted to a
    tuple. Every non-string value is then passed to the module-level
    ``encoder`` and its ``encode`` result is returned.
    """
    if isinstance(obj, str):
        return obj
    payload = tuple(obj) if isinstance(obj, frozenset) else obj
    return encoder.encode(payload)

In [5]:
from tqdm.auto import tqdm
from pickle import UnpicklingError
from scripts.cache_generated_datasets import compute_fields_and_cache
# Columns that are never written to the exported CSV files.
_EXCLUDED_EXPORT_COLUMNS = (
    "cdvae_crystal", "fingerprint", "composition", "naive_validity",
    "spacegroup_number", "density")

# (marker, renames) pairs checked in order: the first marker contained in the
# name of the last transformation selects the column renames for the export.
_EXPORT_COLUMN_RENAMES = (
    ("CHGNet", {
        "energy_per_atom": "chgnet_energy_per_atom",
        "corrected_chgnet_ehull": "chgnet_e_above_hull_corrected",
    }),
    ("DFT", {
        "e_above_hull_corrected": "dft_e_above_hull_corrected",
        "e_uncorrected": "dft_e_uncorrected",
        "e_corrected": "dft_e_corrected",
    }),
)


def export_data(export_path, export_config):
    """Write every configured dataset to ``<dataset>/<transformations...>/data.csv.gz``.

    Each dataset is loaded with ``GeneratedDataset.from_cache``. If that
    raises ``FileNotFoundError`` or ``UnpicklingError``, the dataset is built
    with ``GeneratedDataset.from_transformations`` and passed through
    ``compute_fields_and_cache``. Columns are then renamed according to the
    last transformation, the columns in ``_EXCLUDED_EXPORT_COLUMNS`` are
    dropped when present, every remaining cell is converted with ``to_json``,
    and the frame is written as CSV with the index labelled ``material_id``.

    The loaded dataset's frame is not modified: renaming and dropping operate
    on copies.

    Args:
        export_path: Root directory of the export; created if missing.
        export_config: Mapping from dataset name to an iterable of
            transformation tuples. An empty tuple is accepted and is exported
            without any column renaming.
    """
    export_path = Path(export_path)
    export_path.mkdir(parents=True, exist_ok=True)
    for dataset, transformation_tuples in tqdm(export_config.items()):
        for these_transformations in tqdm(transformation_tuples):
            dataset_path = export_path.joinpath(
                dataset, *these_transformations, "data.csv.gz")
            print(f"Exporting {dataset_path}")
            dataset_path.parent.mkdir(parents=True, exist_ok=True)
            try:
                dataset_processed = GeneratedDataset.from_cache(
                    transformations=these_transformations,
                    dataset=dataset)
            except (FileNotFoundError, UnpicklingError):
                # Missing or unreadable cache entry: rebuild and cache it.
                dataset_raw = GeneratedDataset.from_transformations(
                    transformations=these_transformations,
                    dataset=dataset)
                dataset_processed = compute_fields_and_cache(dataset_raw)
            # flatten_config can register an empty tuple (a data key at the top
            # level of a dataset config); guard the [-1] lookup against it.
            last_transformation = (
                these_transformations[-1] if these_transformations else "")
            column_renames = next(
                (renames for marker, renames in _EXPORT_COLUMN_RENAMES
                 if marker in last_transformation),
                {})
            export_frame = (
                dataset_processed.data
                .rename(columns=column_renames)
                # errors="ignore": not every dataset carries every excluded column.
                .drop(columns=list(_EXCLUDED_EXPORT_COLUMNS), errors="ignore"))
            export_frame.map(to_json).to_csv(
                dataset_path, index_label="material_id")

In [6]:
# Root directory of the public export.
PUBLIC_EXPORT_DIR = Path("generated_public")
# OmegaConf.save does not create missing parent directories, and export_data
# (which does create them) only runs afterwards. Create the root first so the
# cell also works on a fresh checkout.
PUBLIC_EXPORT_DIR.mkdir(parents=True, exist_ok=True)
OmegaConf.save(all_data, str(PUBLIC_EXPORT_DIR / "datasets.yaml"))
export_data(PUBLIC_EXPORT_DIR, export_configs["generated_public"])

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

Exporting generated_public/mp_20/WyckoffTransformer-letters/data.csv.gz
Exporting generated_public/mp_20/CrystalFormer/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++10k/CHGNet_free/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/data.csv.gz
Exporting generated_public/mp_20/SymmCD/data.csv.gz
Exporting generated_public/mp_20/UN-DiffCSP++44/data.csv.gz
Exporting generated_public/mp_20/WyckoffLLM-site-symmetry/data.csv.gz
Exporting generated_public/mp_20/WyckoffLLM-naive/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/CrySPR/CHGNet_free/data.csv.gz
Exporting generated_public/mp_20/DiffCSP++/CHGNet_fix_release/data.csv.gz
Exporting generated_public/mp_20/split/test/data.csv.gz
Exporting generated_public/mp_20/split/train/data.csv.gz
Exporting generated_public/mp_20/Mi





  0%|          | 0/1000 [00:00<?, ?it/s]

Exporting generated_public/mp_20/WyckoffLLM-vanilla/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-letters/DiffCSP++/CHGNet_fix/data.csv.gz
Read 996 CIFs
Valid records: 959


Attempt 0 failed to convert structure Full Formula (Sr4 I2 Br2 N24)
Reduced Formula: Sr2IBrN12
abc   :  10.547454   5.986636   7.900035
angles:  90.000000  86.838987  90.000000
pbc   :       True       True       True
Sites (32)
  #  SP           a        b         c
---  ----  --------  -------  --------
  0  Sr    0.152481  0        0.792998
  1  Sr    0.847519  0        0.207002
  2  Sr    0.652481  0.5      0.792998
  3  Sr    0.347519  0.5      0.207002
  4  I     0         0.5      0.5
  5  I     0.5       0        0.5
  6  Br    0         0.5      0
  7  Br    0.5       0        0
  8  N     0.904649  0        0.810172
  9  N     0.095351  0        0.189828
 10  N     0.404649  0.5      0.810172
 11  N     0.595351  0.5      0.189828
 12  N     0.330788  0        0.562134
 13  N     0.669212  0        0.437866
 14  N     0.830788  0.5      0.562134
 15  N     0.169212  0.5      0.437866
 16  N     0.675624  0        0.439222
 17  N     0.324376  0        0.560778
 18  N     0.17



  0%|          | 0/959 [00:00<?, ?it/s]

Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-letters/DiffCSP++/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-harmonic/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-harmonic/DiffCSP++/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-harmonic/DiffCSP++/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/CrySPR/CHGNet_fix_release/data.csv.gz
Exporting generated_public/mp_20/DiffCSP/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DFT/data.csv.gz
Exporting generated_public/mp_20/UN-DiffCSP++17/data.csv.gz
Exporting generated_public/mp_20/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++10k/CHGNet_free/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/CrySPR/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/CrySPR/CHGNet_fix/DFT/data.csv.gz

spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: No centring was found.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: P



  0%|          | 0/9852 [00:00<?, ?it/s]

Exporting generated_public/mp_20/CrystalFormer/data.csv.gz
Exporting generated_public/mp_20/DiffCSP/DFT/data.csv.gz
Exporting generated_public/mp_20/FlowMM-relaxed/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer-letters/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/DiffCSP/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/MiAD/CHGNet_free/data.csv.gz
Exporting generated_public/mp_20/WyckoffLLM-vanilla/data.csv.gz
Exporting generated_public/mp_20/UN-DiffCSP++42/data.csv.gz


Attempt 0 failed to convert structure Full Formula (Cu4 Ni32 S32)
Reduced Formula: Cu(NiS)8
abc   :  10.829515  10.829515  10.829515
angles:  90.000000  90.000000  90.000000
pbc   :       True       True       True
Sites (68)
  #  SP           a         b         c
---  ----  --------  --------  --------
  0  S     0.747662  0         0
  1  S     0.252338  0         0
  2  S     0         0.747662  0
  3  S     0         0.252338  0
  4  S     0         0         0.747662
  5  S     0         0         0.252338
  6  S     0.747662  0.5       0.5
  7  S     0.252338  0.5       0.5
  8  S     0         0.247662  0.5
  9  S     0         0.752338  0.5
 10  S     0         0.5       0.247662
 11  S     0         0.5       0.752338
 12  S     0.247662  0         0.5
 13  S     0.752338  0         0.5
 14  S     0.5       0.747662  0.5
 15  S     0.5       0.252338  0.5
 16  S     0.5       0         0.247662
 17  S     0.5       0         0.752338
 18  S     0.247662  0.5       0
 19  S   



  0%|          | 0/27135 [00:00<?, ?it/s]

Exporting generated_public/mp_20/SymmCD/CHGNet_fix/data.csv.gz
Exporting generated_public/mp_20/SymmCD/DFT/data.csv.gz
Exporting generated_public/mp_20/WyckoffLLM-naive/DiffCSP++/data.csv.gz
Exporting generated_public/mp_20/CrystalFormer/CHGNet_fix_release/data.csv.gz


Attempt 0 failed to convert structure Full Formula (Sr4 Co2 Re2 O12)
Reduced Formula: Sr2CoReO6
abc   :   5.565280   5.358300   9.820380
angles:  90.000000 122.818000  90.000000
pbc   :       True       True       True
Sites (20)
  #  SP          a        b        c
---  ----  -------  -------  -------
  0  Sr    0.75461  0.00935  0.75283
  1  Sr    0.24539  0.50935  0.74717
  2  Sr    0.24539  0.99065  0.24717
  3  Sr    0.75461  0.49065  0.25283
  4  Co    0        0        0.5
  5  Co    0        0.5      0
  6  Re    0.5      0        0
  7  Re    0.5      0.5      0.5
  8  O     0.20445  0.73291  0.48016
  9  O     0.79555  0.23291  0.01984
 10  O     0.79555  0.26709  0.51984
 11  O     0.20445  0.76709  0.98016
 12  O     0.21609  0.77349  0.97633
 13  O     0.78391  0.27349  0.52367
 14  O     0.78391  0.22651  0.02367
 15  O     0.21609  0.72651  0.47633
 16  O     0.6887   0.49753  0.73744
 17  O     0.3113   0.99753  0.76256
 18  O     0.3113   0.50247  0.26256
 19  O     0.



  0%|          | 0/992 [00:00<?, ?it/s]

Exporting generated_public/mp_20/split/val/data.csv.gz
Exporting generated_public/mp_20/WyckoffTransformer/DiffCSP++10k/CHGNet_free/DFT-GGA-relax-1/data.csv.gz
Exporting generated_public/mp_20/UN-DiffCSP++43/data.csv.gz
Exporting generated_public/mp_20/MiAD/CHGNet_free/DFT-GGA-relax-1/data.csv.gz


spglib: No centring was found.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: s



  0%|          | 0/9887 [00:00<?, ?it/s]

Exporting generated_public/mp_20/WyckoffTransformerNew/data.csv.gz


  0%|          | 0/8 [00:00<?, ?it/s]

Exporting generated_public/mp_20_biternary/split/test/data.csv.gz
Parsing CIFs...






  0%|          | 0/7118 [00:00<?, ?it/s]

Exporting generated_public/mp_20_biternary/WyCryst/CrySPR/CHGNet_fix/DFT/data.csv.gz






  0%|          | 0/103 [00:00<?, ?it/s]

Exporting generated_public/mp_20_biternary/split/train/data.csv.gz
Parsing CIFs...






  0%|          | 0/21317 [00:00<?, ?it/s]

Exporting generated_public/mp_20_biternary/WyCryst/CrySPR/CHGNet_fix/data.csv.gz






  0%|          | 0/994 [00:00<?, ?it/s]

Exporting generated_public/mp_20_biternary/WyckoffTransformer/CrySPR/CHGNet_fix/data.csv.gz
   Unnamed: 0      formula                                           file_dir  \
0           0       K6 Te4  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   
1           1       Fe2 W2  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   
2           2   Tb3 Nd3 S6  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   
3           3  Sr4 Br8 O32  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   
4           4  Ac4 Sn4 Ir8  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   

                                           error_str  folder_ind  \
0                                           no_error         224   
1                                           no_error         460   
2  No valid decomposition found for PDEntry : Tb3...         546   
3                                           no_error         945   
4                                           no_error         382   

   chgnet_eh





  0%|          | 0/999 [00:00<?, ?it/s]

Exporting generated_public/mp_20_biternary/split/val/data.csv.gz
Parsing CIFs...






  0%|          | 0/7140 [00:00<?, ?it/s]

Exporting generated_public/mp_20_biternary/WyckoffTransformer/data.csv.gz
   Unnamed: 0      formula                                           file_dir  \
0           0       K6 Te4  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   
1           1       Fe2 W2  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   
2           2   Tb3 Nd3 S6  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   
3           3  Sr4 Br8 O32  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   
4           4  Ac4 Sn4 Ir8  temp_files/nikita_Tformer_gene2/wyckoff_genes_...   

                                           error_str  folder_ind  \
0                                           no_error         224   
1                                           no_error         460   
2  No valid decomposition found for PDEntry : Tb3...         546   
3                                           no_error         945   
4                                           no_error         382   

   chgnet_ehull  corrected_chg



  0%|          | 0/7 [00:00<?, ?it/s]

Exporting generated_public/mpts_52/split/test/data.csv.gz
Parsing CIFs...


spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.




  0%|          | 0/8096 [00:00<?, ?it/s]

Exporting generated_public/mpts_52/split/train/data.csv.gz
Parsing CIFs...


spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.




  0%|          | 0/27380 [00:00<?, ?it/s]

Exporting generated_public/mpts_52/SymmCD/CHGNet_fix/data.csv.gz
Read 9170 CIFs


Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.


Valid records: 8890


ERROR:scripts.data:Attempt 0 failed to convert structure Full Formula (Ta1 Cu1 H6 N2 Cl2 O18)
Reduced Formula: TaCuH6N2(ClO9)2
abc   :   6.293473   5.646473  10.684729
angles:  80.702843  97.080959  94.641670
pbc   :       True       True       True
Sites (30)
  #  SP           a         b         c
---  ----  --------  --------  --------
  0  Ta    0         0         0
  1  Cu    0         0.5       1
  2  H     0.71503   0.495429  0.685904
  3  H     0.28497   0.504571  0.314096
  4  H     0.645474  0.658189  0.347981
  5  H     0.354526  0.341811  0.652019
  6  H     0.967011  0.386342  0.788459
  7  H     0.032989  0.613658  0.211541
  8  N     0.767586  0.616644  0.519429
  9  N     0.232414  0.383356  0.480571
 10  Cl    0.868324  0.041336  0.253038
 11  Cl    0.131676  0.958664  0.746962
 12  O     0.709188  0.657247  0.633825
 13  O     0.290812  0.342753  0.366175
 14  O     0.7735    0.044577  0.378012
 15  O     0.2265    0.955423  0.621988
 16  O     0.357441  0.945886  0.



  0%|          | 0/8890 [00:00<?, ?it/s]

Exporting generated_public/mpts_52/WyckoffTransformer/CrySPR/CHGNet_fix/data.csv.gz
Exporting generated_public/mpts_52/split/val/data.csv.gz
Parsing CIFs...


spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.
spglib: get_bravais_exact_positions_and_lattice failed.
spglib: ssm_get_exact_positions failed.




  0%|          | 0/5000 [00:00<?, ?it/s]

Exporting generated_public/mpts_52/WyckoffTransformer/data.csv.gz
Exporting generated_public/mpts_52/SymmCD/data.csv.gz






  0%|          | 0/9170 [00:00<?, ?it/s]