In [1]:
from pymatgen.io.cif import CifParser
from pymatgen.core import Structure
def read_cif(cif: str) -> Structure|None:
    """
    Parse a CIF string and return the first structure it contains.

    Args:
        cif: Text of a CIF document.

    Returns:
        The first entry of the list produced by
        ``CifParser.parse_structures(primitive=False)``, or ``None`` when
        parsing raises ``ValueError``.
    """
    try:
        # Parser construction stays inside the try block so that a ValueError
        # from any step of the chain is turned into None.
        parser = CifParser.from_str(cif)
        parsed = parser.parse_structures(primitive=False)
        first_structure = parsed[0]
    except ValueError:
        first_structure = None
    return first_structure

In [2]:
from pathlib import Path
import sys
sys.path.append("../..")
from evaluation.cdvae_metrics import structure_validity
from operator import methodcaller
import pandas as pd
def guaraded_structure_validity(structure: Structure) -> bool:
    """
    Run ``structure_validity`` on a structure, treating ``ValueError`` as invalid.

    Args:
        structure: Structure to check.

    Returns:
        Whatever ``structure_validity(structure)`` returns, or ``False`` if
        that call raises ``ValueError``.
    """
    # NOTE: the function name is misspelled ("guaraded"); it is kept as-is
    # because filter_cifs refers to it by this exact name.
    try:
        is_valid = structure_validity(structure)
    except ValueError:
        is_valid = False
    return is_valid
def filter_cifs(input_cif_path: Path, output_json_path: Path) -> None:
    """
    Filter CIFs to retain only valid structures.

    The input is read as a CSV without a header row into a column named
    'cif'. Each entry is parsed with ``read_cif``; entries for which it
    returns ``None`` are dropped. The remaining structures are kept only if
    ``guaraded_structure_validity`` is true for them, and the survivors are
    written to ``output_json_path`` as a JSON list of their ``as_dict()``
    representations. The three stage counts are printed along the way.

    Args:
        input_cif_path: CSV file holding the CIF strings.
        output_json_path: Destination of the JSON output.
    """
    # Stage 1: load the raw CIF strings as a Series.
    cif_table = pd.read_csv(input_cif_path, header=None, names=['cif'])
    cif_strings = cif_table.squeeze("columns")
    print(f"CIFs contain {len(cif_strings)} structures")

    # Stage 2: parse; read_cif yields None on failure, which dropna removes.
    parsed_structures = cif_strings.map(read_cif).dropna()
    print(f"Parsed {len(parsed_structures)} structures")

    # Stage 3: keep only the structures that pass the validity check.
    validity_mask = parsed_structures.map(guaraded_structure_validity)
    surviving_structures = parsed_structures[validity_mask]
    print(f"Filtered to {len(surviving_structures)} valid structures")

    # Stage 4: serialise every surviving structure via as_dict() and save.
    to_dict = methodcaller("as_dict")
    surviving_structures.map(to_dict).to_json(output_json_path, orient="records")

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


In [3]:
# Directory holding the generated samples, relative to this notebook.
generated_path = Path("..", "..", "generated", "Dropbox")
# Filter the SymmCD samples for mp_20.
filter_cifs(
    input_cif_path=generated_path / 'mp_20/SymmCD/crystal_symmcd_mp20.csv.gz',
    output_json_path=generated_path / 'mp_20/SymmCD/crystal_symmcd_mp20_valid.json.gz',
)

CIFs contain 10000 structures


Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
Occupancy 6.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 6.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 6.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
Some occupancies ([2.0, 2.0]) sum to > 1! If they are within the occupancy_tolerance, they will be rescaled. The current occupancy_tolerance is set to: 1.0
No structure par

Parsed 9847 structures
Filtered to 9475 valid structures


In [4]:
# Filter the SymmCD samples for mpts_52.
filter_cifs(
    input_cif_path=generated_path / 'mpts_52/SymmCD/crystal_symmcd_mpts52.csv.gz',
    output_json_path=generated_path / 'mpts_52/SymmCD/crystal_symmcd_mpts52_valid.json.gz',
)

CIFs contain 10000 structures


Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
Occupancy 6.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 6.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 2.0 exceeded tolerance.
Occupancy 3.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 3.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 6.0 exceeded tolerance.
Occupancy 7.0 exceeded tolerance.
No structure parsed for section 1 in CIF.
Occupancy 7.0 exceeded tolerance.
No structure parsed for sect

Parsed 9661 structures
Filtered to 9170 valid structures
