In [1]:
import gzip
import json
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from monty.json import MontyDecoder
from pymatgen.io.cif import CifWriter
import matbench_discovery.data
import matbench_discovery.energy

In [2]:
# Load the CSV referenced by matbench_discovery's `DataFiles.mp_energies`
# into a DataFrame indexed by its "material_id" column.
mp_energies = pd.read_csv(
    filepath_or_buffer=matbench_discovery.data.DataFiles.mp_energies.path,
    index_col="material_id",
)

In [3]:
# Read the gzip-compressed JSON file holding the computed structure entries.
# The encoding is pinned to UTF-8: in text mode, `gzip.open` otherwise falls
# back to the locale's preferred encoding, which is not UTF-8 on every
# platform and could mis-decode non-ASCII characters in the JSON payload.
with gzip.open(
    matbench_discovery.data.DataFiles.mp_computed_structure_entries.path,
    "rt",
    encoding="utf-8",
) as f:
    mp_2022_raw = json.load(f)

In [None]:
# Build one row per material: the structure serialized as a CIF string plus
# the formation energy per atom computed by matbench_discovery.
#
# `mp_2022_raw` is read column-wise: the keys of mp_2022_raw["material_id"]
# are reused to look up the matching serialized entry in mp_2022_raw["entry"].
decoder = MontyDecoder()
serialized_entries = mp_2022_raw["entry"]
mp_2022_processed = {}
for row_key, mat_id in mp_2022_raw["material_id"].items():
    # Turn the plain-JSON dict back into the object it was serialized from.
    decoded_entry = decoder.process_decoded(serialized_entries[row_key])
    cif_text = str(CifWriter(decoded_entry.structure))
    e_form = matbench_discovery.energy.get_e_form_per_atom(decoded_entry)
    mp_2022_processed[mat_id] = {
        "cif": cif_text,
        "formation_energy_per_atom": e_form,
    }

# orient="index" makes the material ids the DataFrame index.
mp_2022 = pd.DataFrame.from_dict(mp_2022_processed, orient="index")

In [11]:
# Sanity check: the formation energies computed for `mp_2022` must agree with
# the values stored in `mp_energies` to within 1e-4 for every shared id.
# NOTE(review): `reindex` yields NaN for ids missing from `mp_2022`, and
# `Series.max()` skips NaN by default, so missing materials would not be
# caught by this check -- confirm that is acceptable.
e_form_recomputed = mp_2022.reindex(mp_energies.index).formation_energy_per_atom
e_form_reference = mp_energies.formation_energy_per_atom
max_abs_deviation = (e_form_recomputed - e_form_reference).abs().max()
assert max_abs_deviation < 1e-4

In [None]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train, val = train_test_split(mp_2022, test_size=0.1, random_state=42)

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists before writing (a fresh checkout would otherwise
# fail here with an OSError).
output_dir = Path("data", "matbench_discovery_mp_2022")
output_dir.mkdir(parents=True, exist_ok=True)

# The ".gz" suffix lets pandas infer gzip compression; the index is written
# out under the "material_id" column label.
train.to_csv(output_dir / "train.csv.gz", index_label="material_id")
val.to_csv(output_dir / "val.csv.gz", index_label="material_id")

In [5]:
# Report the row counts of the full dataset, the training split and the
# validation split, space-separated on one line.
print(*map(len, (mp_2022, train, val)))

154718 139246 15472
