Fix loading 2023-02-07-ppd-mp.pkl.gz (#26)
* fix fetch_process_wbm_dataset.py after pandas v2 breaking changes

* drop pytest-markdown-docs from optional deps

* fix double slash in PRED_FILES

* bump deps

* update docs, DataFiles, and matbench_discovery/energy.py with the new (2023-02-07) MP elemental reference energies (closes #23)

* update 2022-10-19-wbm-summary.csv formation energies with 2023-02-07 element reference energies

compress data/mp/2023-02-07-mp-elemental-reference-entries.json.gz
update data/figshare/1.0.0.json file links

* pin pandas>=2.0.0

#22 (comment)
mark test_load_train_test_no_mock() for mp_computed_structure_entries as very_slow

* load_train_test(): support loading and caching pickle files (for mp_patched_phase_diagram)

change signature from data_names (str | list[str], optional) = 'all' to data_key (str)

* rename load_train_test() to load()
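
A minimal usage sketch of the renamed API (signature and behavior per the matbench_discovery/data.py diff below):

```python
from matbench_discovery.data import load

# one data_key per call -- the old data_names list/'all' argument is gone
df_wbm = load("wbm_summary")  # downloads from Figshare on first call, then cached
ppd = load("mp_patched_phase_diagram")  # pickle file, returns a PatchedPhaseDiagram
```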
janosh committed Jun 20, 2023
1 parent 44a6873 commit b2e98a0
Showing 14 changed files with 165 additions and 171 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@
 data/**/raw
 data/**/tsne
 data/2022-*
 data/m3gnet-*
+!data/mp/2023-02-07-mp-elemental-reference-entries.json.gz

 # slurm + Weights and Biases logs
 wandb/
4 changes: 2 additions & 2 deletions data/figshare/1.0.0.json
@@ -1,10 +1,10 @@
 {
   "mp_computed_structure_entries": "https://figshare.com/ndownloader/files/40344436",
-  "mp_elemental_ref_entries": "https://figshare.com/ndownloader/files/40344445",
+  "mp_elemental_ref_entries": "https://figshare.com/ndownloader/files/40387775",
   "mp_energies": "https://figshare.com/ndownloader/files/40344448",
   "mp_patched_phase_diagram": "https://figshare.com/ndownloader/files/40344451",
   "wbm_computed_structure_entries": "https://figshare.com/ndownloader/files/40344463",
   "wbm_initial_structures": "https://figshare.com/ndownloader/files/40344466",
   "wbm_cses_plus_init_structs": "https://figshare.com/ndownloader/files/40344469",
-  "wbm_summary": "https://figshare.com/ndownloader/files/40344475"
+  "wbm_summary": "https://figshare.com/ndownloader/files/40407575"
 }
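
load() resolves these download links by reading the versioned manifest. A short sketch (path assumed relative to the repo root):

```python
import json

with open("data/figshare/1.0.0.json") as json_file:
    file_urls = json.load(json_file)

print(file_urls["wbm_summary"])  # https://figshare.com/ndownloader/files/40407575
```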
Binary file not shown.
8 changes: 5 additions & 3 deletions data/mp/build_phase_diagram.py
@@ -88,8 +88,8 @@
     wbm_computed_entries + mp_computed_entries, verbose=True
 )

-# save MP+WBM PPD to disk (not run)
-with gzip.open(f"{module_dir}/{today}-ppd-mp.pkl.gz", "wb") as zip_file:
+# save MP+WBM PPD to disk (was not run)
+with gzip.open(f"{module_dir}/{today}-ppd-mp-wbm.pkl.gz", "wb") as zip_file:
     pickle.dump(mp_wbm_ppd, zip_file)

@@ -98,7 +98,9 @@
 elemental_ref_entries = get_elemental_ref_entries(mp_computed_entries)

 # save elemental_ref_entries to disk as json
-with open(f"{ROOT}/data/mp/{today}-mp-elemental-reference-entries.json", "w") as file:
+with gzip.open(
+    f"{ROOT}/data/mp/{today}-mp-elemental-reference-entries.json.gz", "wt"
+) as file:
     json.dump(elemental_ref_entries, file, default=lambda x: x.as_dict())
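
Reading both artifacts back mirrors the write code above. A hedged sketch (the pickle load is what data.load() now does internally; decoding the reference entries with MontyDecoder assumes they were serialized via as_dict(), as in the json.dump call):

```python
import gzip
import json
import pickle

from monty.json import MontyDecoder

# load the pickled PatchedPhaseDiagram back from disk
with gzip.open("data/mp/2023-02-07-ppd-mp.pkl.gz", "rb") as zip_file:
    ppd_mp = pickle.load(zip_file)

# load the gzipped elemental reference entries back into pymatgen objects
with gzip.open(
    "data/mp/2023-02-07-mp-elemental-reference-entries.json.gz", "rt"
) as file:
    elemental_ref_entries = json.load(file, cls=MontyDecoder)
```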
54 changes: 32 additions & 22 deletions data/wbm/fetch_process_wbm_dataset.py
@@ -222,7 +222,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:


 # %%
-df_wbm["computed_structure_entry"] = pd.concat(dfs_wbm_cses.values()).to_numpy()
+df_wbm["computed_structure_entry"] = np.concatenate([*dfs_wbm_cses.values()]).squeeze()

 for mat_id, cse in df_wbm.computed_structure_entry.items():
     # needed to ensure MaterialsProjectCompatibility can process the entries

@@ -319,9 +319,9 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 )


-assert sum(df_summary.index == "None") == 6
+assert sum(no_id_mask := df_summary.index.isna()) == 6, f"{sum(no_id_mask)=}"
 # the 'None' materials have 0 volume, energy, n_sites, bandgap, etc.
-assert all(df_summary[df_summary.index == "None"].drop(columns=["formula"]) == 0)
+assert all(df_summary[no_id_mask].drop(columns=["formula"]) == 0)
 assert len(df_summary.query("volume > 0")) == len(df_wbm) + len(nan_init_structs_ids)
 # make sure dropping materials with 0 volume removes exactly 6 materials, the same ones
 # listed in bad_struct_ids above

@@ -332,14 +332,22 @@ def increment_wbm_material_id(wbm_id: str) -> str:

 df_summary.index = df_summary.index.map(increment_wbm_material_id)  # format IDs
 # drop materials with id='None' and missing initial structures
-df_summary = df_summary.drop(index=[*nan_init_structs_ids, "None"])
+df_summary = df_summary.drop(index=[*nan_init_structs_ids, float("NaN")])

 # the 8403 material IDs in step 3 with final number larger than any of the ones in
 # bad_struct_ids are now misaligned between df_summary and df_wbm
 # the IDs in df_summary are consecutive while the IDs in df_wbm skip over the numbers in
 # bad_struct_ids. we fix this with fix_bad_struct_index_mismatch() by mapping the IDs in
 # df_wbm to the ones in df_summary so that both indices become consecutive.
 assert sum(df_summary.index != df_wbm.index) == 8403
+assert {*df_summary.index} - {*df_wbm.index} == {
+    "wbm-3-70803",
+    "wbm-3-70804",
+    "wbm-3-70826",
+    "wbm-3-70827",
+    "wbm-3-70829",
+    "wbm-3-70830",
+}


 def fix_bad_struct_index_mismatch(material_id: str) -> str:

@@ -559,7 +567,6 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 assert sum(df_wbm.index != df_summary.index) == 0

 e_form_col = "e_form_per_atom_uncorrected"
-assert e_form_col not in df_summary

 for row in tqdm(df_wbm.itertuples(), total=len(df_wbm)):
     mat_id, cse, formula = row.Index, row.cse, row.formula_from_cse

@@ -568,17 +575,21 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:

     entry_like = dict(composition=formula, energy=cse.uncorrected_energy)
     e_form = get_e_form_per_atom(entry_like)
-    e_form_ppd = ppd_mp.get_form_energy_per_atom(cse)
+    e_form_ppd = ppd_mp.get_form_energy_per_atom(cse) - cse.correction_per_atom

-    correction = cse.correction_per_atom
     # make sure the PPD.get_e_form_per_atom() and standalone get_e_form_per_atom()
     # method of calculating formation energy agree
     assert (
-        abs(e_form - (e_form_ppd - correction)) < 1e-4
-    ), f"{mat_id=}: {e_form=:.5} != {e_form_ppd - correction=:.5}"
+        abs(e_form - e_form_ppd) < 1e-4
+    ), f"{mat_id}: {e_form=:.3} != {e_form_ppd=:.3} (diff={e_form - e_form_ppd:.3}))"
     df_summary.at[cse.entry_id, e_form_col] = e_form


+df_summary[e_form_col.replace("uncorrected", "mp2020_corrected")] = (
+    df_summary[e_form_col] + df_summary["e_correction_per_atom_mp2020"]
+)
+
+
 # %%
 try:
     from aviary.wren.utils import get_aflow_label_from_spglib

@@ -623,17 +634,16 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")


-# %% read summary data from disk
-df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
-    "material_id"
-)
-
-
-# %% read WBM initial structures and computed structure entries from disk
-df_wbm = pd.read_json(
-    f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
-).set_index("material_id")
+# %% only here to load data quickly for later inspection
+if False:
+    df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
+        "material_id"
+    )
+    df_wbm = pd.read_json(
+        f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
+    ).set_index("material_id")

-df_wbm["cse"] = [
-    ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
-]
+    df_wbm["cse"] = [
+        ComputedStructureEntry.from_dict(x)
+        for x in tqdm(df_wbm.computed_structure_entry)
+    ]
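
The refactored assert encodes the identity that PatchedPhaseDiagram.get_form_energy_per_atom() operates on MP2020-corrected energies, so subtracting cse.correction_per_atom puts it on the same uncorrected footing as the standalone get_e_form_per_atom(). A toy sketch of the standalone calculation (composition and energy are made-up numbers, and the hand formula assumes the standard definition of formation energy per atom):

```python
from matbench_discovery.energy import get_e_form_per_atom, mp_elemental_ref_energies

# hypothetical entry: total uncorrected energy of a 2-atom NaCl cell in eV
entry_like = dict(composition="NaCl", energy=-7.0)
e_form = get_e_form_per_atom(entry_like)

# by hand: (E_total - sum of elemental reference energies) / n_atoms
refs = mp_elemental_ref_energies
expected = (-7.0 - (refs["Na"] + refs["Cl"])) / 2
assert abs(e_form - expected) < 1e-10, f"{e_form=} vs {expected=}"
```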
133 changes: 63 additions & 70 deletions matbench_discovery/data.py
@@ -1,17 +1,19 @@
 from __future__ import annotations

+import gzip
 import json
 import os
+import pickle
 import sys
 import urllib.error
-from collections.abc import Sequence
+import urllib.request
 from glob import glob
 from pathlib import Path
 from typing import Any, Callable

 import pandas as pd
-from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry
+from monty.json import MontyDecoder
+from pymatgen.analysis.phase_diagram import PatchedPhaseDiagram
 from tqdm import tqdm

 from matbench_discovery import FIGSHARE
@@ -40,13 +42,13 @@ def as_dict_handler(obj: Any) -> dict[str, Any] | None:
     # removes e.g. non-serializable AseAtoms from M3GNet relaxation trajectories


-def load_train_test(
-    data_names: str | Sequence[str],
+def load(
+    data_key: str,
     version: str = figshare_versions[-1],
     cache_dir: str | Path = default_cache_dir,
     hydrate: bool = False,
     **kwargs: Any,
-) -> pd.DataFrame:
+) -> pd.DataFrame | PatchedPhaseDiagram:
     """Download parts of or the full MP training data and WBM test data as pandas
     DataFrames. The full training and test sets are each about ~500 MB as compressed
     JSON which will be cached locally to cache_dir for faster re-loading unless

@@ -56,8 +58,8 @@ def load_train_test(
     see https://janosh.github.io/matbench-discovery/contribute#--direct-download.

     Args:
-        data_names (str | list[str], optional): Which parts of the MP/WBM data to load.
-            Can be any subset of set(DATA_FILES) or 'all'.
+        data_key (str): Which parts of the MP/WBM data to load. Must be one of
+            list(DATA_FILES).
         version (str, optional): Which version of the dataset to load. Defaults to
             latest version of data files published to Figshare. Pass any invalid version
            to see valid options.
@@ -71,77 +73,68 @@ def load_train_test(
            depending on which file is loaded.

     Raises:
-        ValueError: On bad version number or bad data names.
+        ValueError: On bad version number or bad data_key.

     Returns:
         pd.DataFrame: Single dataframe or dictionary of dfs if multiple data requested.
     """
     if version not in figshare_versions:
         raise ValueError(f"Unexpected {version=}. Must be one of {figshare_versions}.")
-    if data_names == "all":
-        data_names = list(DATA_FILES)
-    elif isinstance(data_names, str):
-        data_names = [data_names]
-
-    if missing := set(data_names) - set(DATA_FILES):
-        raise ValueError(f"{missing} must be subset of {set(DATA_FILES)}")
+    if not isinstance(data_key, str) or data_key not in DATA_FILES:
+        raise ValueError(f"Unknown {data_key=}, must be one of {list(DATA_FILES)}.")

     with open(f"{FIGSHARE}/{version}.json") as json_file:
         file_urls = json.load(json_file)

-    dfs = {}
-    for key in data_names:
-        file = DataFiles.__dict__[key]
-        csv_ext = (".csv", ".csv.gz", ".csv.bz2")
-        reader = pd.read_csv if file.endswith(csv_ext) else pd.read_json
-
-        cache_path = f"{cache_dir}/{file}"
-        if os.path.isfile(cache_path):  # load from disk cache
-            print(f"Loading {key!r} from cached file at {cache_path!r}")
-            df = reader(cache_path, **kwargs)
-        else:  # download from Figshare URL
-            # manually set compression since pandas can't infer from URL
-            if file.endswith(".gz"):
-                kwargs.setdefault("compression", "gzip")
-            elif file.endswith(".bz2"):
-                kwargs.setdefault("compression", "bz2")
-            url = file_urls[key]
-            print(f"Downloading {key!r} from {url}")
-            try:
-                df = reader(url, **kwargs)
-            except urllib.error.HTTPError as exc:
-                raise ValueError(f"Bad {url=}") from exc
-            except Exception:
-                print(f"\n\nvariable dump:\n{file=},\n{url=},\n{reader=},\n{kwargs=}")
-                raise
-            if cache_dir and not os.path.isfile(cache_path):
-                os.makedirs(os.path.dirname(cache_path), exist_ok=True)
-                if ".csv" in file:
-                    df.to_csv(cache_path, index=False)
-                elif ".json" in file:
-                    df.to_json(cache_path, default_handler=as_dict_handler)
-                else:
-                    raise ValueError(f"Unexpected file type {file}")
-                print(f"Cached {key!r} to {cache_path!r}")
-        if "material_id" in df:
-            df = df.set_index("material_id")
-        if hydrate:
-            for col in df:
-                if not isinstance(df[col].iloc[0], dict):
-                    continue
-                try:
-                    df[col] = [
-                        ComputedStructureEntry.from_dict(d)
-                        for d in tqdm(df[col], desc=col)
-                    ]
-                except Exception:
-                    df[col] = [Structure.from_dict(d) for d in tqdm(df[col], desc=col)]
-
-        dfs[key] = df
-
-    if len(data_names) == 1:
-        return dfs[data_names[0]]
-    return dfs
+    file = DataFiles.__dict__[data_key]
+
+    cache_path = f"{cache_dir}/{file}"
+    if not os.path.isfile(cache_path):  # download from Figshare URL
+        url = file_urls[data_key]
+        print(f"Downloading {data_key!r} from {url}")
+        try:
+            # ensure directory exists
+            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+            # download and save to disk
+            urllib.request.urlretrieve(url, cache_path)
+            print(f"Cached {data_key!r} to {cache_path!r}")
+        except urllib.error.HTTPError as exc:
+            raise ValueError(f"Bad {url=}") from exc
+        except Exception:
+            print(f"\n\nvariable dump:\n{file=},\n{url=}")
+            raise
+
+    print(f"Loading {data_key!r} from cached file at {cache_path!r}")
+    if ".pkl" in file:  # handle key='mp_patched_phase_diagram' separately
+        with gzip.open(cache_path, "rb") as zip_file:
+            return pickle.load(zip_file)
+
+    csv_ext = (".csv", ".csv.gz", ".csv.bz2")
+    reader = pd.read_csv if file.endswith(csv_ext) else pd.read_json
+    try:
+        df = reader(cache_path, **kwargs)
+    except Exception:
+        print(f"\n\nvariable dump:\n{file=},\n{reader=}\n{kwargs=}")
+        raise
+
+    if "material_id" in df:
+        df = df.set_index("material_id")
+    if hydrate:
+        for col in df:
+            if not isinstance(df[col].iloc[0], dict):
+                continue
+            try:
+                # convert dicts to pymatgen Structures and ComputedStructureEntrys
+                df[col] = [
+                    MontyDecoder().process_decoded(dct)
+                    for dct in tqdm(df[col], desc=col)
+                ]
+            except Exception:
+                print(f"\n\nvariable dump:\n{col=},\n{df[col]=}")
+                raise
+
+    return df


 def glob_to_df(
@@ -228,20 +221,20 @@ class DataFiles(Files):
     def _on_not_found(self, key: str, msg: str) -> None:  # type: ignore[override]
         msg += (
             " Would you like to download it now using matbench_discovery."
-            f"data.load_train_test({key!r}). This will cache the file for future use."
+            f"data.load({key!r}). This will cache the file for future use."
         )

         # default to 'y' if not in interactive session, and user can't answer
         answer = "" if sys.stdin.isatty() else "y"
         while answer not in ("y", "n"):
             answer = input(f"{msg} [y/n] ").lower().strip()
         if answer == "y":
-            load_train_test(key)  # download and cache data file
+            load(key)  # download and cache data file

     mp_computed_structure_entries = (
         "mp/2023-02-07-mp-computed-structure-entries.json.gz"
     )
-    mp_elemental_ref_entries = "mp/2022-09-19-mp-elemental-reference-entries.json"
+    mp_elemental_ref_entries = "mp/2023-02-07-mp-elemental-reference-entries.json.gz"
     mp_energies = "mp/2023-01-10-mp-energies.csv"
     mp_patched_phase_diagram = "mp/2023-02-07-ppd-mp.pkl.gz"
     wbm_computed_structure_entries = (

@@ -254,9 +247,9 @@ def _on_not_found(self, key: str, msg: str) -> None:  # type: ignore[override]
     wbm_summary = "wbm/2022-10-19-wbm-summary.csv"


-# data files can be downloaded and cached with matbench_discovery.data.load_train_test()
+# data files can be downloaded and cached with matbench_discovery.data.load()
 DATA_FILES = DataFiles()


-df_wbm = load_train_test("wbm_summary")
+df_wbm = load("wbm_summary")
 df_wbm["material_id"] = df_wbm.index
10 changes: 4 additions & 6 deletions matbench_discovery/energy.py
@@ -70,15 +70,15 @@ def get_elemental_ref_entries(

 # tested to agree with TRI's MP reference energies
 # https://github.com/TRI-AMDD/CAMD/blob/1c965cba636531e542f4821a555b98b2d81ed034/camd/utils/data.py#L134
-# fmt: off
 mp_elemental_ref_energies = {
-    "Ne": -0.0259, "He": -0.0091, "Ar": -0.0688, "F": -1.9115, "O": -4.948, "Cl": -1.8485, "N": -8.3365, "Kr": -0.0567, "Br": -1.6369, "I": -1.524, "Xe": -0.0362, "S": -4.1364, "Se": -3.4959, "C": -9.2268, "Au": -3.2739, "W": -12.9581, "Pb": -3.7126, "Rh": -7.3643, "Pt": -6.0709, "Ru": -9.2744, "Pd": -5.1799, "Os": -11.2274, "Ir": -8.8384, "H": -3.3927, "P": -5.4133, "As": -4.6591, "Mo": -10.8456, "Te": -3.1433, "Sb": -4.129, "B": -6.6794, "Bi": -3.89, "Ge": -4.623, "Hg": -0.3037, "Sn": -4.0096, "Ag": -2.8326, "Ni": -5.7801, "Tc": -10.3606, "Si": -5.4253, "Re": -12.4445, "Cu": -4.0992, "Co": -7.1083, "Fe": -8.47, "Ga": -3.0281, "In": -2.7517, "Cd": -0.9229, "Cr": -9.653, "Zn": -1.2597, "V": -9.0839, "Tl": -2.3626, "Al": -3.7456, "Nb": -10.1013, "Be": -3.7394, "Mn": -9.162, "Ti": -7.8955, "Ta": -11.8578, "Pa": -9.5147, "U": -11.2914, "Sc": -6.3325, "Np": -12.9478, "Zr": -8.5477, "Mg": -1.6003, "Th": -7.4139, "Hf": -9.9572, "Pu": -14.2678, "Lu": -4.521, "Tm": -4.4758, "Er": -4.5677, "Ho": -4.5824, "Y": -6.4665, "Dy": -4.6068, "Gd": -14.0761, "Eu": -10.292, "Sm": -4.7186, "Nd": -4.7681, "Pr": -4.7809, "Pm": -4.7505, "Ce": -5.9331, "Yb": -1.5396, "Tb": -4.6344, "La": -4.936, "Ac": -4.1212, "Ca": -2.0056, "Li": -1.9089, "Sr": -1.6895, "Na": -1.3225, "Ba": -1.919, "Rb": -0.9805, "K": -1.1104, "Cs": -0.8954,  # noqa: E501
+    elem: round(entry.energy_per_atom, 4)
+    for elem, entry in mp_elem_reference_entries.items()
 }
-# fmt: on


 def get_e_form_per_atom(
-    entry: EntryLike, elemental_ref_energies: dict[str, float] = None
+    entry: EntryLike,
+    elemental_ref_energies: dict[str, float] = mp_elemental_ref_energies,
 ) -> float:
     """Get the formation energy of a composition from a list of entries and a dict
     mapping elements to reference energies.
@@ -96,8 +96,6 @@ def get_e_form_per_atom(
     Returns:
         float: formation energy in eV/atom.
     """
-    elemental_ref_energies = elemental_ref_energies or mp_elemental_ref_energies
-
     if isinstance(entry, dict):
         energy = entry["energy"]
         comp = Composition(entry["composition"])  # is idempotent if already Composition
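
Binding the default directly to mp_elemental_ref_energies makes the old `elemental_ref_energies or ...` fallback dead code, hence its removal. Custom references can still be passed explicitly; a brief sketch (all numbers hypothetical):

```python
from matbench_discovery.energy import get_e_form_per_atom

entry = dict(composition="Fe2O3", energy=-38.5)  # hypothetical total energy in eV

e_form_mp = get_e_form_per_atom(entry)  # uses MP elemental reference energies

my_refs = {"Fe": -8.4, "O": -4.9}  # e.g. from your own DFT settings
e_form_custom = get_e_form_per_atom(entry, elemental_ref_energies=my_refs)
```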