Fix loading 2023-02-07-ppd-mp.pkl.gz (#26)
* fix fetch_process_wbm_dataset.py after pandas v2 breaking changes

* drop pytest-markdown-docs from optional deps

* fix double slash in PRED_FILES

* bump deps

* update docs, DataFiles, and matbench_discovery/energy.py with the new (2023-02-07) MP elemental reference energies (closes #23)

* update 2022-10-19-wbm-summary.csv formation energies with 2023-02-07 element reference energies

compress data/mp/2023-02-07-mp-elemental-reference-entries.json.gz
update data/figshare/1.0.0.json file links

* pin pandas>=2.0.0

#22 (comment)
mark test_load_train_test_no_mock() for mp_computed_structure_entries as very_slow

* load_train_test(): support loading and caching pickle files (for mp_patched_phase_diagram)

change signature from data_names (str | list[str], optional) = 'all' to data_key (str)

* rename load_train_test() to load()
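
A minimal usage sketch of the renamed API (signature and behavior per the matbench_discovery/data.py diff below):

```python
from matbench_discovery.data import load

# one data_key per call -- the old data_names list/'all' argument is gone
df_wbm = load("wbm_summary")  # downloads from Figshare on first call, then cached
ppd = load("mp_patched_phase_diagram")  # pickle file, returns a PatchedPhaseDiagram
```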
janosh committed Jun 20, 2023
1 parent 44a6873 commit b2e98a0
Showing 14 changed files with 165 additions and 171 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@
 data/**/raw
 data/**/tsne
 data/2022-*
 data/m3gnet-*
+!data/mp/2023-02-07-mp-elemental-reference-entries.json.gz

 # slurm + Weights and Biases logs
 wandb/
4 changes: 2 additions & 2 deletions data/figshare/1.0.0.json
@@ -1,10 +1,10 @@
 {
   "mp_computed_structure_entries": "https://figshare.com/ndownloader/files/40344436",
-  "mp_elemental_ref_entries": "https://figshare.com/ndownloader/files/40344445",
+  "mp_elemental_ref_entries": "https://figshare.com/ndownloader/files/40387775",
   "mp_energies": "https://figshare.com/ndownloader/files/40344448",
   "mp_patched_phase_diagram": "https://figshare.com/ndownloader/files/40344451",
   "wbm_computed_structure_entries": "https://figshare.com/ndownloader/files/40344463",
   "wbm_initial_structures": "https://figshare.com/ndownloader/files/40344466",
   "wbm_cses_plus_init_structs": "https://figshare.com/ndownloader/files/40344469",
-  "wbm_summary": "https://figshare.com/ndownloader/files/40344475"
+  "wbm_summary": "https://figshare.com/ndownloader/files/40407575"
 }
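
load() resolves these download links by reading the versioned manifest. A short sketch (path assumed relative to the repo root):

```python
import json

with open("data/figshare/1.0.0.json") as json_file:
    file_urls = json.load(json_file)

print(file_urls["wbm_summary"])  # https://figshare.com/ndownloader/files/40407575
```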
Binary file not shown.
8 changes: 5 additions & 3 deletions data/mp/build_phase_diagram.py
@@ -88,8 +88,8 @@
     wbm_computed_entries + mp_computed_entries, verbose=True
 )

-# save MP+WBM PPD to disk (not run)
-with gzip.open(f"{module_dir}/{today}-ppd-mp.pkl.gz", "wb") as zip_file:
+# save MP+WBM PPD to disk (was not run)
+with gzip.open(f"{module_dir}/{today}-ppd-mp-wbm.pkl.gz", "wb") as zip_file:
     pickle.dump(mp_wbm_ppd, zip_file)

@@ -98,7 +98,9 @@
 elemental_ref_entries = get_elemental_ref_entries(mp_computed_entries)

 # save elemental_ref_entries to disk as json
-with open(f"{ROOT}/data/mp/{today}-mp-elemental-reference-entries.json", "w") as file:
+with gzip.open(
+    f"{ROOT}/data/mp/{today}-mp-elemental-reference-entries.json.gz", "wt"
+) as file:
     json.dump(elemental_ref_entries, file, default=lambda x: x.as_dict())
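
Reading both artifacts back mirrors the write code above. A hedged sketch (the pickle load is what data.load() now does internally; decoding the reference entries with MontyDecoder assumes they were serialized via as_dict(), as in the json.dump call):

```python
import gzip
import json
import pickle

from monty.json import MontyDecoder

# load the pickled PatchedPhaseDiagram back from disk
with gzip.open("data/mp/2023-02-07-ppd-mp.pkl.gz", "rb") as zip_file:
    ppd_mp = pickle.load(zip_file)

# load the gzipped elemental reference entries back into pymatgen objects
with gzip.open(
    "data/mp/2023-02-07-mp-elemental-reference-entries.json.gz", "rt"
) as file:
    elemental_ref_entries = json.load(file, cls=MontyDecoder)
```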
54 changes: 32 additions & 22 deletions data/wbm/fetch_process_wbm_dataset.py
@@ -222,7 +222,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:


 # %%
-df_wbm["computed_structure_entry"] = pd.concat(dfs_wbm_cses.values()).to_numpy()
+df_wbm["computed_structure_entry"] = np.concatenate([*dfs_wbm_cses.values()]).squeeze()

 for mat_id, cse in df_wbm.computed_structure_entry.items():
     # needed to ensure MaterialsProjectCompatibility can process the entries

@@ -319,9 +319,9 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 )


-assert sum(df_summary.index == "None") == 6
+assert sum(no_id_mask := df_summary.index.isna()) == 6, f"{sum(no_id_mask)=}"
 # the 'None' materials have 0 volume, energy, n_sites, bandgap, etc.
-assert all(df_summary[df_summary.index == "None"].drop(columns=["formula"]) == 0)
+assert all(df_summary[no_id_mask].drop(columns=["formula"]) == 0)
 assert len(df_summary.query("volume > 0")) == len(df_wbm) + len(nan_init_structs_ids)
 # make sure dropping materials with 0 volume removes exactly 6 materials, the same ones
 # listed in bad_struct_ids above

@@ -332,14 +332,22 @@ def increment_wbm_material_id(wbm_id: str) -> str:

 df_summary.index = df_summary.index.map(increment_wbm_material_id)  # format IDs
 # drop materials with id='None' and missing initial structures
-df_summary = df_summary.drop(index=[*nan_init_structs_ids, "None"])
+df_summary = df_summary.drop(index=[*nan_init_structs_ids, float("NaN")])

 # the 8403 material IDs in step 3 with final number larger than any of the ones in
 # bad_struct_ids are now misaligned between df_summary and df_wbm
 # the IDs in df_summary are consecutive while the IDs in df_wbm skip over the numbers in
 # bad_struct_ids. we fix this with fix_bad_struct_index_mismatch() by mapping the IDs in
 # df_wbm to the ones in df_summary so that both indices become consecutive.
 assert sum(df_summary.index != df_wbm.index) == 8403
+assert {*df_summary.index} - {*df_wbm.index} == {
+    "wbm-3-70803",
+    "wbm-3-70804",
+    "wbm-3-70826",
+    "wbm-3-70827",
+    "wbm-3-70829",
+    "wbm-3-70830",
+}


 def fix_bad_struct_index_mismatch(material_id: str) -> str:

@@ -559,7 +567,6 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 assert sum(df_wbm.index != df_summary.index) == 0

 e_form_col = "e_form_per_atom_uncorrected"
-assert e_form_col not in df_summary

 for row in tqdm(df_wbm.itertuples(), total=len(df_wbm)):
     mat_id, cse, formula = row.Index, row.cse, row.formula_from_cse

@@ -568,17 +575,21 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:

     entry_like = dict(composition=formula, energy=cse.uncorrected_energy)
     e_form = get_e_form_per_atom(entry_like)
-    e_form_ppd = ppd_mp.get_form_energy_per_atom(cse)
+    e_form_ppd = ppd_mp.get_form_energy_per_atom(cse) - cse.correction_per_atom

-    correction = cse.correction_per_atom
     # make sure the PPD.get_e_form_per_atom() and standalone get_e_form_per_atom()
     # method of calculating formation energy agree
     assert (
-        abs(e_form - (e_form_ppd - correction)) < 1e-4
-    ), f"{mat_id=}: {e_form=:.5} != {e_form_ppd - correction=:.5}"
+        abs(e_form - e_form_ppd) < 1e-4
+    ), f"{mat_id}: {e_form=:.3} != {e_form_ppd=:.3} (diff={e_form - e_form_ppd:.3}))"
     df_summary.at[cse.entry_id, e_form_col] = e_form


+df_summary[e_form_col.replace("uncorrected", "mp2020_corrected")] = (
+    df_summary[e_form_col] + df_summary["e_correction_per_atom_mp2020"]
+)
+
+
 # %%
 try:
     from aviary.wren.utils import get_aflow_label_from_spglib

@@ -623,17 +634,16 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")


-# %% read summary data from disk
-df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
-    "material_id"
-)
-
-
-# %% read WBM initial structures and computed structure entries from disk
-df_wbm = pd.read_json(
-    f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
-).set_index("material_id")
+# %% only here to load data quickly for later inspection
+if False:
+    df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
+        "material_id"
+    )
+    df_wbm = pd.read_json(
+        f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
+    ).set_index("material_id")

-df_wbm["cse"] = [
-    ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
-]
+    df_wbm["cse"] = [
+        ComputedStructureEntry.from_dict(x)
+        for x in tqdm(df_wbm.computed_structure_entry)
+    ]
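
The refactored assert encodes the identity that PatchedPhaseDiagram.get_form_energy_per_atom() operates on MP2020-corrected energies, so subtracting cse.correction_per_atom puts it on the same uncorrected footing as the standalone get_e_form_per_atom(). A toy sketch of the standalone calculation (composition and energy are made-up numbers, and the hand formula assumes the standard definition of formation energy per atom):

```python
from matbench_discovery.energy import get_e_form_per_atom, mp_elemental_ref_energies

# hypothetical entry: total uncorrected energy of a 2-atom NaCl cell in eV
entry_like = dict(composition="NaCl", energy=-7.0)
e_form = get_e_form_per_atom(entry_like)

# by hand: (E_total - sum of elemental reference energies) / n_atoms
refs = mp_elemental_ref_energies
expected = (-7.0 - (refs["Na"] + refs["Cl"])) / 2
assert abs(e_form - expected) < 1e-10, f"{e_form=} vs {expected=}"
```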
133 changes: 63 additions & 70 deletions matbench_discovery/data.py
@@ -1,17 +1,19 @@
 from __future__ import annotations

+import gzip
 import json
 import os
+import pickle
 import sys
 import urllib.error
-from collections.abc import Sequence
+import urllib.request
 from glob import glob
 from pathlib import Path
 from typing import Any, Callable

 import pandas as pd
-from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry
+from monty.json import MontyDecoder
+from pymatgen.analysis.phase_diagram import PatchedPhaseDiagram
 from tqdm import tqdm

 from matbench_discovery import FIGSHARE
@@ -40,13 +42,13 @@ def as_dict_handler(obj: Any) -> dict[str, Any] | None:
     # removes e.g. non-serializable AseAtoms from M3GNet relaxation trajectories


-def load_train_test(
-    data_names: str | Sequence[str],
+def load(
+    data_key: str,
     version: str = figshare_versions[-1],
     cache_dir: str | Path = default_cache_dir,
     hydrate: bool = False,
     **kwargs: Any,
-) -> pd.DataFrame:
+) -> pd.DataFrame | PatchedPhaseDiagram:
     """Download parts of or the full MP training data and WBM test data as pandas
     DataFrames. The full training and test sets are each about ~500 MB as compressed
     JSON which will be cached locally to cache_dir for faster re-loading unless

@@ -56,8 +58,8 @@ def load_train_test(
     see https://janosh.github.io/matbench-discovery/contribute#--direct-download.

     Args:
-        data_names (str | list[str], optional): Which parts of the MP/WBM data to load.
-            Can be any subset of set(DATA_FILES) or 'all'.
+        data_key (str): Which parts of the MP/WBM data to load. Must be one of
+            list(DATA_FILES).
         version (str, optional): Which version of the dataset to load. Defaults to
             latest version of data files published to Figshare. Pass any invalid version
            to see valid options.
@@ -71,77 +73,68 @@ def load_train_test(
            depending on which file is loaded.

     Raises:
-        ValueError: On bad version number or bad data names.
+        ValueError: On bad version number or bad data_key.

     Returns:
         pd.DataFrame: Single dataframe or dictionary of dfs if multiple data requested.
     """
     if version not in figshare_versions:
         raise ValueError(f"Unexpected {version=}. Must be one of {figshare_versions}.")
-    if data_names == "all":
-        data_names = list(DATA_FILES)
-    elif isinstance(data_names, str):
-        data_names = [data_names]
-
-    if missing := set(data_names) - set(DATA_FILES):
-        raise ValueError(f"{missing} must be subset of {set(DATA_FILES)}")
+    if not isinstance(data_key, str) or data_key not in DATA_FILES:
+        raise ValueError(f"Unknown {data_key=}, must be one of {list(DATA_FILES)}.")

     with open(f"{FIGSHARE}/{version}.json") as json_file:
         file_urls = json.load(json_file)

-    dfs = {}
-    for key in data_names:
-        file = DataFiles.__dict__[key]
-        csv_ext = (".csv", ".csv.gz", ".csv.bz2")
-        reader = pd.read_csv if file.endswith(csv_ext) else pd.read_json
-
-        cache_path = f"{cache_dir}/{file}"
-        if os.path.isfile(cache_path):  # load from disk cache
-            print(f"Loading {key!r} from cached file at {cache_path!r}")
-            df = reader(cache_path, **kwargs)
-        else:  # download from Figshare URL
-            # manually set compression since pandas can't infer from URL
-            if file.endswith(".gz"):
-                kwargs.setdefault("compression", "gzip")
-            elif file.endswith(".bz2"):
-                kwargs.setdefault("compression", "bz2")
-            url = file_urls[key]
-            print(f"Downloading {key!r} from {url}")
-            try:
-                df = reader(url, **kwargs)
-            except urllib.error.HTTPError as exc:
-                raise ValueError(f"Bad {url=}") from exc
-            except Exception:
-                print(f"\n\nvariable dump:\n{file=},\n{url=},\n{reader=},\n{kwargs=}")
-                raise
-            if cache_dir and not os.path.isfile(cache_path):
-                os.makedirs(os.path.dirname(cache_path), exist_ok=True)
-                if ".csv" in file:
-                    df.to_csv(cache_path, index=False)
-                elif ".json" in file:
-                    df.to_json(cache_path, default_handler=as_dict_handler)
-                else:
-                    raise ValueError(f"Unexpected file type {file}")
-                print(f"Cached {key!r} to {cache_path!r}")
-        if "material_id" in df:
-            df = df.set_index("material_id")
-        if hydrate:
-            for col in df:
-                if not isinstance(df[col].iloc[0], dict):
-                    continue
-                try:
-                    df[col] = [
-                        ComputedStructureEntry.from_dict(d)
-                        for d in tqdm(df[col], desc=col)
-                    ]
-                except Exception:
-                    df[col] = [Structure.from_dict(d) for d in tqdm(df[col], desc=col)]
-
-        dfs[key] = df
-
-    if len(data_names) == 1:
-        return dfs[data_names[0]]
-    return dfs
+    file = DataFiles.__dict__[data_key]
+
+    cache_path = f"{cache_dir}/{file}"
+    if not os.path.isfile(cache_path):  # download from Figshare URL
+        url = file_urls[data_key]
+        print(f"Downloading {data_key!r} from {url}")
+        try:
+            # ensure directory exists
+            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+            # download and save to disk
+            urllib.request.urlretrieve(url, cache_path)
+            print(f"Cached {data_key!r} to {cache_path!r}")
+        except urllib.error.HTTPError as exc:
+            raise ValueError(f"Bad {url=}") from exc
+        except Exception:
+            print(f"\n\nvariable dump:\n{file=},\n{url=}")
+            raise
+
+    print(f"Loading {data_key!r} from cached file at {cache_path!r}")
+    if ".pkl" in file:  # handle key='mp_patched_phase_diagram' separately
+        with gzip.open(cache_path, "rb") as zip_file:
+            return pickle.load(zip_file)
+
+    csv_ext = (".csv", ".csv.gz", ".csv.bz2")
+    reader = pd.read_csv if file.endswith(csv_ext) else pd.read_json
+    try:
+        df = reader(cache_path, **kwargs)
+    except Exception:
+        print(f"\n\nvariable dump:\n{file=},\n{reader=}\n{kwargs=}")
+        raise
+
+    if "material_id" in df:
+        df = df.set_index("material_id")
+    if hydrate:
+        for col in df:
+            if not isinstance(df[col].iloc[0], dict):
+                continue
+            try:
+                # convert dicts to pymatgen Structures and ComputedStructureEntrys
+                df[col] = [
+                    MontyDecoder().process_decoded(dct)
+                    for dct in tqdm(df[col], desc=col)
+                ]
+            except Exception:
+                print(f"\n\nvariable dump:\n{col=},\n{df[col]=}")
+                raise
+
+    return df


 def glob_to_df(
@@ -228,20 +221,20 @@ class DataFiles(Files):
     def _on_not_found(self, key: str, msg: str) -> None:  # type: ignore[override]
         msg += (
             " Would you like to download it now using matbench_discovery."
-            f"data.load_train_test({key!r}). This will cache the file for future use."
+            f"data.load({key!r}). This will cache the file for future use."
         )

         # default to 'y' if not in interactive session, and user can't answer
         answer = "" if sys.stdin.isatty() else "y"
         while answer not in ("y", "n"):
             answer = input(f"{msg} [y/n] ").lower().strip()
         if answer == "y":
-            load_train_test(key)  # download and cache data file
+            load(key)  # download and cache data file

     mp_computed_structure_entries = (
         "mp/2023-02-07-mp-computed-structure-entries.json.gz"
     )
-    mp_elemental_ref_entries = "mp/2022-09-19-mp-elemental-reference-entries.json"
+    mp_elemental_ref_entries = "mp/2023-02-07-mp-elemental-reference-entries.json.gz"
     mp_energies = "mp/2023-01-10-mp-energies.csv"
     mp_patched_phase_diagram = "mp/2023-02-07-ppd-mp.pkl.gz"
     wbm_computed_structure_entries = (

@@ -254,9 +247,9 @@ def _on_not_found(self, key: str, msg: str) -> None:  # type: ignore[override]
     wbm_summary = "wbm/2022-10-19-wbm-summary.csv"


-# data files can be downloaded and cached with matbench_discovery.data.load_train_test()
+# data files can be downloaded and cached with matbench_discovery.data.load()
 DATA_FILES = DataFiles()


-df_wbm = load_train_test("wbm_summary")
+df_wbm = load("wbm_summary")
 df_wbm["material_id"] = df_wbm.index
10 changes: 4 additions & 6 deletions matbench_discovery/energy.py
@@ -70,15 +70,15 @@ def get_elemental_ref_entries(

 # tested to agree with TRI's MP reference energies
 # https://github.com/TRI-AMDD/CAMD/blob/1c965cba636531e542f4821a555b98b2d81ed034/camd/utils/data.py#L134
-# fmt: off
 mp_elemental_ref_energies = {
-    "Ne": -0.0259, "He": -0.0091, "Ar": -0.0688, "F": -1.9115, "O": -4.948, "Cl": -1.8485, "N": -8.3365, "Kr": -0.0567, "Br": -1.6369, "I": -1.524, "Xe": -0.0362, "S": -4.1364, "Se": -3.4959, "C": -9.2268, "Au": -3.2739, "W": -12.9581, "Pb": -3.7126, "Rh": -7.3643, "Pt": -6.0709, "Ru": -9.2744, "Pd": -5.1799, "Os": -11.2274, "Ir": -8.8384, "H": -3.3927, "P": -5.4133, "As": -4.6591, "Mo": -10.8456, "Te": -3.1433, "Sb": -4.129, "B": -6.6794, "Bi": -3.89, "Ge": -4.623, "Hg": -0.3037, "Sn": -4.0096, "Ag": -2.8326, "Ni": -5.7801, "Tc": -10.3606, "Si": -5.4253, "Re": -12.4445, "Cu": -4.0992, "Co": -7.1083, "Fe": -8.47, "Ga": -3.0281, "In": -2.7517, "Cd": -0.9229, "Cr": -9.653, "Zn": -1.2597, "V": -9.0839, "Tl": -2.3626, "Al": -3.7456, "Nb": -10.1013, "Be": -3.7394, "Mn": -9.162, "Ti": -7.8955, "Ta": -11.8578, "Pa": -9.5147, "U": -11.2914, "Sc": -6.3325, "Np": -12.9478, "Zr": -8.5477, "Mg": -1.6003, "Th": -7.4139, "Hf": -9.9572, "Pu": -14.2678, "Lu": -4.521, "Tm": -4.4758, "Er": -4.5677, "Ho": -4.5824, "Y": -6.4665, "Dy": -4.6068, "Gd": -14.0761, "Eu": -10.292, "Sm": -4.7186, "Nd": -4.7681, "Pr": -4.7809, "Pm": -4.7505, "Ce": -5.9331, "Yb": -1.5396, "Tb": -4.6344, "La": -4.936, "Ac": -4.1212, "Ca": -2.0056, "Li": -1.9089, "Sr": -1.6895, "Na": -1.3225, "Ba": -1.919, "Rb": -0.9805, "K": -1.1104, "Cs": -0.8954,  # noqa: E501
+    elem: round(entry.energy_per_atom, 4)
+    for elem, entry in mp_elem_reference_entries.items()
 }
-# fmt: on


 def get_e_form_per_atom(
-    entry: EntryLike, elemental_ref_energies: dict[str, float] = None
+    entry: EntryLike,
+    elemental_ref_energies: dict[str, float] = mp_elemental_ref_energies,
 ) -> float:
     """Get the formation energy of a composition from a list of entries and a dict
     mapping elements to reference energies.
@@ -96,8 +96,6 @@ def get_e_form_per_atom(
     Returns:
         float: formation energy in eV/atom.
     """
-    elemental_ref_energies = elemental_ref_energies or mp_elemental_ref_energies
-
     if isinstance(entry, dict):
         energy = entry["energy"]
         comp = Composition(entry["composition"])  # is idempotent if already Composition
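
Binding the default directly to mp_elemental_ref_energies makes the old `elemental_ref_energies or ...` fallback dead code, hence its removal. Custom references can still be passed explicitly; a brief sketch (all numbers hypothetical):

```python
from matbench_discovery.energy import get_e_form_per_atom

entry = dict(composition="Fe2O3", energy=-38.5)  # hypothetical total energy in eV

e_form_mp = get_e_form_per_atom(entry)  # uses MP elemental reference energies

my_refs = {"Fe": -8.4, "O": -4.9}  # e.g. from your own DFT settings
e_form_custom = get_e_form_per_atom(entry, elemental_ref_energies=my_refs)
```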