Skip to content

Commit

Permalink
feat: interface thermal stability dataset (#411)
Browse files Browse the repository at this point in the history
* fix concurrent requests
  • Loading branch information
kjappelbaum committed Dec 12, 2022
1 parent 97fa791 commit 4d37dfc
Show file tree
Hide file tree
Showing 10 changed files with 281 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ jobs:
- name: Test with pytest
run: |
export RASPA_DIR=/usr/share/miniconda3/envs/test
pytest -n "auto" tests/
pytest -n "auto" --dist="loadgroup" tests/
4 changes: 4 additions & 0 deletions docs/source/api/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,9 @@ Structure datasets
:members:


.. automodule:: mofdscribe.datasets.thermal_stability_dataset
:members:


.. automodule:: mofdscribe.datasets.structuredataset
:members:
3 changes: 3 additions & 0 deletions docs/source/featurizers/host_guest/host_guest_aprdf.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
Guest-centered atomic-property labeled radial distribution function (APRDF)
............................................................................

This featurizer builds on the :ref:`APRDF` featurizer, but instead of using the
correlations between all atoms, it only considers the ones between the guest and all host atoms
(within some cutoff distance).

.. math::
Expand Down
2 changes: 1 addition & 1 deletion docs/source/references.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,4 @@ References
.. [Trappe] `Potoff, J. J.; Siepmann, J. I. Vapor–Liquid Equilibria of Mixtures Containing Alkanes, Carbon Dioxide, and Nitrogen. AIChE Journal 2001, 47 (7), 1676–1682. <https://doi.org/10.1002/aic.690470719>`_
.. [Varoquaux] `Varoquaux, G. Cross-Validation Failure: Small Sample Sizes Lead to Large Error Bars. NeuroImage 2018, 180, 68–77. <https://doi.org/10.1016/j.neuroimage.2017.06.061>`_
.. [Varoquaux] `Varoquaux, G. Cross-Validation Failure: Small Sample Sizes Lead to Large Error Bars. NeuroImage 2018, 180, 68–77. <https://doi.org/10.1016/j.neuroimage.2017.06.061>`_
3 changes: 1 addition & 2 deletions src/mofdscribe/datasets/core_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ class CoREDataset(AbstractStructureDataset):
The available labels are:
* 'pure_CO2_kH': Henry coefficient of CO2 obtained by Widom method in mol kg-1 Pa-1
* 'pure_CO2_widomHOA': Heat of adsorption of CO2 obtained by Widom method in
* 'pure_methane_kH': Henry coefficient of methane obtained by Widom method in mol kg-1 Pa-1
Expand Down Expand Up @@ -256,5 +255,5 @@ def citations(self) -> Tuple[str]:
"title = {Computation-Ready, Experimental Metal{\textendash}Organic Frameworks: "
"A Tool To Enable High-Throughput Screening of Nanoporous Crystals},"
"journal = {Chemistry of Materials}"
"}"
"}",
]
239 changes: 239 additions & 0 deletions src/mofdscribe/datasets/thermal_stability_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
# -*- coding: utf-8 -*-
"""Thermal Stability Dataset."""
import os
from typing import Collection, Optional, Tuple

import numpy as np
import pandas as pd
from loguru import logger

from mofdscribe.constants import MOFDSCRIBE_PYSTOW_MODULE
from mofdscribe.datasets.checks import check_all_file_exists, length_check
from mofdscribe.datasets.dataset import AbstractStructureDataset
from mofdscribe.datasets.utils import compress_dataset

__all__ = ["ThermalStabilityDataset"]


class ThermalStabilityDataset(AbstractStructureDataset):
    """Thermal stability for a subset of CoRE MOFs.

    Reproduced from [Nandy2022]_.

    Nandy et al. (2022) digitized traces from thermogravimetric analysis.
    The decomposition temperature they determined in this way is reported in
    `outputs.assigned_T_decomp`.

    To reduce the risk of data leakage, we (by default) also only keep one representative
    structure for a "base refcode" (i.e. the first six letters of a refcode).
    For instance, the base refcode for IGAHED001 is IGAHED. Structures with same
    base refcode but different refcodes are often different refinements, or measurements
    at different temperatures and hence chemically quite similar. For instance,
    the base refcode `UMODEH` would appear 21 times, `KEDJAG` 17 times, and `UMOYOM` 17 times
    in the CoRE dataset used by Moosavi et al.

    Additionally, we (by default) only keep one structure per "structure hash"
    which is an approximate graph-isomorphism check, assuming the VESTA bond thresholds
    for the derivation of the structure graph (e.g. the structure
    graph of ULOMAL occurs 59 times in the CoRE database used by Moosavi et al.).

    The years refer to the publication dates of the paper crossreferenced
    in the CSD entry of the structure.

    The available labels are:

        * `outputs.assigned_T_decomp`: Decomposition temperature in Kelvin.

    References::
        .. [Nandy2022] `Nandy, A.; Terrones, G.; Arunachalam, N.; Duan, C.;
            Kastner, D. W.; Kulik, H. J.
            MOFSimplify, Machine Learning Models with Extracted Stability Data
            of Three Thousand Metal–Organic Frameworks.
            Scientific Data 2022, 9 (1). <https://doi.org/10.1038/s41597-022-01181-0>`_
    """

    # Per-version download locations and the expected number of rows of the
    # raw (not yet de-duplicated) dataframe.
    _files = {
        "v0.0.1": {
            "df": "https://zenodo.org/record/7428485/files/data.json?download=1",
            "structures": "https://zenodo.org/record/7428485/files/structures.tar.gz?download=1",
            "expected_length": 2039,
        }
    }

    def __init__(
        self,
        version: str = "v0.0.1",
        drop_basename_duplicates: bool = True,
        drop_graph_duplicates: bool = True,
        subset: Optional[Collection[int]] = None,
        drop_nan: bool = False,
    ):
        """Construct an instance of the ThermalStabilityDataset.

        Args:
            version (str): version number to use.
                Defaults to "v0.0.1".
            drop_basename_duplicates (bool): If True, keep only one structure
                per CSD basename. Defaults to True.
            drop_graph_duplicates (bool): If True, keep only one structure
                per decorated graph hash. Defaults to True.
            subset (Collection[int], optional): positional indices of the structures
                to include. They are applied *after* duplicates and NaN rows have
                been dropped. Defaults to None.
            drop_nan (bool): If True, drop rows with NaN values in features or info
                columns (which include the hashes). Defaults to False.

        Raises:
            ValueError: If the provided version number is not available.
        """
        # Remember the filter settings so that `get_subset` can rebuild an
        # identically filtered dataset before applying the subset indices.
        self._drop_basename_duplicates = drop_basename_duplicates
        self._drop_nan = drop_nan
        self._drop_graph_duplicates = drop_graph_duplicates
        if version not in self._files:
            raise ValueError(
                f"Version {version} not available. Available versions: {list(self._files.keys())}"
            )
        self.version = version

        self._structure_dir = MOFDSCRIBE_PYSTOW_MODULE.ensure_untar(
            "thermal-stability",
            self.version,
            name="structures.tar.gz",
            url=self._files[version]["structures"],
        )

        self._df = pd.DataFrame(
            MOFDSCRIBE_PYSTOW_MODULE.ensure_json(
                "thermal-stability", self.version, name="data.json", url=self._files[version]["df"]
            )
        ).reset_index(drop=True)

        compress_dataset(self._df)

        # The length check runs on the raw data, before any filtering below.
        length_check(self._df, self._files[version]["expected_length"])

        if drop_basename_duplicates:
            old_len = len(self._df)
            self._df = self._df.drop_duplicates(subset=["info.basename"])
            logger.debug(
                f"Dropped {old_len - len(self._df)} duplicate basenames. New length {len(self._df)}"
            )
        if drop_graph_duplicates:
            old_len = len(self._df)
            self._df = self._df.drop_duplicates(subset=["info.decorated_graph_hash"])
            logger.debug(
                f"Dropped {old_len - len(self._df)} duplicate graphs. New length {len(self._df)}"
            )
        self._df = self._df.reset_index(drop=True)
        if drop_nan:
            nan_sensitive_columns = [
                c for c in self._df.columns if c.startswith(("features.", "info."))
            ]
            self._df = self._df.dropna(subset=nan_sensitive_columns).reset_index(drop=True)

        if subset is not None:
            self._df = self._df.iloc[subset]
            self._df = self._df.reset_index(drop=True)

        self._structures = [
            os.path.join(self._structure_dir, f + ".cif") for f in self._df["info.CoRE_name"]
        ]

        check_all_file_exists(self._structures)

        self._years = self._df["info.year"].values
        self._decorated_graph_hashes = self._df["info.decorated_graph_hash"].values
        self._undecorated_graph_hashes = self._df["info.undecorated_graph_hash"].values
        self._decorated_scaffold_hashes = self._df["info.decorated_scaffold_hash"].values
        self._undecorated_scaffold_hashes = self._df["info.undecorated_scaffold_hash"].values
        self._densities = self._df["info.density"].values
        # Materialize the column names as tuples. Generator expressions would be
        # exhausted after the first iteration, so e.g. a second call to
        # `get_labels` with the default labels would silently select no columns.
        self._labelnames = tuple(c for c in self._df.columns if c.startswith("outputs."))
        self._featurenames = tuple(c for c in self._df.columns if c.startswith("features."))
        self._infonames = tuple(c for c in self._df.columns if c.startswith("info."))

    def get_subset(self, indices: Collection[int]) -> "AbstractStructureDataset":
        """Get a subset of the dataset.

        The new dataset is built with the same version and filter settings as
        this one, so `indices` refer to the rows of this (filtered) dataset.

        Args:
            indices (Collection[int]): indices of the structures to include.

        Returns:
            AbstractStructureDataset: a new dataset containing only the structures
                specified by the indices.
        """
        return ThermalStabilityDataset(
            version=self.version,
            drop_basename_duplicates=self._drop_basename_duplicates,
            drop_graph_duplicates=self._drop_graph_duplicates,
            subset=indices,
            drop_nan=self._drop_nan,
        )

    @property
    def available_info(self) -> Tuple[str, ...]:
        """Names of the dataframe columns starting with ``info.``."""
        return self._infonames

    @property
    def available_features(self) -> Tuple[str, ...]:
        """Names of the dataframe columns starting with ``features.``."""
        return self._featurenames

    @property
    def available_labels(self) -> Tuple[str, ...]:
        """Names of the dataframe columns starting with ``outputs.``."""
        return self._labelnames

    def get_labels(
        self, idx: Collection[int], labelnames: Optional[Collection[str]] = None
    ) -> np.ndarray:
        """Return the label values for the selected rows.

        Args:
            idx (Collection[int]): positional indices of the rows to select.
            labelnames (Collection[str], optional): names of the label columns
                to return. If None, all available labels are used.
                Defaults to None.

        Returns:
            np.ndarray: values of the requested label columns for the rows in `idx`.
        """
        labelnames = labelnames if labelnames is not None else self._labelnames
        return self._df.iloc[idx][list(labelnames)].values

    @property
    def citations(self) -> Tuple[str]:
        """BibTeX entries of the works this dataset is based on.

        NOTE(review): a list is returned although the annotation says tuple;
        the return type is kept as-is because callers may rely on it.
        """
        # Strings containing LaTeX commands are raw strings: in a normal string
        # literal "\t" (as in "\textendash") would be turned into a TAB.
        return [
            "@article{Chung2019,"
            "doi = {10.1021/acs.jced.9b00835},"
            "url = {https://doi.org/10.1021/acs.jced.9b00835},"
            "year = {2019},"
            "month = nov,"
            "publisher = {American Chemical Society ({ACS})},"
            "volume = {64},"
            "number = {12},"
            "pages = {5985--5998},"
            "author = {Yongchul G. Chung and Emmanuel Haldoupis and Benjamin J. Bucior "
            "and Maciej Haranczyk and Seulchan Lee and Hongda Zhang and "
            "Konstantinos D. Vogiatzis and Marija Milisavljevic and Sanliang Ling "
            "and Jeffrey S. Camp and Ben Slater and J. Ilja Siepmann and "
            "David S. Sholl and Randall Q. Snurr},"
            "title = {Advances, Updates, and Analytics for the Computation-Ready, "
            r"Experimental Metal{\textendash}Organic Framework Database: {CoRE} {MOF} 2019},"
            r"journal = {Journal of Chemical {\&} Engineering Data}"
            "}",
            "@article{Chung2014,"
            "doi = {10.1021/cm502594j},"
            "url = {https://doi.org/10.1021/cm502594j},"
            "year = {2014},"
            "month = oct,"
            "publisher = {American Chemical Society ({ACS})},"
            "volume = {26},"
            "number = {21},"
            "pages = {6185--6192},"
            "author = {Yongchul G. Chung and Jeffrey Camp and "
            "Maciej Haranczyk and Benjamin J. Sikora and Wojciech Bury "
            "and Vaiva Krungleviciute and Taner Yildirim and Omar K. Farha "
            "and David S. Sholl and Randall Q. Snurr},"
            r"title = {Computation-Ready, Experimental Metal{\textendash}Organic Frameworks: "
            "A Tool To Enable High-Throughput Screening of Nanoporous Crystals},"
            "journal = {Chemistry of Materials}"
            "}",
            "@article{Nandy_2022,"
            "doi = {10.1038/s41597-022-01181-0},"
            "url = {https://doi.org/10.1038%2Fs41597-022-01181-0},"
            "year = 2022,"
            "month = {mar},"
            "publisher = {Springer Science and Business Media {LLC}},"
            "volume = {9},"
            "number = {1},"
            "author = {Aditya Nandy and Gianmarco Terrones and "
            "Naveen Arunachalam and Chenru Duan and David W. Kastner and Heather J. Kulik},"
            "title = {{MOFSimplify}, machine learning models with extracted stability data "
            r"of three thousand metal{\textendash}organic frameworks},"
            "journal = {Sci Data}"
            "}",
        ]
2 changes: 1 addition & 1 deletion tests/bench/test_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def predict(self, idx, structures):
x = np.array([self.featurize(s) for s in structures]).reshape(-1, 1)
return self.model.predict(x)


@pytest.mark.xdist_group(name="core-ds")
def test_mofbench(tmp_path_factory):
"""Test the MOFBench class."""
ds = CoREDataset()
Expand Down
27 changes: 27 additions & 0 deletions tests/datasets/test_thermal_stability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
"""Test thermal stability dataset."""
import numpy as np
import pytest
from pymatgen.core import IStructure

from mofdscribe.datasets.thermal_stability_dataset import ThermalStabilityDataset


@pytest.mark.xdist_group(name="thermal-stability-ds")
def test_thermal_stability():
    """Check item access and subsetting of the ThermalStabilityDataset."""
    full_ds = ThermalStabilityDataset()

    # Accessors return the expected container/element types.
    second_structure = list(full_ds.get_structures([1]))[0]
    assert isinstance(second_structure, IStructure)
    assert isinstance(full_ds.get_labels([1]), np.ndarray)
    assert isinstance(full_ds.get_years([1]), np.ndarray)

    # The number of returned years matches the number of requested indices.
    assert len(full_ds.get_years([1])) == 1
    assert len(full_ds.get_years([1, 2, 4])) == 3

    # make sure we can get a subset of the dataset
    picked = [1, 2, 3, 8]
    sub_ds = full_ds.get_subset(picked)
    assert isinstance(sub_ds, ThermalStabilityDataset)
    assert len(sub_ds._df) == 4

    # The subset holds the same entries, in the same order, as the picked rows.
    expected_basenames = full_ds._df.iloc[picked]["info.basename"].values
    actual_basenames = sub_ds._df["info.basename"].values
    assert (expected_basenames == actual_basenames).all()

    # Row 1 of the full dataset is row 0 of the subset.
    from_full = list(full_ds.get_structures([1]))[0]
    from_subset = list(sub_ds.get_structures([0]))[0]
    assert from_full == from_subset
6 changes: 3 additions & 3 deletions tests/metrics/test_adverserial.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
"""Test the adversarial validation."""

import pytest
import numpy as np

from mofdscribe.datasets import CoREDataset
from mofdscribe.metrics.adverserial import AdverserialValidator
from mofdscribe.splitters import DensitySplitter, HashSplitter


@pytest.mark.xdist_group(name="core-ds")
def test_adverserial_validator():
"""Test the adverserial validation"""
dataset = CoREDataset()
Expand All @@ -25,7 +25,7 @@ def test_adverserial_validator():
assert len(score) == 5
assert np.abs(score.mean() - 0.5) < 0.25


@pytest.mark.xdist_group(name="core-ds")
def test_adverserial_validator_with_different_dist():
"""We use the DensitySplitter to create different distributions.
Expand Down
2 changes: 1 addition & 1 deletion tests/splitters/test_splitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def test_kennard_stone_splitter():
assert len(splits) == 3
assert len(splits[0]) > len(splits[1]) > len(splits[2])


@pytest.mark.xdist_group(name="core-ds")
def test_cluster_splitter():
"""Ensure that the splits add up to the total number of structures and are non-overlapping."""
ds = CoREDataset()
Expand Down

0 comments on commit 4d37dfc

Please sign in to comment.