Run scripts in CI (#19)
* pyproject add [tool.setuptools.package-data] ../data/figshare/* (closes #16)

pyproject add optional-dependencies fetch-data
add url to install aviary from git

* add .github/workflows/run-scripts.yml to test a few scripts run error-free in CI

* make aviary optional in data/wbm/fetch_process_wbm_dataset.py

* add dep pymatviz

* run-scripts.yml continue-on-error: true

* I meant fail-fast: false

* editable install in run-scripts.yml

* install extra deps pymatviz[export-figs] to get kaleido

* add TMP_FIGS dir

looks like data/wbm/fetch_process_wbm_dataset.py might be getting auto-cancelled due to exceeding the download limit or something like that

just getting Error: The operation was canceled.
so let's test scripts/compile_metrics.py instead

* compile_metrics.py make dataframe_image optional

* run-scripts provide WANDB_API_KEY

* compile_metrics.py fix missing column access

* editable install in gh-pages.yml

* fix vite build ReferenceError: latest_figshare_release is not defined
janosh committed Apr 30, 2023
1 parent ebb57e1 commit 0da443f
Showing 8 changed files with 93 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gh-pages.yml
@@ -16,5 +16,5 @@ jobs:
pre-build: |
pip install lazydocs
# lazydocs needs package deps to be installed
pip install ..
pip install -e ..
python ../scripts/make_api_docs.py
34 changes: 34 additions & 0 deletions .github/workflows/run-scripts.yml
@@ -0,0 +1,34 @@
name: Run Scripts

on:
pull_request:
branches: [main]
push:
branches: [main]
workflow_dispatch:

jobs:
run:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
script:
- scripts/compile_metrics.py
- scripts/analyze_element_errors.py
steps:
- name: Check out repository
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.9

- name: Install package and dependencies
run: pip install -e .[fetch-data]

- name: Run script
run: python ${{ matrix.script }}
env:
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
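
For local debugging, a rough equivalent of the matrix above (a sketch, not part of the commit) is to loop over the same two scripts with WANDB_API_KEY already exported in the shell:

import subprocess

# mirror the CI matrix: one run per script; check=False lets the loop
# continue past a failing script, like fail-fast: false does for matrix jobs
for script in ("scripts/compile_metrics.py", "scripts/analyze_element_errors.py"):
    subprocess.run(["python", script], check=False)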
2 changes: 1 addition & 1 deletion data/wbm/eda.py
@@ -22,7 +22,7 @@
__date__ = "2023-03-30"

"""
WBM exploratory data analysis.
WBM exploratory data analysis (EDA).
Start with comparing MP and WBM elemental prevalence.
"""

48 changes: 27 additions & 21 deletions data/wbm/fetch_process_wbm_dataset.py
@@ -8,7 +8,6 @@

import numpy as np
import pandas as pd
from aviary.wren.utils import get_aflow_label_from_spglib
from pymatgen.analysis.phase_diagram import PatchedPhaseDiagram
from pymatgen.core import Composition, Structure
from pymatgen.entries.compatibility import (
@@ -58,6 +57,8 @@


# %%
os.makedirs(f"{module_dir}/raw", exist_ok=True)

for step, file_id in google_drive_ids.items():
file_path = f"{module_dir}/raw/wbm-structures-step-{step}.json.bz2"

@@ -579,26 +580,31 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:


# %%
df_init_struct = pd.read_json(
f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2"
).set_index("material_id")

wyckoff_col = "wyckoff_spglib"
if wyckoff_col not in df_init_struct:
df_init_struct[wyckoff_col] = None

for idx, struct in tqdm(
df_init_struct.initial_structure.items(), total=len(df_init_struct)
):
if not pd.isna(df_summary.at[idx, wyckoff_col]):
continue
try:
struct = Structure.from_dict(struct)
df_summary.at[idx, wyckoff_col] = get_aflow_label_from_spglib(struct)
except Exception as exc:
print(f"{idx=} {exc=}")

assert df_summary[wyckoff_col].isna().sum() == 0
try:
from aviary.wren.utils import get_aflow_label_from_spglib

df_init_struct = pd.read_json(
f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2"
).set_index("material_id")

wyckoff_col = "wyckoff_spglib"
if wyckoff_col not in df_init_struct:
df_init_struct[wyckoff_col] = None

for idx, struct in tqdm(
df_init_struct.initial_structure.items(), total=len(df_init_struct)
):
if not pd.isna(df_summary.at[idx, wyckoff_col]):
continue
try:
struct = Structure.from_dict(struct)
df_summary.at[idx, wyckoff_col] = get_aflow_label_from_spglib(struct)
except Exception as exc:
print(f"{idx=} {exc=}")

assert df_summary[wyckoff_col].isna().sum() == 0
except ImportError:
print("aviary not installed, skipping Wyckoff label generation")


# %% site-stats.json.gz was generated by scripts/compute_struct_fingerprints.py
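
The body of the google_drive_ids loop near the top of this file falls outside the shown hunks; since gdown is the new fetch-data extra, the download step presumably looks something like the sketch below (an assumption about the hidden code, with placeholder values for file_id and file_path):

import gdown  # provided by the new fetch-data optional dependency

file_id = "1AbCdEfG"  # placeholder Google Drive file id
file_path = "raw/wbm-structures-step-1.json.bz2"  # placeholder output path
# standard gdown pattern for fetching a Drive file by id
gdown.download(f"https://drive.google.com/uc?id={file_id}", file_path)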
3 changes: 2 additions & 1 deletion matbench_discovery/__init__.py
@@ -8,8 +8,9 @@
FIGS = f"{ROOT}/site/src/figs" # directory to store interactive figures
MODELS = f"{ROOT}/site/src/routes/models" # directory to write model analysis
FIGSHARE = f"{ROOT}/data/figshare"
TMP_FIGS = f"{ROOT}/tmp/figs" # directory to store temporary figures

for directory in [FIGS, MODELS, FIGSHARE]:
for directory in [FIGS, MODELS, FIGSHARE, TMP_FIGS]:
os.makedirs(directory, exist_ok=True)

# whether a currently running slurm job is in debug mode
2 changes: 1 addition & 1 deletion matbench_discovery/data.py
@@ -172,7 +172,7 @@ def __init__(
Args:
root (str, optional): Root directory used to absolufy every file path.
Defaults to '~/.cache/matbench-discovery/{latest_figshare_release}' where
Defaults to '~/.cache/matbench-discovery/[latest_figshare_release]' where
latest_figshare_release is e.g. 1.0.0. Can also be set through env var
MATBENCH_DISCOVERY_CACHE_DIR.
key_map (dict[str, str], optional): Mapping from attribute names to keys in
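
A minimal sketch of how the default described in this docstring could be resolved (assuming latest_figshare_release is a plain version string such as "1.0.0"; the class's actual logic may differ):

import os

latest_figshare_release = "1.0.0"  # placeholder version string
default_cache = os.path.expanduser(
    f"~/.cache/matbench-discovery/{latest_figshare_release}"
)
# the env var takes precedence over the default, as the docstring states
cache_dir = os.environ.get("MATBENCH_DISCOVERY_CACHE_DIR", default_cache)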
24 changes: 19 additions & 5 deletions pyproject.toml
@@ -33,14 +33,15 @@ classifiers = [
requires-python = ">=3.9"
dependencies = [
"matplotlib",
"pymatgen",
"numpy",
# output_formatting needed for pandas Stylers
# see https://github.com/pandas-dev/pandas/blob/main/pyproject.toml#L78
"pandas[output_formatting]",
"plotly",
"pymatgen",
"pymatviz[export-figs]",
"scikit-learn",
"scipy",
"plotly",
"tqdm",
"wandb",
]
@@ -52,11 +53,24 @@ Package = "https://pypi.org/project/matbench-discovery"

[project.optional-dependencies]
test = ["pytest", "pytest-cov", "pytest-markdown-docs"]
running-models = ["aviary", "m3gnet", "maml", "megnet"]
# how to specify git deps: https://stackoverflow.com/a/73572379
running-models = [
# torch needs to be installed before aviary
"torch",
"aviary@git+https://github.com/CompRhys/aviary",
"m3gnet",
"maml",
"megnet",
]
3d-structures = ["crystaltoolkit"]
fetch-data = ["gdown"]

[tool.setuptools.packages.find]
include = ["matbench_discovery*"]
exclude = ["tests", "tests.*"]

[tool.setuptools.packages]
find = { include = ["matbench_discovery*"], exclude = ["tests*"] }
[tool.setuptools.package-data]
matbench_discovery = ["../data/figshare/*"]

[tool.distutils.bdist_wheel]
universal = true
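
A quick way to confirm that the new [tool.setuptools.package-data] glob actually ships the figshare files in an installed wheel (a sketch, assuming the distribution is named matbench-discovery as the PyPI URL above suggests):

from importlib.metadata import files

# list every installed file whose recorded path mentions figshare;
# an empty list would mean the package-data glob matched nothing
figshare_files = [f for f in files("matbench-discovery") or [] if "figshare" in str(f)]
print(figshare_files)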
12 changes: 8 additions & 4 deletions scripts/compile_metrics.py
@@ -9,7 +9,6 @@
import re
from typing import Any

import dataframe_image as dfi
import numpy as np
import pandas as pd
import plotly.express as px
@@ -234,12 +233,17 @@
# hide_rows = list(set(df_metrics) - set(df_metrics.T.F1.nlargest(6).index))
# styler.hide(hide_rows) # show only the best models by F1 score
png_metrics = f"{ROOT}/tmp/figs/metrics-table.png"
dfi.export(styler, png_metrics, dpi=300)
print(f"{png_metrics=}")
try:
import dataframe_image

dataframe_image.export(styler, png_metrics, dpi=300)
except ImportError:
print("dataframe_image not installed, skipping png export")


# %% write model metrics to json for use by the website
df_stats["missing_preds"] = df_preds[list(df_metrics)].isna().sum()
in_both = [*set(df_metrics) & set(df_preds)]
df_stats["missing_preds"] = df_preds[in_both].isna().sum()
df_stats["missing_percent"] = [
f"{x / len(df_preds):.2%}" for x in df_stats.missing_preds
]
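
The last change guards against models that have metrics but no prediction column; a toy illustration (made-up frames, not the real data) of why intersecting the column sets avoids the missing-column error:

import pandas as pd

df_metrics = pd.DataFrame(columns=["model_a", "model_b"])  # toy stand-ins
df_preds = pd.DataFrame({"model_a": [1.0, None, 2.0]})

# df_preds[list(df_metrics)] would raise KeyError since model_b has no column
in_both = [*set(df_metrics) & set(df_preds)]
print(df_preds[in_both].isna().sum())  # missing predictions per shared model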
