Run scripts in CI (#19)
* pyproject add [tool.setuptools.package-data] ../data/figshare/* (closes #16)

pyproject add optional-dependencies fetch-data
add url to install aviary from git

* add .github/workflows/run-scripts.yml to test a few scripts run error-free in CI

* make aviary optional in data/wbm/fetch_process_wbm_dataset.py

* add dep pymatviz

* run-scripts.yml continue-on-error: true

* I meant fail-fast: false

* editable install in run-scripts.yml

* install extra deps pymatviz[export-figs] to get kaleido

* add TMP_FIGS dir

looks like data/wbm/fetch_process_wbm_dataset.py might be getting auto-cancelled due to exceeding the download limit or something like that

just getting Error: The operation was canceled.
so let's test scripts/compile_metrics.py instead

* compile_metrics.py make dataframe_image optional

* run-scripts provide WANDB_API_KEY

* compile_metrics.py fix missing column access

* editable install in gh-pages.yml

* fix vite build ReferenceError: latest_figshare_release is not defined
janosh committed Apr 30, 2023
1 parent ebb57e1 commit 0da443f
Showing 8 changed files with 93 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gh-pages.yml
@@ -16,5 +16,5 @@ jobs:
pre-build: |
pip install lazydocs
# lazydocs needs package deps to be installed
pip install ..
pip install -e ..
python ../scripts/make_api_docs.py
34 changes: 34 additions & 0 deletions .github/workflows/run-scripts.yml
@@ -0,0 +1,34 @@
name: Run Scripts

on:
pull_request:
branches: [main]
push:
branches: [main]
workflow_dispatch:

jobs:
run:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
script:
- scripts/compile_metrics.py
- scripts/analyze_element_errors.py
steps:
- name: Check out repository
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.9

- name: Install package and dependencies
run: pip install -e .[fetch-data]

- name: Run script
run: python ${{ matrix.script }}
env:
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
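
For local debugging, a rough equivalent of the matrix above (a sketch, not part of the commit) is to loop over the same two scripts with WANDB_API_KEY already exported in the shell:

import subprocess

# mirror the CI matrix: one run per script; check=False lets the loop
# continue past a failing script, like fail-fast: false does for matrix jobs
for script in ("scripts/compile_metrics.py", "scripts/analyze_element_errors.py"):
    subprocess.run(["python", script], check=False)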
2 changes: 1 addition & 1 deletion data/wbm/eda.py
@@ -22,7 +22,7 @@
__date__ = "2023-03-30"

"""
WBM exploratory data analysis.
WBM exploratory data analysis (EDA).
Start with comparing MP and WBM elemental prevalence.
"""

48 changes: 27 additions & 21 deletions data/wbm/fetch_process_wbm_dataset.py
@@ -8,7 +8,6 @@

import numpy as np
import pandas as pd
from aviary.wren.utils import get_aflow_label_from_spglib
from pymatgen.analysis.phase_diagram import PatchedPhaseDiagram
from pymatgen.core import Composition, Structure
from pymatgen.entries.compatibility import (
@@ -58,6 +57,8 @@


# %%
os.makedirs(f"{module_dir}/raw", exist_ok=True)

for step, file_id in google_drive_ids.items():
file_path = f"{module_dir}/raw/wbm-structures-step-{step}.json.bz2"

@@ -579,26 +580,31 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:


# %%
df_init_struct = pd.read_json(
f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2"
).set_index("material_id")

wyckoff_col = "wyckoff_spglib"
if wyckoff_col not in df_init_struct:
df_init_struct[wyckoff_col] = None

for idx, struct in tqdm(
df_init_struct.initial_structure.items(), total=len(df_init_struct)
):
if not pd.isna(df_summary.at[idx, wyckoff_col]):
continue
try:
struct = Structure.from_dict(struct)
df_summary.at[idx, wyckoff_col] = get_aflow_label_from_spglib(struct)
except Exception as exc:
print(f"{idx=} {exc=}")

assert df_summary[wyckoff_col].isna().sum() == 0
try:
from aviary.wren.utils import get_aflow_label_from_spglib

df_init_struct = pd.read_json(
f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2"
).set_index("material_id")

wyckoff_col = "wyckoff_spglib"
if wyckoff_col not in df_init_struct:
df_init_struct[wyckoff_col] = None

for idx, struct in tqdm(
df_init_struct.initial_structure.items(), total=len(df_init_struct)
):
if not pd.isna(df_summary.at[idx, wyckoff_col]):
continue
try:
struct = Structure.from_dict(struct)
df_summary.at[idx, wyckoff_col] = get_aflow_label_from_spglib(struct)
except Exception as exc:
print(f"{idx=} {exc=}")

assert df_summary[wyckoff_col].isna().sum() == 0
except ImportError:
print("aviary not installed, skipping Wyckoff label generation")


# %% site-stats.json.gz was generated by scripts/compute_struct_fingerprints.py
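
The body of the google_drive_ids loop near the top of this file falls outside the shown hunks; since gdown is the new fetch-data extra, the download step presumably looks something like the sketch below (an assumption about the hidden code, with placeholder values for file_id and file_path):

import gdown  # provided by the new fetch-data optional dependency

file_id = "1AbCdEfG"  # placeholder Google Drive file id
file_path = "raw/wbm-structures-step-1.json.bz2"  # placeholder output path
# standard gdown pattern for fetching a Drive file by id
gdown.download(f"https://drive.google.com/uc?id={file_id}", file_path)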
3 changes: 2 additions & 1 deletion matbench_discovery/__init__.py
@@ -8,8 +8,9 @@
FIGS = f"{ROOT}/site/src/figs" # directory to store interactive figures
MODELS = f"{ROOT}/site/src/routes/models" # directory to write model analysis
FIGSHARE = f"{ROOT}/data/figshare"
TMP_FIGS = f"{ROOT}/tmp/figs" # directory to store temporary figures

for directory in [FIGS, MODELS, FIGSHARE]:
for directory in [FIGS, MODELS, FIGSHARE, TMP_FIGS]:
os.makedirs(directory, exist_ok=True)

# whether a currently running slurm job is in debug mode
2 changes: 1 addition & 1 deletion matbench_discovery/data.py
@@ -172,7 +172,7 @@ def __init__(
Args:
root (str, optional): Root directory used to absolufy every file path.
Defaults to '~/.cache/matbench-discovery/{latest_figshare_release}' where
Defaults to '~/.cache/matbench-discovery/[latest_figshare_release]' where
latest_figshare_release is e.g. 1.0.0. Can also be set through env var
MATBENCH_DISCOVERY_CACHE_DIR.
key_map (dict[str, str], optional): Mapping from attribute names to keys in
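
A minimal sketch of how the default described in this docstring could be resolved (assuming latest_figshare_release is a plain version string such as "1.0.0"; the class's actual logic may differ):

import os

latest_figshare_release = "1.0.0"  # placeholder version string
default_cache = os.path.expanduser(
    f"~/.cache/matbench-discovery/{latest_figshare_release}"
)
# the env var takes precedence over the default, as the docstring states
cache_dir = os.environ.get("MATBENCH_DISCOVERY_CACHE_DIR", default_cache)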
24 changes: 19 additions & 5 deletions pyproject.toml
@@ -33,14 +33,15 @@ classifiers = [
requires-python = ">=3.9"
dependencies = [
"matplotlib",
"pymatgen",
"numpy",
# output_formatting needed for pandas Stylers
# see https://github.com/pandas-dev/pandas/blob/main/pyproject.toml#L78
"pandas[output_formatting]",
"plotly",
"pymatgen",
"pymatviz[export-figs]",
"scikit-learn",
"scipy",
"plotly",
"tqdm",
"wandb",
]
@@ -52,11 +53,24 @@ Package = "https://pypi.org/project/matbench-discovery"

[project.optional-dependencies]
test = ["pytest", "pytest-cov", "pytest-markdown-docs"]
running-models = ["aviary", "m3gnet", "maml", "megnet"]
# how to specify git deps: https://stackoverflow.com/a/73572379
running-models = [
# torch needs to be installed before aviary
"torch",
"aviary@git+https://github.com/CompRhys/aviary",
"m3gnet",
"maml",
"megnet",
]
3d-structures = ["crystaltoolkit"]
fetch-data = ["gdown"]

[tool.setuptools.packages.find]
include = ["matbench_discovery*"]
exclude = ["tests", "tests.*"]

[tool.setuptools.packages]
find = { include = ["matbench_discovery*"], exclude = ["tests*"] }
[tool.setuptools.package-data]
matbench_discovery = ["../data/figshare/*"]

[tool.distutils.bdist_wheel]
universal = true
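
A quick way to confirm that the new [tool.setuptools.package-data] glob actually ships the figshare files in an installed wheel (a sketch, assuming the distribution is named matbench-discovery as the PyPI URL above suggests):

from importlib.metadata import files

# list every installed file whose recorded path mentions figshare;
# an empty list would mean the package-data glob matched nothing
figshare_files = [f for f in files("matbench-discovery") or [] if "figshare" in str(f)]
print(figshare_files)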
12 changes: 8 additions & 4 deletions scripts/compile_metrics.py
@@ -9,7 +9,6 @@
import re
from typing import Any

import dataframe_image as dfi
import numpy as np
import pandas as pd
import plotly.express as px
@@ -234,12 +233,17 @@
# hide_rows = list(set(df_metrics) - set(df_metrics.T.F1.nlargest(6).index))
# styler.hide(hide_rows) # show only the best models by F1 score
png_metrics = f"{ROOT}/tmp/figs/metrics-table.png"
dfi.export(styler, png_metrics, dpi=300)
print(f"{png_metrics=}")
try:
import dataframe_image

dataframe_image.export(styler, png_metrics, dpi=300)
except ImportError:
print("dataframe_image not installed, skipping png export")


# %% write model metrics to json for use by the website
df_stats["missing_preds"] = df_preds[list(df_metrics)].isna().sum()
in_both = [*set(df_metrics) & set(df_preds)]
df_stats["missing_preds"] = df_preds[in_both].isna().sum()
df_stats["missing_percent"] = [
f"{x / len(df_preds):.2%}" for x in df_stats.missing_preds
]
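
The last change guards against models that have metrics but no prediction column; a toy illustration (made-up frames, not the real data) of why intersecting the column sets avoids the missing-column error:

import pandas as pd

df_metrics = pd.DataFrame(columns=["model_a", "model_b"])  # toy stand-ins
df_preds = pd.DataFrame({"model_a": [1.0, None, 2.0]})

# df_preds[list(df_metrics)] would raise KeyError since model_b has no column
in_both = [*set(df_metrics) & set(df_preds)]
print(df_preds[in_both].isna().sum())  # missing predictions per shared model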
