In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
###########################################################################################
# Data parsing utilities
# Authors: Ilyes Batatia, Gregor Simm and David Kovacs
# This program is distributed under the MIT License (see MIT.md)
###########################################################################################

import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Sequence, Tuple

import ase.data
import ase.io
import h5py
import numpy as np

from mace.tools import AtomicNumberTable

# Type aliases. Every array alias is a plain ``np.ndarray`` at runtime; the
# trailing comment on each line records the shape the name is meant to convey.
Vector = np.ndarray  # [3,]
Positions = np.ndarray  # [..., 3]
Forces = np.ndarray  # [..., 3]
Stress = np.ndarray  # [6, ], [3,3], [9, ]
Virials = np.ndarray  # [6, ], [3,3], [9, ]
Charges = np.ndarray  # [..., 1]
Cell = np.ndarray  # [3,3]
Pbc = tuple  # (3,)

# Label used as the default ``config_type`` of a configuration.
DEFAULT_CONFIG_TYPE = "Default"
# Mapping that contains only the default config type, with value 1.0.
DEFAULT_CONFIG_TYPE_WEIGHTS = {DEFAULT_CONFIG_TYPE: 1.0}


@dataclass
class Configuration:
    """One atomic configuration with optional reference labels and loss weights.

    Only ``atomic_numbers`` and ``positions`` are required.  Every label
    defaults to ``None``, every weight to ``1.0`` and ``config_type`` to
    ``DEFAULT_CONFIG_TYPE``.

    The dataclass-generated ``__init__`` accepts only the fields declared
    below; arbitrary extra keyword arguments raise ``TypeError``.  Extra
    per-configuration data therefore has to be passed as a dictionary through
    ``additional_fields``.

    This class needs ``field`` (from ``dataclasses``) and ``Any`` (from
    ``typing``) in scope in addition to ``dataclass``, ``Dict`` and
    ``Optional``; without them the class body fails with ``NameError`` on a
    fresh kernel.
    """

    atomic_numbers: np.ndarray
    positions: Positions  # Angstrom
    energy: Optional[float] = None  # eV
    forces: Optional[Forces] = None  # eV/Angstrom
    stress: Optional[Stress] = None  # eV/Angstrom^3
    virials: Optional[Virials] = None  # eV
    dipole: Optional[Vector] = None  # Debye
    charges: Optional[Charges] = None  # atomic unit
    cell: Optional[Cell] = None
    pbc: Optional[Pbc] = None

    weight: float = 1.0  # weight of config in loss
    energy_weight: float = 1.0  # weight of config energy in loss
    forces_weight: float = 1.0  # weight of config forces in loss
    stress_weight: float = 1.0  # weight of config stress in loss
    virials_weight: float = 1.0  # weight of config virial in loss
    config_type: Optional[str] = DEFAULT_CONFIG_TYPE  # config_type of config

    # Free-form extras keyed by name.  ``default_factory`` gives every
    # instance its own dict instead of one shared mutable default.
    additional_fields: Dict[str, Any] = field(default_factory=dict)


Configurations = List[Configuration]

# The dataclass-generated __init__ accepts only declared fields, so ad-hoc
# keys such as "hi" must be passed inside `additional_fields`; passing them as
# keyword arguments raises TypeError.
Configuration([1], [2], additional_fields={"hi": 1})

In [13]:
from pathlib import Path
import numpy as np
import pandas as pd
from ase import Atoms
from ase.io import read, write

In [8]:
# Train + validation pool: structures (JSON) joined with the DFT results (CSV)
# on the shared "Name" column.
df_train_total = pd.read_json("dataset/gold-odac-2024/train/train.json")
df_train_total_energy = pd.read_csv(
    "dataset/gold-odac-2024/train/DFT-train_results.csv"
)
df_train_total = pd.merge(df_train_total, df_train_total_energy, on="Name")
print(df_train_total.shape[0])

# Test set: same join, kept in its own frame.
df_test_total = pd.read_json("dataset/gold-odac-2024/test/test.json")
df_test_total_energy = pd.read_csv("dataset/gold-odac-2024/test/DFT-test_results.csv")
df_test = pd.merge(df_test_total, df_test_total_energy, on="Name")
print(df_test.shape[0])

720
300


In [9]:
# Block-wise split: the pool is walked in consecutive blocks of `split_size`
# rows; the first `train_size` rows of each block go to training and the
# remaining `val_size` rows to validation.  Rows beyond the last complete
# block are not used.
train_size = 24
val_size = 12
split_size = train_size + val_size

# First row position of every complete block
n_blocks = len(df_train_total) // split_size
block_starts = [block * split_size for block in range(n_blocks)]

# Per-block slices, kept in their original order
train_splits = [
    df_train_total.iloc[start : start + train_size] for start in block_starts
]
val_splits = [
    df_train_total.iloc[start + train_size : start + split_size]
    for start in block_starts
]

# Stitch the slices back together with a fresh 0..n-1 index
df_train = pd.concat(train_splits, ignore_index=True)
df_val = pd.concat(val_splits, ignore_index=True)
print(len(df_train), len(df_val), len(df_test))

480 240 300


In [10]:
# Sanity check: number of distinct name prefixes (the text before the first
# "_" in "Name") expected in each split.
for split_df, n_expected in ((df_train, 40), (df_val, 20), (df_test, 25)):
    name_prefixes = {name.split("_")[0] for name in split_df["Name"]}
    assert len(name_prefixes) == n_expected
# Optional filter, currently disabled: keep only rows with DFT_E_int < 2 eV
# df_train = df_train[df_train["DFT_E_int"] < 2]
# df_val = df_val[df_val["DFT_E_int"] < 2]
# df_test = df_test[df_test["DFT_E_int"] < 2]
print(len(df_train), len(df_val), len(df_test))

480 240 300


In [16]:
def make_mace_dataset(df, save_path=None):
    """Convert every row of ``df`` into an ASE ``Atoms`` object.

    Each row must provide ``Structure`` (a mapping with ``numbers``,
    ``positions``, ``cell`` and ``pbc``), ``Tag``, ``Forces``, ``Name``,
    ``Group``, ``Metal`` and the ``DFT_E_total`` / ``DFT_E_mof`` /
    ``DFT_E_gas`` / ``DFT_E_int`` columns.

    Args:
        df: DataFrame with one structure per row.
        save_path: Optional output file.  When truthy, parent directories are
            created as needed and all structures are written to this path.

    Returns:
        The list of ``Atoms`` when ``save_path`` is falsy, otherwise ``None``.
    """
    # (atoms.info key, source column) pairs, in the order the keys are stored.
    # "energy" and "DFT_E_total" both carry the DFT_E_total value.
    info_columns = (
        ("energy", "DFT_E_total"),
        ("name", "Name"),
        ("group", "Group"),
        ("metal", "Metal"),
        ("DFT_E_total", "DFT_E_total"),
        ("DFT_E_mof", "DFT_E_mof"),
        ("DFT_E_gas", "DFT_E_gas"),
        ("DFT_E_int", "DFT_E_int"),
    )

    def row_to_atoms(row):
        # Build the structure, then attach tags, per-atom forces and metadata.
        structure = row["Structure"]
        atoms = Atoms(
            numbers=structure["numbers"],
            positions=structure["positions"],
            cell=structure["cell"],
            pbc=structure["pbc"],
        )
        atoms.set_tags(row["Tag"])
        atoms.arrays["forces"] = np.array(row["Forces"])
        for info_key, column in info_columns:
            atoms.info[info_key] = row[column]
        return atoms

    atoms_list = [row_to_atoms(row) for _, row in df.iterrows()]

    if not save_path:
        return atoms_list

    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    write(save_path, atoms_list)
    return None

In [17]:
# Write all three splits so that the cells reading train.xyz / val.xyz / test.xyz
# also work after Restart & Run All on a clean checkout (previously only the
# test file was regenerated; the train and val exports were commented out).
make_mace_dataset(df_train, "dataset/gold-odac-2024/v1/train.xyz")
make_mace_dataset(df_val, "dataset/gold-odac-2024/v1/val.xyz")
make_mace_dataset(df_test, "dataset/gold-odac-2024/v1/test.xyz")

In [3]:
# Number of structures stored in each exported file, as (train, val, test).
# NOTE(review): the recorded output lists 251 test structures although df_test
# had 300 rows when printed earlier; presumably test.xyz was written while the
# DFT_E_int filter was enabled -- confirm.
xyz_paths = (
    "dataset/gold-odac-2024/v1/train.xyz",
    "dataset/gold-odac-2024/v1/val.xyz",
    "dataset/gold-odac-2024/v1/test.xyz",
)
tuple(len(read(xyz_path, index=":")) for xyz_path in xyz_paths)

(480, 240, 251)

In [29]:
# Load the exported test structures and show which atoms of the first one
# carry tag 1.
test_xyz_path = "dataset/gold-odac-2024/v1/test.xyz"
atoms_list = read(test_xyz_path, index=":")
atoms = atoms_list[0]
tags = atoms.get_tags()
is_tag_one = tags == 1
atoms[is_tag_one].symbols

Symbols('CO2')