In [None]:
# TODO:
# - presentation figures
# - %RMSE or MAE?

In [1]:
# Useful standard and scientific ML libraries
import os
import ase.io
import matplotlib.pyplot as plt
import numpy as np
import pyscf
import py3Dmol
import torch

# M-Stack packages
import equistore  # storage format for atomistic ML
import chemiscope  # interactive molecular visualization
import rascaline  # generating structural representations
import qstack  # quantum chemistry toolkit

import rholearn  # torch-based density learning
from rholearn import io, features, training, plots, predictor, pretraining, utils
from settings import RASCAL_HYPERS, DATA_SETTINGS, ML_SETTINGS

In [2]:
# Load the first `n_structures` water monomers from the dataset file
n_structures = 1000
xyz_path = os.path.join(DATA_SETTINGS["data_dir"], "water_monomers_1k.xyz")
frames = ase.io.read(xyz_path, index=f":{n_structures}")

# Per-structure geometric descriptors shown as chemiscope properties.
# Atom 0 is treated as the oxygen and atoms 1 and 2 as the hydrogens.
mean_oh_lengths = []
hoh_angles = []
for frame in frames:
    oh_lengths = [frame.get_distance(0, 1), frame.get_distance(0, 2)]
    mean_oh_lengths.append(np.mean(oh_lengths))
    hoh_angles.append(frame.get_angle(1, 0, 2))

# Interactive view of the molecules (kept as the last expression so it renders)
chemiscope.show(
    frames,
    properties={
        "Mean O-H bond length, Angstrom": mean_oh_lengths,
        "H-O-H angle, degrees": hoh_angles,
    },
)

ChemiscopeWidget(value='{"meta": {"name": " "}, "structures": [{"size": 3, "names": ["O", "H", "H"], "x": [0.0…

# Generate Equivariant Structural Representation

In [None]:
# Build the lambda-SOAP descriptor for every frame. This uses rascaline to
# compute a SphericalExpansion and takes roughly 25 seconds.
# NOTE: the name `input` shadows the Python builtin of the same name.
input = features.lambda_soap_vector(frames, RASCAL_HYPERS, even_parity_only=True)

# The block for l=5, Hydrogen is not included in the output electron density,
# so remove it from the descriptor
unused_keys = equistore.Labels(input.keys.names, np.array([[5, 1]]))
input = equistore.drop_blocks(input, keys=unused_keys)

# Persist the descriptor together with the hypers used to generate it
data_dir = DATA_SETTINGS["data_dir"]
equistore.save(os.path.join(data_dir, "lambda_soap.npz"), input)
io.pickle_dict(os.path.join(data_dir, "rascal_hypers.pickle"), RASCAL_HYPERS)

# Simulation Setup

### Train-Test Split

In [None]:
from equisolve.utils import split_data

# Read back the lambda-SOAP descriptor and the output electron densities, then
# verify that their samples and components metadata agree
data_dir = DATA_SETTINGS["data_dir"]
input = equistore.load(os.path.join(data_dir, "lambda_soap.npz"))
output = equistore.load(os.path.join(data_dir, "e_densities.npz"))
assert equistore.equal_metadata(input, output, check=["samples", "components"])

# Split descriptor and density together, using the grouping configured in
# DATA_SETTINGS, then unpack the three subsets of each tensor
split_tensors, grouped_labels = split_data(
    [input, output],
    axis=DATA_SETTINGS["axis"],
    names=DATA_SETTINGS["names"],
    n_groups=DATA_SETTINGS["n_groups"],
    group_sizes=DATA_SETTINGS["group_sizes"],
    seed=DATA_SETTINGS["seed"],
)
(in_train, in_test, in_val), (out_train, out_test, out_val) = split_tensors

# Map each output file name to the TensorMap it should contain
tm_files = {
    "in_train.npz": in_train,
    "in_test.npz": in_test,
    "out_train.npz": out_train,
    "out_test.npz": out_test,
    "in_val.npz": in_val,
    "out_val.npz": out_val,
}

# Write every subset into the data directory
for file_name in tm_files:
    equistore.save(os.path.join(data_dir, file_name), tm_files[file_name])

### Prepare Run Directory

In [None]:
# Make sure the run directory exists and pickle the ML settings into it
run_dir = ML_SETTINGS["run_dir"]
io.check_or_create_dir(run_dir)
io.pickle_dict(os.path.join(run_dir, "train_settings.pickle"), ML_SETTINGS)

# IMPORTANT: set the torch default dtype ahead of constructing the torch objects below
torch.set_default_dtype(ML_SETTINGS["torch"]["dtype"])

# Pre-construct the torch objects needed for training (i.e. models, loss functions)
pretraining.construct_torch_objects_in_train_dir(
    DATA_SETTINGS["data_dir"],
    run_dir,
    ML_SETTINGS,
)

# Model Training

In [None]:
# Training subdirectory, given relative to the run directory
train_rel_dir = ""
train_run_dir = os.path.join(ML_SETTINGS["run_dir"], train_rel_dir)
train_opts = ML_SETTINGS["training"]

# Fetch the training data along with the model, loss function and optimizer
data, model, loss_fn, optimizer = pretraining.load_training_objects(
    train_rel_dir,
    DATA_SETTINGS["data_dir"],
    ML_SETTINGS,
    train_opts["restart_epoch"],
)

# The data comes back as (train input, test input, train output, test output)
in_train, in_test, out_train, out_test = data

# Collect every argument of the training call in one place
train_kwargs = dict(
    in_train=in_train,
    out_train=out_train,
    in_test=in_test,
    out_test=out_test,
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    n_epochs=train_opts["n_epochs"],
    save_interval=train_opts["save_interval"],
    save_dir=train_run_dir,
    restart=train_opts["restart_epoch"],
)

# Run the training loop
print(f"\nTraining in subdirectory {train_run_dir}")
training.train(**train_kwargs)

# Model Analysis

In [None]:
# Load the train and test losses
losses = np.load(os.path.join(ML_SETTINGS["run_dir"], "losses.npz"))

# Plot both loss curves on log-log axes.
# The x values are 1-based on purpose: a log-scaled axis cannot represent
# x = 0, so plotting against the default 0-based index would leave the first
# recorded loss without a valid position on the plot.
# NOTE(review): the "Epoch" label assumes one loss entry per epoch - confirm
# against training.train.
fig, ax = plt.subplots()
for split, linestyle in (("train", "dashed"), ("test", "solid")):
    history = losses[split]
    epochs = np.arange(1, len(history) + 1)
    ax.loglog(
        epochs,
        history,
        label=f"linear, {split}",
        color="blue",
        linestyle=linestyle,
    )
ax.set_xlabel("Epoch")
ax.set_ylabel("MSE Loss")
ax.legend()

In [None]:
# Load the validation input (through io.load_tensormap_to_torch, with the
# torch settings from ML_SETTINGS) and the validation output TensorMap
data_dir = DATA_SETTINGS["data_dir"]
in_val = io.load_tensormap_to_torch(
    os.path.join(data_dir, "in_val.npz"), **ML_SETTINGS["torch"]
)
out_val = equistore.load(os.path.join(data_dir, "out_val.npz"))

# Take the first structure index found in the validation samples and read the
# corresponding frame from the dataset file
val_idx = equistore.unique_metadata(in_val, axis="samples", names="structure")[0][0]
val_frame = ase.io.read(os.path.join(data_dir, "water_monomers_1k.xyz"), index=val_idx)

# Build a pyscf Molecule from one (chemical symbol, position) pair per atom
val_atoms = list(zip(val_frame.get_chemical_symbols(), val_frame.positions))
val_mol = pyscf.gto.Mole().build(atom=val_atoms, basis="ccpvqz jkfit")

# Predict the density using the model saved under the "epoch_25" directory
saved_model_path = os.path.join(ML_SETTINGS["run_dir"], "epoch_25", "model.pt")
out_val_pred, coeffs = predictor.predict_density_from_mol(
    in_val,
    val_mol,
    model_path=saved_model_path,
    inv_means_path=os.path.join(data_dir, "inv_means.npz"),
)

# Delta density TensorMap: prediction minus the validation output
out_val_delta = equistore.subtract(out_val_pred, out_val)

In [None]:
# Load the invariant means a single time and reuse them for standardizing both
# the target and the predicted coefficients
inv_means = equistore.load(os.path.join(DATA_SETTINGS["data_dir"], "inv_means.npz"))

# Parity plot of standardized target vs. predicted coefficients, colored by
# the "spherical_harmonics_l" key
fig, ax = plots.parity_plot(
    target=utils.standardize_invariants(out_val, inv_means),
    predicted=utils.standardize_invariants(out_val_pred, inv_means),
    color_by="spherical_harmonics_l",
)

# Use identical limits and an equal aspect ratio on both axes
lim = [-0.07, 0.07]
ax.set_xlim(lim)
ax.set_ylim(lim)
ax.set_aspect("equal")
ax.set_xlabel("target density coefficient")
ax.set_ylabel("predicted density coefficient")
ax.legend()

# Density Visualization

In [None]:
# Vectorize the coefficients from each of the TensorMaps
new_key_names = ["spherical_harmonics_l", "element"]


def _tensormap_to_coeff_vector(tensor):
    """Return the coefficient vector of ``tensor`` for the molecule ``val_mol``.

    The "structure" name is dropped from the samples metadata and the keys are
    renamed to ``new_key_names`` before the tensor is handed to
    ``qstack.equio.tensormap_to_vector``.
    """
    return qstack.equio.tensormap_to_vector(
        val_mol,
        utils.rename_tensor(
            utils.drop_metadata_name(tensor, "samples", "structure"),
            keys_names=new_key_names,
        ),
    )


vect_coeffs_target = _tensormap_to_coeff_vector(out_val)
# NB: despite its name, this holds the *predicted* coefficients (out_val_pred)
vect_coeffs_input = _tensormap_to_coeff_vector(out_val_pred)
vect_coeffs_delta = _tensormap_to_coeff_vector(out_val_delta)

# Split the delta coefficients by sign; entries of the other sign are zeroed
pos_delta = [c if c >= 0 else 0 for c in vect_coeffs_delta]
neg_delta = [c if c < 0 else 0 for c in vect_coeffs_delta]

# Convert the basis function coefficients to cube files
plot_dir = os.path.join(ML_SETTINGS["run_dir"], "plots")
io.check_or_create_dir(plot_dir)
n = 80  # grid points per dimension

# The loop variables are deliberately NOT named `coeffs`, so that the `coeffs`
# returned by the prediction cell is not overwritten in the kernel namespace.
cube_jobs = [
    (vect_coeffs_target, "out_val.cube"),
    (vect_coeffs_input, "out_val_pred.cube"),
    (vect_coeffs_delta, "out_val_delta.cube"),
    (pos_delta, "out_val_delta_pos.cube"),
    (neg_delta, "out_val_delta_neg.cube"),
]
for cube_coeffs, cube_name in cube_jobs:
    qstack.fields.density2file.coeffs_to_cube(
        val_mol,
        cube_coeffs,
        os.path.join(plot_dir, cube_name),
        nx=n,
        ny=n,
        nz=n,
        resolution=None,
    )

## Predicted Electron Density

In [None]:
# Visualize the predicted density (the cube file written from out_val_pred)
pred_cube_path = os.path.join(plot_dir, "out_val_pred.cube")
v = py3Dmol.view(pred_cube_path)
v.setStyle({"stick": {}})

# Read the cube file inside a context manager so the file handle is closed
# as soon as its contents are in memory
with open(pred_cube_path, "r") as cube_file:
    pred_cube_data = cube_file.read()

v.addVolumetricData(
    pred_cube_data,
    "cube",
    {"isoval": 0.05, "color": "blue", "opacity": 0.8},
)
v.show()

## Visualize the prediction error (100x magnification)

### $\Delta \rho$ "delta electron density"

In [None]:
# Visualize the delta density (prediction minus target)
delta_cube_path = os.path.join(plot_dir, "out_val_delta.cube")
v = py3Dmol.view(delta_cube_path)
v.setStyle({"stick": {}})

# Read the cube file inside a context manager so the file handle is closed
# as soon as its contents are in memory
with open(delta_cube_path, "r") as cube_file:
    delta_cube_data = cube_file.read()

v.addVolumetricData(
    delta_cube_data,
    "cube",
    {"isoval": 0.0005, "color": "blue", "opacity": 0.8},
)
v.show()

In [None]:
# Visualize the delta density split by coefficient sign:
# blue = cube built from the non-negative delta coefficients,
# red  = cube built from the negative delta coefficients
v = py3Dmol.view(os.path.join(plot_dir, "out_val_delta.cube"))
v.setStyle({"stick": {}})

# NOTE(review): both surfaces use the same positive isovalue; confirm whether
# the negative part was meant to use -0.0005 instead.
signed_cubes = [
    ("out_val_delta_pos.cube", "blue"),
    ("out_val_delta_neg.cube", "red"),
]
for cube_name, surface_color in signed_cubes:
    # Context manager guarantees each file handle is closed after reading
    with open(os.path.join(plot_dir, cube_name), "r") as cube_file:
        cube_data = cube_file.read()
    v.addVolumetricData(
        cube_data,
        "cube",
        {"isoval": 0.0005, "color": surface_color, "opacity": 0.8},
    )
v.show()