In [1]:
from pathlib import Path

import ase
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from mace.calculators import mace_mp
from sklearn.neural_network import MLPClassifier

### Settings and loading data

In [None]:
# Settings
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
data_path = Path("../data/universal")

# Load data
# NOTE: allow_pickle=True unpickles arbitrary Python objects and can execute
# code while loading -- only use it on trusted files.
# Each .npy file stores a single pickled object; .item() unwraps it into the
# mapping that is later indexed by structure id.
lattice_data = np.load(data_path / "lattice.npy", allow_pickle=True).item()
atomic_numbers_data = np.load(
    data_path / "atomic_numbers.npy", allow_pickle=True
).item()
# The .npz archive is indexed by the same structure ids (see get_descriptor).
positions_data = np.load(data_path / "positions_800K.npz")
model = mace_mp(model="small", device=device, default_dtype="float32")

### Get MACE invariant features

In [None]:
def get_descriptor(mpid, li_atomic_number=3):
    """Compute species-averaged MACE descriptors for one structure.

    Builds a periodic ``ase.Atoms`` object from the stored lattice, atomic
    numbers and the first entry of ``positions_data[mpid]``, evaluates
    ``model.get_descriptors`` on it, and averages the per-atom descriptors
    separately over the Li atoms and over all remaining ("frame") atoms.

    Args:
        mpid: Key into ``lattice_data``, ``atomic_numbers_data`` and
            ``positions_data``.
        li_atomic_number: Atomic number of the species averaged under the
            ``"Li"`` key (default 3, lithium).

    Returns:
        dict with keys ``"Li"`` and ``"frame"``, each holding the mean
        descriptor over the corresponding atoms. A group that contains no
        atoms yields NaNs (NumPy's mean over an empty selection).
    """
    lattice = lattice_data[mpid]
    # Coerce to an array so the species mask below is element-wise even if the
    # stored atomic numbers are a plain Python sequence.
    atomic_numbers = np.asarray(atomic_numbers_data[mpid])
    # Index 0 selects the first stored configuration
    # (presumably the first frame of a trajectory -- TODO confirm).
    positions = positions_data[mpid][0]
    atoms = ase.Atoms(
        numbers=atomic_numbers, positions=positions, cell=lattice, pbc=True
    )
    desc = model.get_descriptors(atoms)

    # Compute the mask once and reuse its complement for the framework atoms.
    is_li = atomic_numbers == li_atomic_number
    return {
        "Li": np.mean(desc[is_li], axis=0),
        "frame": np.mean(desc[~is_li], axis=0),
    }


# One descriptor dict per structure, keyed by the same ids as `lattice_data`.
descriptor_data = {
    structure_id: get_descriptor(structure_id) for structure_id in lattice_data
}

### Create labels for the prior classifier

In [None]:
# Load training data
# ignore_index=True: every CSV is read with its own 0..n-1 index, so a plain
# concat would leave repeated index labels in the combined frame, which breaks
# label-based alignment and selection on it.
df = pd.concat(
    [pd.read_csv(data_path / f"train_{temp}K.csv") for temp in [600, 800, 1000, 1200]],
    ignore_index=True,
)

# Compute log10 MSD/t and binarize
df["log_msd_t_Li"] = np.log10(df["msd_t_Li"])
df["log_msd_t_frame"] = np.log10(df["msd_t_frame"])

# Class 0: log10(MSD/t) <= -1, class 1: log10(MSD/t) > -1.
# include_lowest=True keeps -inf (log10 of a zero MSD/t) in class 0; with the
# default open left edge such rows would get a NaN label, i.e. category
# code -1, which the classifier would treat as a third class.
label_bins = [-np.inf, -1.0, np.inf]
df["log_msd_t_Li_label"] = pd.cut(
    df["log_msd_t_Li"], bins=label_bins, labels=[0, 1], include_lowest=True
)
df["log_msd_t_frame_label"] = pd.cut(
    df["log_msd_t_frame"], bins=label_bins, labels=[0, 1], include_lowest=True
)

### Train the prior classifier

In [None]:
# Feature matrix for Li: the mean Li descriptor of each row's structure, with
# the `temp` column divided by 1000 appended as the last feature.
li_descriptors = np.array([descriptor_data[name]["Li"] for name in df["name"]])
scaled_temp = df["temp"].values[:, None] / 1000.0
X_train_Li = np.hstack([li_descriptors, scaled_temp])
# Integer codes of the binary Li label serve as classification targets.
y_train_Li = df["log_msd_t_Li_label"].cat.codes

# Fit the MLP prior classifier (fit() returns the estimator itself)
clf_Li = MLPClassifier(
    hidden_layer_sizes=(32, 16), max_iter=1000, random_state=42
).fit(X_train_Li, y_train_Li)

# Predict on the training rows and colour the log10 MSD/t histogram by the
# predicted prior class
y_train_pred_Li = clf_Li.predict(X_train_Li)
df["prior_Li"] = pd.Categorical(y_train_pred_Li)
sns.histplot(df, x="log_msd_t_Li", hue="prior_Li", bins=50, kde=True)

In [None]:
# Feature matrix for the framework: the mean non-Li descriptor of each row's
# structure, with the `temp` column divided by 1000 appended as the last feature.
frame_descriptors = np.array(
    [descriptor_data[name]["frame"] for name in df["name"]]
)
scaled_temp = df["temp"].values[:, None] / 1000.0
X_train_frame = np.hstack([frame_descriptors, scaled_temp])
# Integer codes of the binary frame label serve as classification targets.
y_train_frame = df["log_msd_t_frame_label"].cat.codes

# Fit the MLP prior classifier (fit() returns the estimator itself)
clf_frame = MLPClassifier(
    hidden_layer_sizes=(32, 16), max_iter=1000, random_state=42
).fit(X_train_frame, y_train_frame)

# Predict on the training rows and colour the log10 MSD/t histogram by the
# predicted prior class
y_train_pred_frame = clf_frame.predict(X_train_frame)
df["prior_frame"] = pd.Categorical(y_train_pred_frame)
sns.histplot(df, x="log_msd_t_frame", hue="prior_frame", bins=50, kde=True)

### Test the prior classifier
Note: We assume `msd_t` labels for the test set are available (just for the purpose of this notebook).

In [None]:
# Load the test split (one CSV per temperature).
# ignore_index=True: every CSV is read with its own 0..n-1 index, so a plain
# concat would leave repeated index labels in the combined frame, which breaks
# label-based alignment and selection on it.
df = pd.concat(
    [pd.read_csv(data_path / f"test_{temp}K.csv") for temp in [600, 800, 1000, 1200]],
    ignore_index=True,
)

# Compute log10 MSD/t and binarize
df["log_msd_t_Li"] = np.log10(df["msd_t_Li"])
df["log_msd_t_frame"] = np.log10(df["msd_t_frame"])

# Class 0: log10(MSD/t) <= -1, class 1: log10(MSD/t) > -1.
# include_lowest=True keeps -inf (log10 of a zero MSD/t) in class 0; with the
# default open left edge such rows would get a NaN label (category code -1).
label_bins = [-np.inf, -1.0, np.inf]
df["log_msd_t_Li_label"] = pd.cut(
    df["log_msd_t_Li"], bins=label_bins, labels=[0, 1], include_lowest=True
)
df["log_msd_t_frame_label"] = pd.cut(
    df["log_msd_t_frame"], bins=label_bins, labels=[0, 1], include_lowest=True
)

# Descriptors for test data: mean descriptor of each row's structure with the
# `temp` column divided by 1000 appended as the last feature (same layout as
# the training features).
X_test_Li = np.array([descriptor_data[mpid]["Li"] for mpid in df["name"]])
X_test_Li = np.hstack([X_test_Li, df["temp"].values[:, None] / 1000.0])
y_test_Li = df["log_msd_t_Li_label"].cat.codes
X_test_frame = np.array([descriptor_data[mpid]["frame"] for mpid in df["name"]])
X_test_frame = np.hstack([X_test_frame, df["temp"].values[:, None] / 1000.0])
y_test_frame = df["log_msd_t_frame_label"].cat.codes

# Predict the labels for the test data
y_test_pred_Li = clf_Li.predict(X_test_Li)
df["prior_Li"] = pd.Categorical(y_test_pred_Li)
y_test_pred_frame = clf_frame.predict(X_test_frame)
df["prior_frame"] = pd.Categorical(y_test_pred_frame)

# Use the figure-level displot so each distribution gets its own figure.
# Two axes-level histplot calls in one cell draw onto the same axes and
# overlay the Li and frame histograms on top of each other.
sns.displot(df, x="log_msd_t_Li", hue="prior_Li", bins=50, kde=True)
sns.displot(df, x="log_msd_t_frame", hue="prior_frame", bins=50, kde=True);

### Write prior class predictions back to the CSV files

In [36]:
# Add the predicted prior classes to every split/temperature CSV.
# Each file is read, extended with `prior_Li` / `prior_frame`, and written
# back to the same path (the original file is overwritten).
for split in ["train", "test"]:
    for temp in [600, 800, 1000, 1200]:
        csv_path = data_path / f"{split}_{temp}K.csv"
        df = pd.read_csv(csv_path)

        # Build the classifier inputs: mean descriptor per row plus the
        # `temp` column divided by 1000 as the last feature
        scaled_temp = df["temp"].values[:, None] / 1000.0
        X_test_Li = np.hstack(
            [np.array([descriptor_data[name]["Li"] for name in df["name"]]), scaled_temp]
        )
        X_test_frame = np.hstack(
            [np.array([descriptor_data[name]["frame"] for name in df["name"]]), scaled_temp]
        )

        # Attach the predicted prior classes as new columns
        df["prior_Li"] = clf_Li.predict(X_test_Li)
        df["prior_frame"] = clf_frame.predict(X_test_frame)

        # Persist the annotated table without the index column
        df.to_csv(csv_path, index=False)