In [None]:
# Requires umap-learn, pymatgen, ase, numpy, pandas and plotly

In [None]:
import umap
import numpy as np
import pandas as pd
from ase.atom import Atom
from pymatgen.core import Composition
# Display NumPy arrays with 4 digits of precision and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

In [None]:
# Prepare the elemental substitution matrix of
# https://www.nature.com/articles/s41524-020-00481-6

def _normalise_rows(matrix):
  """Scale every row of ``matrix`` to unit Euclidean (L2) norm.

  Rows whose norm is not strictly positive (e.g. all-zero rows) are returned
  as zeros instead of the NaN values and RuntimeWarning that a plain division
  would produce.
  """
  norms = np.linalg.norm(matrix, axis=1, keepdims=True)
  return np.divide(matrix, norms, out=np.zeros_like(matrix, dtype=float), where=norms > 0)


# Keep the entries at indices 1..98 of both files (the first row/column is dropped).
petiffor = np.loadtxt("substitution.dat")[1:99, 1:99]
element_count = np.loadtxt("substitution.dat.element_count")[1:99]

# Add the per-element counts to the diagonal, then keep only the lower
# triangle (diagonal included).
petiffor += np.diag(element_count)
petiffor = np.tril(petiffor, k=0)

# Normalise each row to unit Euclidean norm so that the entries can be
# interpreted as a measure of similarity.
petiffor = _normalise_rows(petiffor)

# To compensate for the incomplete information present in the ICSD (the
# off-diagonal components are underestimated with respect to the diagonal)
# modify the matrix elements by raising them to the power 1/2, followed by a
# renormalisation of the rows.
petiffor = _normalise_rows(petiffor**0.5)

In [None]:
# Element symbols for atomic numbers 1 to 98; they label both the rows and the columns.
element_symbols = [Atom(atomic_number).symbol for atomic_number in range(1, 99)]

# Wrap the matrix in a labelled DataFrame and replace any NaN entry by zero.
petiffor = pd.DataFrame(
    petiffor,
    index=element_symbols,
    columns=element_symbols,
).fillna(0)

In [None]:
# Function to calculate the compositional embeddings. Expects formulas in pymatgen.core.Composition
# and the elemental embeddings as pandas.DataFrame
def get_onehot_comp(composition, elemental_embeddings):
  """Embed a composition as the fraction-weighted sum of its elemental embeddings.

  Args:
    composition: object exposing ``fractional_composition.get_el_amt_dict()``
      (e.g. ``pymatgen.core.Composition``), which maps element symbols to
      their fractional amounts.
    elemental_embeddings: ``pandas.DataFrame`` with one row per element,
      indexed by element symbol.

  Returns:
    1-D ``numpy`` array with one entry per column of ``elemental_embeddings``.
    Elements of the composition that have no row in ``elemental_embeddings``
    do not contribute.
  """
  fractions = composition.fractional_composition.get_el_amt_dict()
  # Align the weights on the DataFrame's own row labels rather than on a
  # hard-coded list of atomic numbers: this keeps the weights matched to the
  # right rows whatever their order or number, and avoids rebuilding the
  # symbol list on every call.
  weights = np.array([fractions.get(symbol, 0) for symbol in elemental_embeddings.index])
  return weights @ elemental_embeddings.values

# Example for the Perovskites in Fig. 3

In [None]:
# Read the dataset (124k entries), taken from the Alexandria DB (https://alexandria.icams.rub.de)
perovskites = pd.read_csv("perovskites.csv")

# Parse every formula into a Composition, then embed each composition
# (takes ~15s in a Macbook Pro m1)
compositions = perovskites["formula_pretty"].apply(Composition)
perovskites["composition"] = compositions
perovskites["formula_embedding"] = compositions.apply(
    lambda comp: get_onehot_comp(comp, petiffor)
)

In [None]:
# Project the compositional embeddings to a low-dimensional space with UMAP
# (takes ~40s in a Macbook Pro m1)
n_components = 2
umap_columns = [f"umap{i}" for i in range(n_components)]
metadata_columns = ["mat_id", "formula_pretty", "bandgap", "e_above_hull", "is_magnetic"]

reducer = umap.UMAP(random_state=124, n_components=n_components)
formula_embeddings = perovskites["formula_embedding"].values.tolist()
embeddings = reducer.fit_transform(formula_embeddings)

# Put the material metadata and the UMAP coordinates side by side.
umap_frame = pd.DataFrame(embeddings, columns=umap_columns)
data = pd.concat([perovskites[metadata_columns], umap_frame], axis=1)

In [None]:
# For visualization purposes (color scale), cap ehull to 0.5 eV.
# Series.clip is vectorised (no Python-level call per row); NaN values stay NaN.
data["e_above_hull"] = data["e_above_hull"].clip(upper=0.5)

In [None]:
import plotly.express as px

# Columns made available to the hover box, in the order used by the template below.
hover_columns = ["mat_id", "formula_pretty", "bandgap", "is_magnetic"]
hover_template = "<br>".join([
    "Mat_id: %{customdata[0]}",
    "Formula: %{customdata[1]}",
    "Gap: %{customdata[2]}",
    "Magnetic: %{customdata[3]}",
])

# Scatter plot of the UMAP coordinates, coloured by the energy above the hull.
fig = px.scatter(
    data,
    x="umap0",
    y="umap1",
    color="e_above_hull",
    opacity=0.25,
    color_continuous_scale="RdYlGn_r",
    custom_data=hover_columns,
)
fig.update_traces(textposition="top center", hovertemplate=hover_template)
fig.update_layout(height=800)

fig.show()