# Analyze relationships between descriptors

In [None]:
from madas import MaterialsDatabase
from madas.apis.NOMAD_web_API import API as web_API

from processing_functions import get_dos_values, get_dos_energies

In [None]:
# Search filter passed to `db.fill_database`: both clauses of the top-level
# "and" list have to match.
query = {
    "and": [
        {
            # gte 3 and lte 3 together select exactly three elements
            "results.material.n_elements": {"gte": 3, "lte": 3},
            "results.material.symmetry.structure_name:any": ["cubic perovskite"],
            "results.method.simulation.program_name:any": ["VASP"],
            "external_db:any": ["AFLOW"],
            # the electronic DOS must be among the available properties
            "results.properties.available_properties:all": ["dos_electronic"],
        },
        {
            "quantities:all": ["results.method.simulation.program_name"],
        },
    ],
}

In [None]:
# Start from the processing mapping of a default web API instance
processing = web_API().processing
# Drop the "archive" entry
# NOTE(review): presumably to avoid handling the full archive for every entry — confirm
processing.pop("archive")
# Register the DOS helpers imported from `processing_functions` under their own keys
processing["electronic_dos_values"] = get_dos_values
processing["electronic_dos_energies"] = get_dos_energies

In [None]:
# Bind the database to its file and to a web API that uses the customised
# `processing` mapping.
db = MaterialsDatabase(
    filename="AFLOW_cubic_perovskites.db",
    api=web_API(processing=processing),
)

In [None]:
# Fill the database with the entries matching `query`;
# some downloads will fail, but the majority are available
db.fill_database(query)

In [None]:
# Show the number of entries held by the database (displayed as the cell output)
len(db)

## Generate fingerprints

In [None]:
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

from madas import Fingerprint

# First we define how the similarity between two fingerprints can be calculated.
# Here, we use a Gaussian (RBF) kernel between two SOAP vectors as a metric.
# To ensure numerical stability, the result is rounded to 12 decimal places.
def SOAP_similarity(fp1, fp2, gamma=1e-9):
    """Return the RBF-kernel similarity of two SOAP fingerprints.

    Args:
        fp1: Fingerprint whose ``data["coef"]`` holds a SOAP coefficient vector.
        fp2: Second fingerprint, with the same kind of ``data["coef"]`` entry.
        gamma: Passed through to ``rbf_kernel`` as its ``gamma`` parameter.

    Returns:
        The single kernel value, rounded to 12 decimal places.
    """
    # rbf_kernel works on 2D inputs, so each vector is wrapped as a single row.
    row_a = [fp1.data["coef"]]
    row_b = [fp2.data["coef"]]
    kernel_matrix = rbf_kernel(row_a, row_b, gamma=gamma)
    # One row against one row gives a 1x1 matrix; take its only element.
    return np.round(kernel_matrix[0][0], 12)

# Define a new Fingerprint class
class SOAPFingerprint(Fingerprint):
    """Fingerprint of type "SOAP" whose data is produced by a `creator` object.

    The descriptor is computed on a copy of the material's atoms in which
    every atomic number is set to 1; the coefficients are stored under the
    data key "coef".
    """

    # The object that computes the descriptor has to be handed to the
    # fingerprint, which is why `__init__` is overridden.
    def __init__(self,
                 name=None,
                 similarity_function=SOAP_similarity,
                 pass_on_exceptions=True,
                 creator=None) -> None:
        """Set up the fingerprint defaults and remember the descriptor creator.

        Args:
            name: Forwarded to the base class.
            similarity_function: Forwarded to the base class; defaults to
                `SOAP_similarity`.
            pass_on_exceptions: Forwarded to the base class.
            creator: Object providing a `create(atoms)` method (here the
                `SOAP` object).
                NOTE(review): with the default `None`, `from_material` cannot
                call `.create` — confirm how `pass_on_exceptions` handles that.
        """
        # Fix the fingerprint type and forward the remaining defaults
        super().__init__(
            fp_type="SOAP",
            name=name,
            similarity_function=similarity_function,
            pass_on_exceptions=pass_on_exceptions,
        )
        # Keep the descriptor object for use in `from_material`
        self.creator = creator

    # Every fingerprint needs a `from_material` method.
    def from_material(self, material):
        """Compute the descriptor for `material`, store it and return `self`."""
        # Record which material this fingerprint belongs to
        self.set_mid(material)
        # Work on a copy so that `material.atoms` itself is not modified
        structure = material.atoms.copy()
        # Give every atom the atomic number 1
        uniform_numbers = [1 for _ in structure]
        structure.set_atomic_numbers(uniform_numbers)
        coefficients = self.creator.create(structure)
        # Store the data as a plain list so it can be retrieved later
        self.set_data("coef", coefficients.tolist())
        return self

In [None]:
# We need the class to create the descriptor data
from dscribe.descriptors.soap import SOAP

In [None]:
soap = SOAP(
    # cutoff radius: this property defines the length of interactions between ions
    r_cut=15,
    # number of basis functions
    n_max=5,
    # highest order of spherical harmonics
    l_max=5,
    # (complete) set of species; only "H" because SOAPFingerprint.from_material
    # sets every atomic number to 1
    species=["H"],
    # periodicity of the input structures
    periodic=True,
    # average local environments to obtain a global descriptor
    average="outer",
)

In [None]:
from madas.fingerprints import DOSFingerprint

In [None]:
# Create the grid for the DOS fingerprint from the default grid object;
# its id is passed to the "DOS" fingerprint when the fingerprints are added.
grid = DOSFingerprint.get_default_grid().create(
    e_ref=-2,
    delta_e_max=1.5,
    cutoff=[-8, 12],
    n_pix=1024,
)

In [None]:
# Display what `get_metadata()` returns for the database (shown as the cell output)
db.get_metadata()

In [None]:
# Calculate fingerprints and store them in the database.
# The three lists are parallel (presumably matched by position): fingerprint
# type, the name it is stored under, and its keyword arguments.
# NOTE: Ignoring entries where Xe is not recognized properly
# (see https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-FAIR/-/issues/1850)
db.add_fingerprints(
    ["PTE", "DOS", SOAPFingerprint],
    names=["PTE", "DOS", "SOAP"],
    fingerprint_kwargs_list=[
        {},
        {"grid_id": grid.get_grid_id()},
        {"creator": soap},
    ],
)

In [None]:
# Generate similarity matrices for the fingerprint types referred to by string,
# using the names "PTE" and "DOS" given when the fingerprints were added
pte_simat = db.get_similarity_matrix("PTE", name="PTE")
dos_simat = db.get_similarity_matrix("DOS", name="DOS")

In [None]:
# For the SOAP fingerprint, we pass the class directly
# (instead of a type string as for "PTE" and "DOS")
soap_simat = db.get_similarity_matrix(SOAPFingerprint, name="SOAP")

In [None]:
# Align the three matrices to account for missing entries in some of them
# NOTE(review): the return value is discarded, so this presumably updates the
# matrices in place — confirm
pte_simat.align([soap_simat, dos_simat])

### Find clusters in the similarity matrices

In [None]:
from threshold_clusterer import ThresholdClusterer

from madas.clustering import SimilarityMatrixClusterer

def _cluster_with_threshold(simat, threshold):
    """Run a ThresholdClusterer with `threshold` on `simat` and return the result of `.cluster()`."""
    return SimilarityMatrixClusterer(
        simat,
        clusterer=ThresholdClusterer,
        clusterer_kwargs={"threshold": threshold},
        use_complement=False,
    ).cluster()


# PTE is clustered with threshold 1, DOS and SOAP with threshold 0.75
clus_pte = _cluster_with_threshold(pte_simat, 1)
clus_dos = _cluster_with_threshold(dos_simat, 0.75)
clus_soap = _cluster_with_threshold(soap_simat, 0.75)

In [None]:
import matplotlib.pyplot as plt
# Apply the plot settings from the style sheet next to this notebook
plt.style.use("./settings.mplstyle")

from plotting_functions import plot_clustered_similarity_matrices_comparison

In [None]:
# Compare the clustered PTE, SOAP and DOS similarity matrices
# NOTE(review): filename=None presumably means the figure is not written to disk — confirm
plot_clustered_similarity_matrices_comparison(clus_pte, clus_soap, clus_dos, filename=None)

### Appendix: Investigate duplicate entries

In [None]:
# Get mids of the materials of the very compact DOS cluster
# NOTE(review): assumes that cluster carries label 0 — confirm against the plot
mids_identical_cluster = clus_dos.get_mids_by_cluster_label(0)

In [None]:
# Plot the sub-matrices restricted to the materials of this cluster, one figure
# per descriptor, to verify that they are highly similar.
# Each entry: (similarity matrix, figure title, extra keyword arguments for the title)
for simat, title, title_kwargs in (
    (dos_simat, "DOS similarity", {}),
    (soap_simat, "SOAP similarity", {"pad": 30}),
    (pte_simat, "PTE similarity", {}),
):
    plt.figure()
    plt.imshow(simat.get_sub_matrix(mids_identical_cluster))
    plt.title(title, **title_kwargs)
    plt.colorbar()
plt.show()

In [None]:
# Show the set of distinct chemical formulas found in the cluster
{db[mid].atoms.get_chemical_formula() for mid in mids_identical_cluster}

In [None]:
# Plot distribution of volumes
plt.figure(figsize=(15, 5))
volumes = [db[mid].atoms.get_volume() for mid in mids_identical_cluster]
plt.hist(volumes, bins=30)
plt.show()

In [None]:
from processing_functions import get_total_energy_eV

In [None]:
from madas.utils import tqdm

In [None]:
# Plot distribution of energies
plt.figure(figsize=(15, 5))
# One property lookup per material through the database's API; tqdm wraps the loop
total_energies = [
    db.api.get_property(get_total_energy_eV, mid)
    for mid in tqdm(mids_identical_cluster)
]
plt.hist(total_energies, bins=30)
plt.show()