In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ElMD import ElMD, elmd
from multiprocessing import Pool
from tqdm import tqdm
import gzip
import json
from pathlib import Path
from pymatgen.core import Composition
from pymatgen.core.structure import Structure, Lattice
import torch
from monty.json import MontyDecoder
from ast import literal_eval

# Load dataset 

In [2]:
# mp20 train+val
df_train = pd.read_csv('./mp_20/train.csv')[['pretty_formula']]
df_val = pd.read_csv('./mp_20/val.csv')[['pretty_formula']]
df_combined = pd.concat([df_train, df_val], axis=0, ignore_index=True)
df_combined = df_combined.rename(columns={'pretty_formula': 'formula'})
df_combined

Unnamed: 0,formula
0,Na3MnCoNiO6
1,Nd(Al2Cu)4
2,LiMnIr2
3,LiCSN
4,La2EuS4
...,...
36178,WS2
36179,Y2ZnPt
36180,RbMgCoF6
36181,BPO4


In [3]:
# mp20 test
df_test = pd.read_csv('./mp_20/test.csv')[['pretty_formula']]
df_test = df_test.rename(columns={'pretty_formula': 'formula'})
df_test

Unnamed: 0,formula
0,GaTe
1,SmThCN
2,CuNi
3,NaTiVS4
4,Ho3TmMn8
...,...
9041,In6Ga2PtO8
9042,CuSe
9043,Tl6TeO12
9044,Sr2GdRuO6


In [4]:
# wyckoff transformer 
with gzip.open(Path("./WyckoffTransformer_mp_20.json.gz"), "rb") as f:
    datasets_json = json.load(f)

formulas = []
for entry in datasets_json:
   
    composition_str = "".join([f"{elem}{num}" for elem, num in zip(entry['species'], entry['numIons'])])

    composition = Composition(composition_str)
    reduced_formula = composition.reduced_formula
    
    formulas.append(reduced_formula)

df_wt = pd.DataFrame({'formula': formulas})
df_wt



Unnamed: 0,formula
0,Dy(InCu)6
1,K2NaCeCl6
2,K2AsAuS3
3,GdDy3(GaO4)3
4,Te7Mo4Se
...,...
9995,Cs2NdF6
9996,DyGaSi3
9997,LiCuO2
9998,Rb2NaLaF6


In [5]:
# diffcsp
def get_crystals_list(
        frac_coords, atom_types, lengths, angles, num_atoms):
    """
    args:
        frac_coords: (num_atoms, 3)
        atom_types: (num_atoms)
        lengths: (num_crystals)
        angles: (num_crystals)
        num_atoms: (num_crystals)
    """
    assert frac_coords.size(0) == atom_types.size(0) == num_atoms.sum()
    assert lengths.size(0) == angles.size(0) == num_atoms.size(0)

    start_idx = 0
    crystal_array_list = []
    for batch_idx, num_atom in enumerate(num_atoms.tolist()):
        cur_frac_coords = frac_coords.narrow(0, start_idx, num_atom)
        cur_atom_types = atom_types.narrow(0, start_idx, num_atom)
        cur_lengths = lengths[batch_idx]
        cur_angles = angles[batch_idx]

        crystal_array_list.append({
            'frac_coords': cur_frac_coords.detach().cpu().numpy(),
            'atom_types': cur_atom_types.detach().cpu().numpy(),
            'lengths': cur_lengths.detach().cpu().numpy(),
            'angles': cur_angles.detach().cpu().numpy(),
        })
        start_idx = start_idx + num_atom
    return crystal_array_list
    

data = torch.load("./DiffCSP_mp_20.pt", map_location='cpu')
crystals_list = get_crystals_list(
    data['frac_coords'], data['atom_types'], data['lengths'], data['angles'], data['num_atoms'])

def get_structure(record):
    return Structure(
                lattice=Lattice.from_parameters(
                    *(record['lengths'].tolist() + record['angles'].tolist())),
                species=record['atom_types'], coords=record['frac_coords'], coords_are_cartesian=False)

df_csp = pd.DataFrame(crystals_list)
df_csp['structure'] = df_csp.apply(lambda row: get_structure(row), axis=1)
df_csp['formula'] = df_csp['structure'].apply(lambda struct: struct.composition.reduced_formula)
df_csp



Unnamed: 0,frac_coords,atom_types,lengths,angles,structure,formula
0,"[[0.8393228, 0.053963438, 0.013227344], [0.135...","[13, 8, 68, 8, 8, 8, 68, 13, 8, 68, 8, 8, 8, 8...","[5.2292614, 5.3434114, 7.463288]","[89.88423, 90.114876, 89.91222]","[[4.38947248 0.28834792 0.09050225] Al, [0.712...",TbEr3Al4O12
1,"[[0.20049211, 0.44073865, 0.5459221], [0.91459...","[22, 13, 13, 13, 13, 13, 13, 13, 13, 13, 74, 1...","[4.0407653, 4.9291196, 13.799562]","[90.206184, 89.917694, 114.078094]","[[-0.07616964 1.98341802 7.52683213] Ti, [ 1...",Ti2Al13W
2,"[[0.9976136, 0.42925078, 0.97523534], [0.49689...","[13, 13, 13, 74, 13, 13, 46, 46, 13, 13, 46, 1...","[6.7801504, 6.7895274, 6.829882]","[109.078476, 109.146194, 109.68871]","[[5.0197007 2.38936983 3.48968536] Al, [2.633...",Al11Pd3W
3,"[[0.08982345, 0.3458505, 0.8910983], [0.369892...","[3, 8, 8, 22, 8, 22, 8, 22, 8, 8, 8, 8]","[5.0952597, 5.104054, 5.806402]","[89.183624, 88.99343, 67.35607]","[[1.13688876 1.62911306 5.20726629] Li, [3.084...",LiTi3O8
4,"[[0.061776783, 0.64998174, 0.08496202], [0.606...","[47, 60, 50, 47, 60, 50]","[4.47149, 4.48399, 8.517711]","[105.0688, 105.26342, 90.03544]","[[ 0.05785525 2.80655018 -0.10674992] Ag, [2....",NdAgSn
...,...,...,...,...,...,...
9995,"[[0.32468364, 0.71995884, 0.5866922], [0.14207...","[8, 23, 8, 8, 8, 8, 8, 8, 23, 8, 23, 3, 3, 13,...","[5.1832223, 5.5291653, 6.7675705]","[89.6387, 90.452675, 92.28812]","[[1.52411779 3.97752606 3.98228688] O, [0.7212...",Li2AlV3O10
9996,"[[0.475546, 0.4323506, 0.09958547], [0.1430774...","[50, 70, 50, 50, 50, 50, 50, 50, 50]","[4.7478247, 7.227249, 7.571129]","[109.0774, 89.98152, 109.1082]","[[ 1.23525609 2.77040201 -0.26659223] Sn, [-1...",YbSn8
9997,"[[0.3793994, 0.24937421, 0.9897047], [0.871585...","[34, 16, 13, 16, 34, 34, 13, 34, 34, 13, 13, 3...","[7.3793917, 7.38119, 7.3917656]","[60.0584, 59.89376, 59.97979]","[[2.95385654 1.50374075 9.63874107] Se, [7.128...",Al3Se3S
9998,"[[0.6476849, 0.12345938, 0.97694474], [0.72115...","[13, 29, 16, 34, 34, 50, 16, 16, 34, 34, 16, 3...","[3.961785, 10.454192, 11.88459]","[79.7657, 79.99119, 78.832405]","[[ 2.74030439 1.25208284 12.28587371] Al, [3....",Al3VCuSn3(SeS)6


In [6]:
# diffcsp++
data = torch.load("./DiffCSP++_mp_20.pt", map_location='cpu')
crystals_list = get_crystals_list(
    data['frac_coords'], data['atom_types'], data['lengths'], data['angles'], data['num_atoms'])

def get_structure(record):
    return Structure(
                lattice=Lattice.from_parameters(
                    *(record['lengths'].tolist() + record['angles'].tolist())),
                species=record['atom_types'], coords=record['frac_coords'], coords_are_cartesian=False)

df_csp_pp = pd.DataFrame(crystals_list)
df_csp_pp['structure'] = df_csp_pp.apply(lambda row: get_structure(row), axis=1)
df_csp_pp['formula'] = df_csp_pp['structure'].apply(lambda struct: struct.composition.reduced_formula)
df_csp_pp



Unnamed: 0,frac_coords,atom_types,lengths,angles,structure,formula
0,"[[0.0, 0.0, 0.0], [0.6666667, 0.33333334, 0.33...","[7, 7, 7, 7, 7, 7, 27, 27, 27, 27, 27, 27]","[3.0407808, 3.0407808, 16.772078]","[90.0, 90.0, 119.997765]","[[0. 0. 0.] N, [1.52042468 0.8778176 5.590692...",CoN
1,"[[0.0, 0.0, 0.5], [0.5, 0.5, 0.5], [0.0, 0.0, ...","[8, 8, 26, 26, 3, 3, 3, 3, 8, 8, 8, 8, 26, 26,...","[11.490821, 3.7296257, 9.647919]","[90.0, 108.970924, 90.0]","[[0. 0. 4.82395935] O, [5.4333...",Li4Mn2Fe3O11
2,"[[0.356337, 0.56891173, 0.6107638], [0.356337,...","[8, 8, 8, 8, 8, 8, 83, 83, 8, 8, 8, 8, 39, 39]","[5.043304, 5.909466, 6.117374]","[90.0, 105.79059, 90.0]","[[1.72929757 3.36196441 3.24723539] O, [1.7292...",YBiO5
3,"[[0.0, 0.0, 0.0], [0.5, 0.5, 0.5], [0.25, 0.25...","[65, 65, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[8.8827915, 8.8827915, 5.231132]","[90.0, 90.0, 90.0]","[[0. 0. 0.] Tb, [4.44139576 4.44139576 2.61556...",TbZn12
4,"[[0.25, 0.25, 0.25], [0.25, 0.25, 0.75], [0.25...","[56, 56, 56, 56, 56, 56, 56, 56, 62, 62, 62, 6...","[8.56255, 8.56255, 8.56255]","[90.0, 90.0, 90.0]","[[2.1406374 2.1406374 2.1406374] Ba, [2.140637...",Ba2SmWO6
...,...,...,...,...,...,...
9995,"[[0.0, 0.0, 0.0], [0.0, 0.0, 0.5], [0.33333334...","[65, 20, 5, 5, 27, 27, 27, 27, 27, 27, 5, 5]","[4.966578, 4.966578, 6.0165815]","[90.0, 90.0, 119.997765]","[[0. 0. 0.] Tb, [0. 0. 3.00829...",CaTb(Co3B2)2
9996,"[[0.0, 0.0, 0.2065411], [0.5, 0.0, 0.2065411],...","[57, 57, 57, 57, 47, 47, 47, 47, 8, 8, 8, 8, 8...","[9.000684, 5.8581533, 5.972623]","[90.0, 90.0, 90.0]","[[0. 0. 1.23359213] La, [4.500...",LaAgO3
9997,"[[0.37543264, 0.0, 0.25], [0.12456736, 0.0, 0....","[59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 5...","[8.572207, 8.572207, 8.572207]","[90.0, 90.0, 90.0]","[[3.21828648 0. 2.14305186] Pr, [1.067...",Pr3S4
9998,"[[0.33333334, 0.6666667, 0.5344624], [0.666666...","[49, 49, 56, 52, 52, 52, 52, 52, 52]","[6.7358375, 6.7358375, 8.068913]","[90.0, 90.0, 119.997765]",[[1.51729818e-04 3.88902528e+00 4.31253079e+00...,Ba(InTe3)2


In [7]:
# crystalformer 
with gzip.open(Path("./CrystalFormer_mp_20.csv.gz"), "rb") as f:
    df_cf = pd.read_csv(f)
    
decoder = MontyDecoder()
structures = df_cf['cif'].apply(lambda s: decoder.process_decoded(literal_eval(s)))
df_cf['structure'] = structures
df_cf['formula'] = df_cf['structure'].apply(lambda x: x.composition.reduced_formula)
df_cf

Unnamed: 0,cif,structure,formula
0,"{'@module': 'pymatgen.core.structure', '@class...","[[5.03535423 5.03535423 5.03535423] Cu, [5.035...",Cr2Cu(SeS)2
1,"{'@module': 'pymatgen.core.structure', '@class...","[[0. 0. 0.] Cu, [-3.14774869e-16 5.14066373e+...",GaCu5Se8
2,"{'@module': 'pymatgen.core.structure', '@class...","[[0. 0. 0.] Ce, [-2.45370641e-16 4.00720667e+...",CeAl2
3,"{'@module': 'pymatgen.core.structure', '@class...","[[0. 0. 0.] Ag, [-2.20974564e-16 3.60878849e+...",HoCu4Ag
4,"{'@module': 'pymatgen.core.structure', '@class...","[[0. 0. 0.] Lu, [-2.16015964e-16 3.52780841e+...",LuCu4Au
...,...,...,...
994,"{'@module': 'pymatgen.core.structure', '@class...","[[0.02640178 4.39544873 9.87101906] Sr, [0.026...",Sr2PrRuO6
995,"{'@module': 'pymatgen.core.structure', '@class...","[[0.08924678 1.83788644 0.03988495] Li, [0.089...",Li3SiO4
996,"{'@module': 'pymatgen.core.structure', '@class...","[[0.03401209 3.357707 8.55129861] V, [0.0340...",LiMnV2O5
997,"{'@module': 'pymatgen.core.structure', '@class...","[[0.0474941 1.43984048 0.02172881] P, [0.0474...",Ag2PHO3


# Compute ElMD

In [8]:
from ElMD import elmd
from joblib import Parallel, delayed

def compute_elmd(ref_formula, gen_formula):
    return elmd(ref_formula, gen_formula, metric="fast")

def compute_avg_elmd(df_ref, df_gen, n_jobs=-1):
    avg_distances = []

    def compute_avg_for_gen_formula(gen_formula):
        distances = df_ref['formula'].apply(lambda ref_formula: compute_elmd(ref_formula, gen_formula))
        return distances.mean()

    avg_distances = Parallel(n_jobs=n_jobs)(delayed(compute_avg_for_gen_formula)(gen_formula) for gen_formula in tqdm(df_gen['formula'], desc="Computing ElMD"))

    overall_average = sum(avg_distances) / len(avg_distances)
    
    return overall_average

In [9]:
def compute_nnd_elmd(df_ref, df_gen, k=5, n_jobs=-1):
    elmd_nnd = []

    def compute_nnd_for_gen_formula(gen_formula, idx):
        distances = df_ref['formula'].apply(lambda ref_formula: compute_elmd(ref_formula, gen_formula))

        mask = df_ref['formula'] == gen_formula
        distances[mask] = np.inf  

        k_nearest_distances = distances.nsmallest(k).values
        k_average_distance = k_nearest_distances.mean()  # Take the average of the k nearest distances

        return k_average_distance

    elmd_nnd = Parallel(n_jobs=n_jobs)(
        delayed(compute_nnd_for_gen_formula)(gen_formula, idx) 
        for idx, gen_formula in enumerate(tqdm(df_gen['formula'], desc="Computing ElMD k-NND"))
    )

    elmd_nnd_avg = sum(elmd_nnd) / len(elmd_nnd)

    return elmd_nnd_avg

In [10]:
import os
num_cores = os.cpu_count()
print(f"Number of available CPU cores: {num_cores}")

Number of available CPU cores: 96


## WyckoffTransformer

In [None]:
# ElMD Train+Val vs Gen
# Avg
elmd_wt_combined = compute_avg_elmd(df_combined, df_wt[:1000], n_jobs=-1)
elmd_wt_combined

In [32]:
# ElMD Test vs Gen
# Avg
elmd_wt_test = compute_avg_elmd(df_test, df_wt[:1000], n_jobs=48)
elmd_wt_test





Computing ElMD:   0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A



Computing ElMD:   5%|▍         | 48/1000 [00:01<00:32, 29.63it/s][A[A[A[A



Computing ElMD:   5%|▍         | 48/1000 [00:19<00:32, 29.63it/s][A[A[A[A



Computing ElMD:  10%|▉         | 96/1000 [04:15<47:10,  3.13s/it][A[A[A[A



Computing ElMD:  14%|█▍        | 144/1000 [09:08<1:03:59,  4.48s/it][A[A[A[A



Computing ElMD:  19%|█▉        | 192/1000 [13:49<1:07:37,  5.02s/it][A[A[A[A



Computing ElMD:  24%|██▍       | 240/1000 [19:37<1:13:46,  5.82s/it][A[A[A[A



Computing ElMD:  29%|██▉       | 288/1000 [24:56<1:12:28,  6.11s/it][A[A[A[A



Computing ElMD:  34%|███▎      | 336/1000 [30:37<1:11:09,  6.43s/it][A[A[A[A



Computing ElMD:  38%|███▊      | 384/1000 [35:40<1:05:41,  6.40s/it][A[A[A[A



Computing ElMD:  43%|████▎     | 432/1000 [40:53<1:00:55,  6.44s/it][A[A[A[A



Computing ElMD:  48%|████▊     | 480/1000 [46:18<56:40,  6.54s/it]  [A[A[A[A



Computing

22.25208140454697

In [33]:
# ElMD within Gen  
# Avg
elmd_wt_gen = compute_avg_elmd(df_wt[:1000], df_wt[:1000], n_jobs=48)
elmd_wt_gen





Computing ElMD:   0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A



Computing ElMD:  10%|▉         | 96/1000 [00:28<04:25,  3.41it/s][A[A[A[A



Computing ElMD:  14%|█▍        | 144/1000 [01:01<06:33,  2.18it/s][A[A[A[A



Computing ElMD:  19%|█▉        | 192/1000 [01:31<07:01,  1.92it/s][A[A[A[A



Computing ElMD:  24%|██▍       | 240/1000 [02:13<08:09,  1.55it/s][A[A[A[A



Computing ElMD:  29%|██▉       | 288/1000 [02:46<07:50,  1.51it/s][A[A[A[A



Computing ElMD:  34%|███▎      | 336/1000 [03:22<07:38,  1.45it/s][A[A[A[A



Computing ElMD:  38%|███▊      | 384/1000 [03:57<07:10,  1.43it/s][A[A[A[A



Computing ElMD:  43%|████▎     | 432/1000 [04:30<06:37,  1.43it/s][A[A[A[A



Computing ElMD:  48%|████▊     | 480/1000 [05:07<06:14,  1.39it/s][A[A[A[A



Computing ElMD:  53%|█████▎    | 528/1000 [05:48<05:59,  1.31it/s][A[A[A[A



Computing ElMD:  58%|█████▊    | 576/1000 [06:25<05:23,  1.31it/s][A[A[A[A



Computing ElMD:  62%|██

22.418923142971977

In [11]:
# ElMD Test vs Gen
# NND
elmd_wt_test = compute_nnd_elmd(df_test, df_wt[:1000], k=5, n_jobs=48)
elmd_wt_test

Computing ElMD k-NND: 100%|██████████| 1000/1000 [1:40:58<00:00,  6.06s/it]


1.7635092091999984

In [12]:
# ElMD within Gen  
# NND
elmd_wt_gen = compute_nnd_elmd(df_wt[:1000], df_wt[:1000], k=5, n_jobs=48)
elmd_wt_gen

Computing ElMD k-NND: 100%|██████████| 1000/1000 [11:17<00:00,  1.48it/s]


3.288956908799998

## DiffCSP

In [None]:
# ElMD Train+Val vs Gen
# Avg
elmd_csp_combined = compute_avg_elmd(df_combined, df_csp[:1000], n_jobs=-1)
elmd_csp_combined

In [34]:
# ElMD Test vs Gen
# Avg
elmd_csp_test = compute_avg_elmd(df_test, df_csp[:1000], n_jobs=48)
elmd_csp_test





Computing ElMD:   0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A



Computing ElMD:  10%|▉         | 96/1000 [03:09<29:41,  1.97s/it][A[A[A[A



Computing ElMD:  14%|█▍        | 144/1000 [08:50<58:36,  4.11s/it][A[A[A[A



Computing ElMD:  19%|█▉        | 192/1000 [14:29<1:10:10,  5.21s/it][A[A[A[A



Computing ElMD:  24%|██▍       | 240/1000 [20:10<1:14:23,  5.87s/it][A[A[A[A



Computing ElMD:  29%|██▉       | 288/1000 [25:21<1:12:04,  6.07s/it][A[A[A[A



Computing ElMD:  34%|███▎      | 336/1000 [30:10<1:07:03,  6.06s/it][A[A[A[A



Computing ElMD:  38%|███▊      | 384/1000 [36:20<1:07:28,  6.57s/it][A[A[A[A



Computing ElMD:  43%|████▎     | 432/1000 [41:50<1:03:07,  6.67s/it][A[A[A[A



Computing ElMD:  48%|████▊     | 480/1000 [46:57<57:05,  6.59s/it]  [A[A[A[A



Computing ElMD:  53%|█████▎    | 528/1000 [52:51<53:41,  6.82s/it][A[A[A[A



Computing ElMD:  58%|█████▊    | 576/1000 [58:05<47:38,  6.74s/it][A[A[A[A



Computing

22.364690712505887

In [35]:
# ElMD within Gen  
# Avg
elmd_csp_gen = compute_avg_elmd(df_csp[:1000], df_csp[:1000], n_jobs=48)
elmd_csp_gen





Computing ElMD:   0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A







Computing ElMD:  14%|█▍        | 144/1000 [03:57<26:41,  1.87s/it][A[A[A[A



Computing ElMD:  19%|█▉        | 192/1000 [03:58<15:54,  1.18s/it][A[A[A[A



Computing ElMD:  19%|█▉        | 192/1000 [04:12<15:54,  1.18s/it][A[A[A[A



Computing ElMD:  24%|██▍       | 240/1000 [04:32<12:51,  1.01s/it][A[A[A[A



Computing ElMD:  29%|██▉       | 288/1000 [05:28<12:42,  1.07s/it][A[A[A[A



Computing ElMD:  34%|███▎      | 336/1000 [06:11<11:12,  1.01s/it][A[A[A[A



Computing ElMD:  38%|███▊      | 384/1000 [07:01<10:27,  1.02s/it][A[A[A[A



Computing ElMD:  43%|████▎     | 432/1000 [07:46<09:24,  1.01it/s][A[A[A[A



Computing ElMD:  48%|████▊     | 480/1000 [08:28<08:19,  1.04it/s][A[A[A[A



Computing ElMD:  53%|█████▎    | 528/1000 [09:17<07:41,  1.02it/s][A[A[A[A



Computing ElMD:  58%|█████▊    | 576/1000 [10:02<06:48,  1.04it/s][A[A[A[A



Computing ElMD:  6

22.328423339893984

In [13]:
# ElMD Test vs Gen
# NND
elmd_csp_test = compute_nnd_elmd(df_test, df_csp[:1000], k=5, n_jobs=48)
elmd_csp_test

Computing ElMD k-NND: 100%|██████████| 1000/1000 [1:42:17<00:00,  6.14s/it]


1.7303922057999994

In [14]:
# ElMD within Gen  
# NND
elmd_csp_gen = compute_nnd_elmd(df_csp[:1000], df_csp[:1000], k=5, n_jobs=48)
elmd_csp_gen

Computing ElMD k-NND: 100%|██████████| 1000/1000 [16:09<00:00,  1.03it/s]


3.0691298354000027

## DiffCSP++

In [None]:
# ElMD Train+Val vs Gen
# Avg
elmd_csp_pp_combined = compute_avg_elmd(df_combined, df_csp_pp[:1000], n_jobs=-1)
elmd_csp_pp_combined

In [36]:
# ElMD Test vs Gen
# Avg
elmd_csp_pp_test = compute_avg_elmd(df_test, df_csp_pp[:1000], n_jobs=48)
elmd_csp_pp_test





Computing ElMD:   0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A



Computing ElMD:  10%|▉         | 96/1000 [03:09<29:42,  1.97s/it][A[A[A[A



Computing ElMD:  14%|█▍        | 144/1000 [08:41<57:31,  4.03s/it][A[A[A[A



Computing ElMD:  19%|█▉        | 192/1000 [14:56<1:13:21,  5.45s/it][A[A[A[A



Computing ElMD:  24%|██▍       | 240/1000 [20:19<1:14:35,  5.89s/it][A[A[A[A



Computing ElMD:  29%|██▉       | 288/1000 [26:12<1:15:41,  6.38s/it][A[A[A[A



Computing ElMD:  34%|███▎      | 336/1000 [31:59<1:13:33,  6.65s/it][A[A[A[A



Computing ElMD:  38%|███▊      | 384/1000 [38:14<1:12:02,  7.02s/it][A[A[A[A



Computing ElMD:  43%|████▎     | 432/1000 [43:29<1:05:04,  6.87s/it][A[A[A[A



Computing ElMD:  48%|████▊     | 480/1000 [49:17<1:00:34,  6.99s/it][A[A[A[A



Computing ElMD:  53%|█████▎    | 528/1000 [55:21<56:26,  7.17s/it]  [A[A[A[A



Computing ElMD:  58%|█████▊    | 576/1000 [1:01:16<51:09,  7.24s/it][A[A[A[A



Compu

21.741725516982296

In [37]:
# ElMD within Gen 
# Avg
elmd_csp_pp_gen = compute_avg_elmd(df_csp_pp[:1000], df_csp_pp[:1000], n_jobs=48)
elmd_csp_pp_gen





Computing ElMD:   0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A



Computing ElMD:  10%|▉         | 96/1000 [00:12<02:01,  7.46it/s][A[A[A[A



Computing ElMD:  14%|█▍        | 144/1000 [01:14<08:45,  1.63it/s][A[A[A[A



Computing ElMD:  19%|█▉        | 192/1000 [02:11<11:06,  1.21it/s][A[A[A[A



Computing ElMD:  24%|██▍       | 240/1000 [03:00<11:19,  1.12it/s][A[A[A[A



Computing ElMD:  29%|██▉       | 288/1000 [03:56<11:44,  1.01it/s][A[A[A[A



Computing ElMD:  34%|███▎      | 336/1000 [04:51<11:28,  1.04s/it][A[A[A[A



Computing ElMD:  38%|███▊      | 384/1000 [05:43<10:47,  1.05s/it][A[A[A[A



Computing ElMD:  43%|████▎     | 432/1000 [06:32<09:50,  1.04s/it][A[A[A[A



Computing ElMD:  48%|████▊     | 480/1000 [07:21<09:00,  1.04s/it][A[A[A[A



Computing ElMD:  53%|█████▎    | 528/1000 [08:19<08:32,  1.09s/it][A[A[A[A



Computing ElMD:  58%|█████▊    | 576/1000 [09:12<07:43,  1.09s/it][A[A[A[A



Computing ElMD:  62%|██

21.386666381964

In [15]:
# ElMD Test vs Gen
# NND
elmd_csp_pp_test = compute_nnd_elmd(df_test, df_csp_pp[:1000], k=5, n_jobs=48)
elmd_csp_pp_test

Computing ElMD k-NND: 100%|██████████| 1000/1000 [1:48:08<00:00,  6.49s/it]


1.8945838764000011

In [16]:
# ElMD within Gen  
# NND
elmd_csp_pp_gen = compute_nnd_elmd(df_csp_pp[:1000], df_csp_pp[:1000], k=5, n_jobs=48)
elmd_csp_pp_gen

Computing ElMD k-NND: 100%|██████████| 1000/1000 [16:12<00:00,  1.03it/s]


3.410304374800002

## CrystalFormer

In [None]:
# ElMD Train+Val vs Gen
# Avg
elmd_cf_combined = compute_avg_elmd(df_combined, df_cf[:1000], n_jobs=-1)
elmd_cf_combined

In [38]:
# ElMD Test vs Gen
# Avg
elmd_cf_test = compute_avg_elmd(df_test, df_cf[:1000], n_jobs=48)
elmd_cf_test





Computing ElMD:   0%|          | 0/999 [00:00<?, ?it/s][A[A[A[A



Computing ElMD:  10%|▉         | 96/999 [04:23<41:15,  2.74s/it][A[A[A[A



Computing ElMD:  14%|█▍        | 144/999 [09:41<1:02:13,  4.37s/it][A[A[A[A



Computing ElMD:  19%|█▉        | 192/999 [14:49<1:08:56,  5.13s/it][A[A[A[A



Computing ElMD:  24%|██▍       | 240/999 [20:29<1:13:27,  5.81s/it][A[A[A[A



Computing ElMD:  29%|██▉       | 288/999 [25:18<1:09:39,  5.88s/it][A[A[A[A



Computing ElMD:  34%|███▎      | 336/999 [31:12<1:10:15,  6.36s/it][A[A[A[A



Computing ElMD:  38%|███▊      | 384/999 [36:20<1:05:21,  6.38s/it][A[A[A[A



Computing ElMD:  43%|████▎     | 432/999 [41:42<1:01:15,  6.48s/it][A[A[A[A



Computing ElMD:  48%|████▊     | 480/999 [47:29<58:01,  6.71s/it]  [A[A[A[A



Computing ElMD:  53%|█████▎    | 528/999 [52:40<52:07,  6.64s/it][A[A[A[A



Computing ElMD:  58%|█████▊    | 576/999 [58:02<46:57,  6.66s/it][A[A[A[A



Computing ElMD:  62

21.985214077223795

In [39]:
# ElMD within Gen  
# Avg
elmd_cf_gen = compute_avg_elmd(df_cf[:1000], df_cf[:1000], n_jobs=48)
elmd_cf_gen





Computing ElMD:   0%|          | 0/999 [00:00<?, ?it/s][A[A[A[A



Computing ElMD:  10%|▉         | 96/999 [00:19<03:05,  4.87it/s][A[A[A[A



Computing ElMD:  14%|█▍        | 144/999 [01:07<07:34,  1.88it/s][A[A[A[A



Computing ElMD:  19%|█▉        | 192/999 [01:55<09:33,  1.41it/s][A[A[A[A



Computing ElMD:  24%|██▍       | 240/999 [02:48<10:41,  1.18it/s][A[A[A[A



Computing ElMD:  29%|██▉       | 288/999 [03:36<10:35,  1.12it/s][A[A[A[A



Computing ElMD:  34%|███▎      | 336/999 [04:24<10:17,  1.07it/s][A[A[A[A



Computing ElMD:  38%|███▊      | 384/999 [05:12<09:46,  1.05it/s][A[A[A[A



Computing ElMD:  43%|████▎     | 432/999 [05:58<09:01,  1.05it/s][A[A[A[A



Computing ElMD:  48%|████▊     | 480/999 [06:50<08:37,  1.00it/s][A[A[A[A



Computing ElMD:  53%|█████▎    | 528/999 [07:35<07:39,  1.03it/s][A[A[A[A



Computing ElMD:  58%|█████▊    | 576/999 [08:22<06:53,  1.02it/s][A[A[A[A



Computing ElMD:  62%|██████▏   | 62

21.873463503010523

In [17]:
# ElMD Test vs Gen
# NND
elmd_cf_test = compute_nnd_elmd(df_test, df_cf[:1000], k=5, n_jobs=48)
elmd_cf_test

Computing ElMD k-NND: 100%|██████████| 999/999 [1:42:24<00:00,  6.15s/it]


1.4900508942942934

In [18]:
# ElMD within Gen  
# NND
elmd_cf_gen = compute_nnd_elmd(df_cf[:1000], df_cf[:1000], k=5, n_jobs=48)
elmd_cf_gen

Computing ElMD k-NND: 100%|██████████| 999/999 [14:43<00:00,  1.13it/s]


3.04530350790791

# Results

In [20]:
data = {
    "Stats": ["ElMD (test vs gen)", "ElMD (within gen)"],
    "WyckoffTransfomer": [elmd_wt_test, elmd_wt_gen],
    "DiffCSP": [elmd_csp_test, elmd_csp_gen],
    "DiffCSP++": [elmd_csp_pp_test, elmd_csp_pp_gen],
    "CrytalFormer": [elmd_cf_test, elmd_cf_gen],
}

df = pd.DataFrame(data)

def format_significant_digits(value, digits=3):
    if isinstance(value, (float, int)):
        return f"{value:.{digits}g}"
    return value

numeric_cols = df.select_dtypes(include=['float64', 'float32', 'int64', 'int32']).columns
df[numeric_cols] = df[numeric_cols].applymap(format_significant_digits)
df

  df[numeric_cols] = df[numeric_cols].applymap(format_significant_digits)


Unnamed: 0,Stats,WyckoffTransfomer,DiffCSP,DiffCSP++,CrytalFormer
0,ElMD (test vs gen),1.76,1.73,1.89,1.49
1,ElMD (within gen),3.29,3.07,3.41,3.05
