In [1]:
from architector import build_complex, io_core
import json 
from itertools import combinations_with_replacement
from math import comb

# Per-metal integer lists (presumably the oxidation states to enumerate -- confirm with the
# dataset design). Only the number of entries per metal is used by the counting cells below.
# A narrower selection that was previously kept here for reference:
#   Fe [2, 3, 4], Pd [0, 1, 2, 3], Zn [2], Cu [1, 2, 3], Li [1], Mg [2]
metal_ox = {
    "Fe": list(range(0, 8)),
    "Pd": list(range(1, 6)),
    "Zn": list(range(0, 3)),
    "Cu": list(range(0, 5)),
    "Li": [1],
    "Mg": list(range(0, 3)),
}

# Coordination numbers 2 through 12 inclusive.
coordination_numbers = [*range(2, 13)]

# Geometry lookup taken from architector; later cells index it by coordination number.
core_geometries = io_core.Geometries().cn_geo_dict

# Load the ligand definitions. dict(...) copies the parsed JSON into a fresh dictionary.
with open("../ligand_dictionaries/ligands.json", "r") as ligand_file:
    ligands = dict(json.load(ligand_file))

# Echo each ligand name next to its SMILES string as a quick sanity check of the file.
for ligand_name, ligand_props in ligands.items():
    print(ligand_name, ligand_props["smiles"])

# (name, properties) pairs ordered by ligand name, so downstream ordering is deterministic.
ligands_list = list(ligands.items())
ligands_list.sort()

# Subtype labels; every ligand is multiplied by this many variants when counting.
ligand_subtypes = ['W', 'N', 'D']

choride Cl
methyl [C-1]([H])([H])[H]
methanediide [C-2]([H])[H]
ammonia [N][H][H][H]
amine [N-1]([H])[H]
imido [N-2][H]
water [O][H][H]
hydroxyl [O-1][H]
oxo [O-2]
phosphine P([H])([H])[H]
phosphido [P-1]([H])[H]
hydrogen sulfide [S]([H])[H]
thiol [S-1][H]
sulfido [S-2]


In [2]:
# Number of distinguishable ligand choices: every ligand in every subtype.
ligand_pool_size = len(ligand_subtypes) * len(ligands_list)

print(coordination_numbers)

# For each coordination number cn, count the multisets of size cn drawn from the pool
# (combinations with replacement): C(pool + cn - 1, cn).
for cn in coordination_numbers:
    n_coord_geo = comb(cn + ligand_pool_size - 1, cn)
    print(cn, n_coord_geo)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2 903
3 13244
4 148995
5 1370754
6 10737573
7 73629072
8 450978066
9 2505433700
10 12777711870
11 60403728840
12 266783135710


In [3]:
# Estimate how many structures would be analyzed when coordination numbers from the smallest
# configured value up to cn_max are included, for every cn_max in coordination_numbers.
#
# The notebook-level coordination_numbers list is only read here. Reassigning it inside the
# loop would silently change what any cell executed afterwards sees.

# Loop-invariant factors: total (metal, state) pairs and the size of the ligand choice pool.
n_metal_ox = sum(len(states) for states in metal_ox.values())
n_ligand_choices = len(ligands_list) * len(ligand_subtypes)

cn_min = coordination_numbers[0]
n_coord_geo = 0  # running total over all coordination numbers seen so far
for cn_max in coordination_numbers:
    # Multisets of cn_max ligands from the pool, times the geometries stored for that
    # coordination number.
    n_coord_geo += comb(n_ligand_choices + cn_max - 1, cn_max) * len(core_geometries[cn_max])
    print(f"For {cn_min} to {cn_max} ligands, {n_metal_ox * n_coord_geo:.2e} structures would be analyzed.")

For 2 to 2 ligands, 6.77e+04 structures would be analyzed.
For 2 to 3 ligands, 1.06e+06 structures would be analyzed.
For 2 to 4 ligands, 1.22e+07 structures would be analyzed.
For 2 to 5 ligands, 1.15e+08 structures would be analyzed.
For 2 to 6 ligands, 1.19e+09 structures would be analyzed.
For 2 to 7 ligands, 1.04e+10 structures would be analyzed.
For 2 to 8 ligands, 1.01e+11 structures would be analyzed.
For 2 to 9 ligands, 4.14e+11 structures would be analyzed.
For 2 to 10 ligands, 1.37e+12 structures would be analyzed.
For 2 to 11 ligands, 5.90e+12 structures would be analyzed.
For 2 to 12 ligands, 1.92e+13 structures would be analyzed.


In [None]:
import categorical_dataset_design.architector_dataset_kmeans as doe

# Split the name-sorted (name, properties) pairs into two parallel lists: the ligand names
# and the "charge" entry of each ligand's properties.
ligand_types = [name for name, _ in ligands_list]
ligand_charges = [props["charge"] for _, props in ligands_list]

# Count the candidates without materialising them. No forbid function is supplied and no
# charge arguments are passed in this call.
# ligand_subtypes is passed by name rather than re-typed as a literal so that this count
# cannot drift from the binomial estimates above, which use the same variable.
n_points = doe.calculate_candidate_count(
    metal_ox,
    core_geometries,
    ligand_types,
    ligand_subtypes=ligand_subtypes,
    include_variant_counts=False,
    forbid_fn=None,
)
print("Total feasible candidates:", n_points)

Total feasible candidates: 19241532099000


In [None]:
# Repeat the candidate count, this time also passing max_abs_charge=1 and the per-ligand
# charges (presumably this keeps only complexes whose net charge magnitude is at most 1 --
# confirm against the doe module). Note that this overwrites the earlier n_points value.
# ligand_subtypes is passed by name so the count always follows the notebook configuration
# instead of a separately maintained literal.
n_points = doe.calculate_candidate_count(
    metal_ox,
    core_geometries,
    ligand_types,
    ligand_subtypes=ligand_subtypes,
    include_variant_counts=False,
    forbid_fn=None,
    max_abs_charge=1,
    ligand_charges=ligand_charges,
)

print("Total feasible candidates:", n_points)

In [None]:
# Deliberate stop: presumably a guard so that "Run All" does not continue into the cells
# below. Raising SystemExit directly is what sys.exit() does, and it avoids the NameError
# caused by calling sys.exit() when sys has not been imported.
raise SystemExit

In [None]:
# Materialise the charge-limited candidate set and keep it as df: the selection and export
# cells below read df, and nothing else in the notebook defines it.
# NOTE(review): assumes expand_candidates_with_variants returns a single pandas DataFrame
# (the later cells call df.loc[...] on it) -- confirm against the doe module.
df = doe.expand_candidates_with_variants(
    metal_ox,
    core_geometries,
    ligand_types,
    ligand_subtypes=ligand_subtypes,
    include_variant_counts=False,
    max_abs_charge=1,
    ligand_charges=ligand_charges,
)


In [None]:
# Number of design points requested from the selection step.
desired_n_points = 10 * 50000

# Run the k-means based selection on the candidate table. The call returns three values;
# the first (sel_sf) is what the export cell uses to look up rows of df.
kmeans_outputs = doe.space_filling_kmeans(
    df,
    desired_n_points,
    ligand_types=ligand_types,
    ligand_subtypes=ligand_subtypes,
)
sel_sf, X_sf, design_sf = kmeans_outputs
print("Selected df indices:", sel_sf)

In [None]:
# Optional export of the selected runs.
# Look up the chosen rows by label, renumber them from zero, and write them to a CSV file
# without the index column.
chosen_rows = df.loc[sel_sf]
selected_runs = chosen_rows.reset_index(drop=True)
selected_runs.to_csv('selected_design.csv', index=False)
print("\nSelected runs saved to selected_design.csv")