## Define Chemical Space

In [1]:
import os
import json
from collections import defaultdict
import copy

import pickle
import pandas as pd
import rdkit
rdkit.__version__

from architector import build_complex
from nglview import show_rdkit

def view3D(mol):
    """Display an RDKit molecule in 3D using nglview"""
    return show_rdkit(mol)

ligands = {}
with open("../../ligand_dictionaries/ligands.json", "r") as f:
    ligands.update(json.load(f))

for lig_name, tmp in ligands.items():
    print(lig_name, tmp["smiles"])
    
LIGAND_DICT = dict(sorted(ligands.items()))

# Checkpoint file to save progress
os.makedirs("chk", exist_ok=True)
CHECKPOINT_FILE = "chk/dataset_checkpoint.pkl"
PROGRESS_LOG = "chk/dataset_progress.txt"

def save_checkpoint(data_dict, progress_state):
    """Save current dataframe and progress state"""
    with open(CHECKPOINT_FILE, 'wb') as f:
        pickle.dump({'data': data_dict, 'state': progress_state}, f)
    print(f"✓ Checkpoint saved: {len(data_dict['metals'])} complexes", flush=True)

def load_checkpoint():
    """Load previous checkpoint if it exists"""
    try:
        with open(CHECKPOINT_FILE, 'rb') as f:
            checkpoint = pickle.load(f)
            print(f"✓ Loaded checkpoint: {len(checkpoint['data']['metals'])} complexes")
            return checkpoint['data'], checkpoint['state']
    except FileNotFoundError:
        print("No checkpoint found, starting fresh")
        return None, None

def make_state_key(metal, ox_state, cn, ligand_labels, coreType):
    """Create unique key for current state"""
    return f"{metal}_{ox_state}_{cn}_{coreType}_{'_'.join(sorted(ligand_labels))}"

def ligand_dicts_from_labels(ligand_labels):
    """Process ligand labels into dicts that are usable by Architector""" 
    ligand_dicts = []
    for lig_type, lig_subtype in [x.split("|") for x in ligand_labels]:
        ligand_dicts.append(copy.deepcopy(LIGAND_DICT[lig_type]))
        if 'functional_inds' in ligand_dicts[-1]:
            if lig_subtype != "D": # if not electron donating
                if lig_type == "halide":
                    ligand_dicts[-1]['smiles'] = '[F-]' if lig_subtype == "W" else "[Cl-]"
                else:
                    ligand_dicts[-1]['smiles'].replace(
                        "[H]",
                        '[F]' if lig_subtype == "W" else '[C]([H])([H])[H]'
                    )
            del ligand_dicts[-1]['functional_inds']
    return ligand_dicts



hydride [H-1]
halide [Br-]
methyl [C-]([H])([H])[H]
methanediide [C-2]([H])[H]
ammonia [N]([H])([H])[H]
amine [N-]([H])[H]
imido [N-2][H]
water [O]([H])[H]
hydroxyl [O-][H]
oxo [O-2]
phosphine [P]([H])([H])[H]
phosphido [P-]([H])[H]
hydrogen sulfide [S]([H])[H]
thiol [S-][H]
sulfido [S-2]


In [2]:
# Initialize progress log file
if not os.path.exists(PROGRESS_LOG):
    with open(PROGRESS_LOG, 'w') as f:
        f.write("Progress log initialized\n")
    print("Progress log initialized")

In [3]:
LIGAND_DICT

{'amine': {'smiles': '[N-]([H])[H]',
  'coordList': [0],
  'ligType': 'mono',
  'bondType': 'X',
  'charge': -1,
  'functional_inds': [0, 0]},
 'ammonia': {'smiles': '[N]([H])([H])[H]',
  'coordList': [0],
  'ligType': 'mono',
  'bondType': 'L',
  'charge': 0,
  'functional_inds': [0, 0, 0]},
 'halide': {'smiles': '[Br-]',
  'coordList': [0],
  'ligType': 'mono',
  'bondType': 'X',
  'charge': -1},
 'hydride': {'smiles': '[H-1]',
  'coordList': [0],
  'ligType': 'mono',
  'bondType': 'X',
  'charge': -1},
 'hydrogen sulfide': {'smiles': '[S]([H])[H]',
  'coordList': [0],
  'ligType': 'mono',
  'bondType': 'L',
  'charge': 0,
  'functional_inds': [0, 0]},
 'hydroxyl': {'smiles': '[O-][H]',
  'coordList': [0],
  'ligType': 'mono',
  'bondType': 'X',
  'charge': -1,
  'functional_inds': [0]},
 'imido': {'smiles': '[N-2][H]',
  'coordList': [0],
  'ligType': 'mono',
  'bondType': 'X',
  'charge': -2,
  'functional_inds': [0]},
 'methanediide': {'smiles': '[C-2]([H])[H]',
  'coordList': [0]

## Create Dataset

In [4]:
df = pd.read_csv("../2_generate_subset_metadata/a_subset1/selected_design.csv", converters={'Ligand_multiset_variants': eval})
df.columns

Index(['Element', 'Ox', 'CN', 'Geometry', 'Charge', 'Ligand_multiset_variants',
       'MW', 'count_type_halide', 'count_type_methyl',
       'count_type_methanediide', 'count_type_ammonia', 'count_type_amine',
       'count_type_imido', 'count_type_water', 'count_type_hydroxyl',
       'count_type_oxo', 'count_type_phosphine', 'count_type_phosphido',
       'count_type_hydrogen sulfide', 'count_type_thiol', 'count_type_sulfido',
       'count_prop_N', 'count_var_halide_N', 'count_var_methyl_N',
       'count_var_methanediide_N', 'count_var_ammonia_N', 'count_var_amine_N',
       'count_var_imido_N', 'count_var_water_N', 'count_var_hydroxyl_N',
       'count_var_oxo_N', 'count_var_phosphine_N', 'count_var_phosphido_N',
       'count_var_hydrogen sulfide_N', 'count_var_thiol_N',
       'count_var_sulfido_N'],
      dtype='object')

### Strategy for Handling Kernel Crashes

Since the kernel can crash during `build_complex` (likely due to underlying C/Fortran code in xtb), we need to:
1. Save progress incrementally to disk
2. Restart from the last saved checkpoint if the kernel crashes

In [5]:
errors = defaultdict(list)
status = []
# Load checkpoint if exists
checkpoint_data, last_state = load_checkpoint()
if checkpoint_data:
    df_metals = checkpoint_data['metals']
    df_cn = checkpoint_data['cn']
    df_ligands = checkpoint_data['ligands']
    df_ox = checkpoint_data['ox']
    df_chg = checkpoint_data['chg']
    df_multiplicity = checkpoint_data['multiplicity']
    df_mol2 = checkpoint_data['mol2']
    df_geo = checkpoint_data['geo']
    df_energy = checkpoint_data['energy']
    errors = checkpoint_data.get('errors', defaultdict(list))
    print(f"Resuming from checkpoint with {len(df_metals)} existing complexes")
else:
    # Start fresh
    df_metals, df_cn, df_ligands, df_ox, df_chg, df_multiplicity, df_mol2, df_geo, df_energy = [], [], [], [], [], [], [], [], []
    last_state = None

✓ Loaded checkpoint: 110 complexes
Resuming from checkpoint with 110 existing complexes


In [6]:
# Track processed states to avoid duplicates
if os.path.exists(PROGRESS_LOG):
    with open(PROGRESS_LOG, "r") as f:
        # Read from the end and stop at the most recent STARTING line
        for line in reversed(f.readlines()):
            if line.startswith("STARTING: "):
                last_state = line.split("STARTING: ", 1)[1].strip()
                flag_repeat = True
                break
            else:
                flag_repeat = False
    print(f"Progress file found, last state {last_state}")
else:
    print("No progress file found")
    flag_repeat = False

Progress file found, last state Fe_0_1_single_thiol|N


In [8]:
print(len(df))

48089


In [9]:
# Track processed states to avoid duplicates
if os.path.exists(PROGRESS_LOG):
    with open(PROGRESS_LOG, "r") as f:
        # Read from the end and stop at the most recent STARTING line
        for line in reversed(f.readlines()):
            if line.startswith("STARTING: "):
                last_state = line.split("STARTING: ", 1)[1].strip()
                flag_repeat = True
                break
            else:
                flag_repeat = False
else:
    flag_repeat = False

checkpoint_counter = 0
CHECKPOINT_EVERY = 5  # Save every N successful complexes

n_errors = sum(len(v) for v in errors.values())
print(f"Analyzing {len(df)-len(df_metals)-n_errors} of {len(df)} structures ({len(df_metals)+n_errors} have been assessed)")

for i, row in df.iterrows():
    row = row.to_dict()
    metal = row["Element"]
    ox_state = row["Ox"]
    cn = row["CN"]
    coreType = row["Geometry"]
    ligand_labels = row["Ligand_multiset_variants"]
    state_key = make_state_key(metal, ox_state, cn, ligand_labels, coreType)
                    
    # Skip if already processed
    if flag_repeat:
        if state_key == last_state:
            flag_repeat = False
        else:
            continue
#    print(i, state_key, last_state)

    # Log to file in case kernel dies
    with open(PROGRESS_LOG, 'a') as f:
        f.write(f"STARTING: {state_key}\n")
    
    print(f"\nProcessing: {metal=}, {ox_state=}, {cn=}, {coreType=}, ligands={ligand_labels}", flush=True)

    ligand_dicts = ligand_dicts_from_labels(ligand_labels)
    inputDict = {
        "core": {"metal": metal, 'coreCN': cn, "coreType": [coreType]},
        "ligands": ligand_dicts,
        'parameters':{
            "debug": False,
            "metal_ox": ox_state,
            "full_method": "GFN2-xTB",
            'assemble_method':'GFN2-xTB', # Switch to GFN-FF for faster assembly, 
            'n_symmetries': 100, # build 5 structures with different ligand arrangements, duplicates will be filtered
            'n_conformers': 1,
            'return_only_1':True, # Return just one geometry with lowest energy
            'save_init_geos': True,
        },
    }

    try:
        print("  Calling build_complex...", flush=True)
        out = build_complex(inputDict)
        if len(out) > 0:
            print(f"  build_complex completed, {len(out)} options available, taking the first", flush=True)
            out = next(iter(out.values()))
        else:
            raise ValueError("No build_complex output was produced.")
        
        n_el = [out['xtb_n_unpaired_electrons'], out['calc_n_unpaired_electrons']]
        if len(set(n_el)) > 1:
            raise ValueError(f"N_unpaired_el should agree between keys: 'xtb_n_unpaired_electrons', 'calc_n_unpaired_electrons', 'metal_spin', resulting {n_el}")
        chg = [out['total_charge'], out['xtb_total_charge']]
        if len(set(chg)) > 1:
            raise ValueError(f"Charge should agree between keys: 'total_charge', 'xtb_total_charge', resulting {chg}")
        if ox_state != out["metal_ox"]:
            raise ValueError("Metal oxidation state is not equal to assigned.")
        status.append("complete")
    except Exception as e:
        error_msg = f"{type(e).__name__}: {str(e)[:100]}"
        print(f"  ERROR: {error_msg}", flush=True)
        errors[str(e)[:30]].append([metal, ox_state, cn, ligand_labels, coreType, e])
        with open(PROGRESS_LOG, 'a') as f:
            f.write(f"ERROR: {state_key} - {error_msg}\n")
        status.append("error")
    else:
        print("  SUCCESS", flush=True)
        df_metals.append(metal)
        df_ox.append(ox_state)
        df_cn.append(cn)
        df_geo.append(coreType)
        df_ligands.append(ligand_labels)
        df_chg.append(out['total_charge'])
        df_multiplicity.append(out['xtb_n_unpaired_electrons'] + 1)
        df_mol2.append(out['mol2string'])
        df_energy.append(out['energy'])
        
        checkpoint_counter += 1
    
    # Save checkpoint periodically
    if checkpoint_counter >= CHECKPOINT_EVERY:
        data_dict = {
            'metals': df_metals, 'cn': df_cn, 'ligands': df_ligands,
            'ox': df_ox, 'chg': df_chg, 'multiplicity': df_multiplicity,
            'mol2': df_mol2, 'geo': df_geo, 'energy': df_energy,
            'errors': errors
        }
        save_checkpoint(data_dict, state_key)
        checkpoint_counter = 0
                    
        with open(PROGRESS_LOG, 'a') as f:
            f.write(f"SUCCESS: {state_key}\n")


# Final checkpoint
data_dict = {
    'metals': df_metals, 'cn': df_cn, 'ligands': df_ligands,
    'ox': df_ox, 'chg': df_chg, 'multiplicity': df_multiplicity,
    'mol2': df_mol2, 'geo': df_geo, 'energy': df_energy,
    'errors': errors
}
#save_checkpoint(data_dict, "COMPLETED")

df_final = pd.DataFrame({
    "metal": df_metals,
    "oxidation_state": df_ox,
    "coordination_number": df_cn,
    "geometry": df_geo,
    "ligand_names": df_ligands,
    "total_charge": df_chg,
    "multiplicity": df_multiplicity,
    "mol2string": df_mol2,
    "xtb_energy": df_energy,
}) # Create a dataframe

print(f"\nGenerated {len(df_final)} complexes with {len(errors)} error types.")

Analyzing 47514 of 48089 structures (575 have been assessed)

Processing: metal='Fe', ox_state=0, cn=2, coreType='bent_109', ligands=['methyl|N', 'hydrogen sulfide|N']
  Calling build_complex...
  ERROR: ValueError: No build_complex output was produced.

Processing: metal='Fe', ox_state=0, cn=2, coreType='bent_109', ligands=['methyl|N', 'sulfido|N']
  Calling build_complex...
  build_complex completed, 1 options available, taking the first
  SUCCESS

Processing: metal='Fe', ox_state=0, cn=2, coreType='bent_109', ligands=['ammonia|N', 'ammonia|N']
  Calling build_complex...
  ERROR: ValueError: No build_complex output was produced.

Processing: metal='Fe', ox_state=0, cn=2, coreType='bent_109', ligands=['ammonia|N', 'amine|N']
  Calling build_complex...
  ERROR: ValueError: No build_complex output was produced.

Processing: metal='Fe', ox_state=0, cn=2, coreType='bent_109', ligands=['ammonia|N', 'water|N']
  Calling build_complex...
  ERROR: ValueError: No build_complex output was produ

KeyboardInterrupt: 

In [None]:
out

{}

In [None]:
len(df_final)

NameError: name 'df_final' is not defined

In [None]:
errors

NameError: name 'errors' is not defined

In [None]:
len(df_final)

310

In [None]:
df_final.head()

Unnamed: 0,metal,oxidation_state,coordination_number,geometry,ligand_names,total_charge,multiplicity,mol2string,xtb_energy
0,Fe,2,2,bent_109,"[amine, amine]",2,5,@<TRIPOS>MOLECULE\nbent_109_0_nunpairedes_4_ch...,-298.173926
1,Fe,2,2,bent_120,"[amine, amine]",2,5,@<TRIPOS>MOLECULE\nbent_109_0_nunpairedes_4_ch...,-298.174912
2,Fe,2,2,linear,"[amine, amine]",2,5,@<TRIPOS>MOLECULE\nbent_109_0_nunpairedes_4_ch...,-298.173683
3,Fe,2,2,bent_109,"[amine, bromide]",2,5,@<TRIPOS>MOLECULE\nbent_109_0_nunpairedes_4_ch...,-301.972133
4,Fe,2,2,bent_120,"[amine, bromide]",2,5,@<TRIPOS>MOLECULE\nbent_109_0_nunpairedes_4_ch...,-301.971854


In [None]:
# Analyze each column independently for unique values and their counts
columns_to_analyze = ['metal', 'oxidation_state', 'coordination_number', 'geometry', 'ligand_names', 'total_charge', 'multiplicity']

for col in columns_to_analyze:
    print(f"\nColumn: {col}")
    print(f"{'='*60}")
    value_counts = df[col].value_counts().sort_index()
    print(value_counts)
    print(f"Total unique values: {len(value_counts)}")


Column: metal
metal
Fe    310
Name: count, dtype: int64
Total unique values: 1

Column: oxidation_state
oxidation_state
0      1
2    309
Name: count, dtype: int64
Total unique values: 2

Column: coordination_number
coordination_number
2    310
Name: count, dtype: int64
Total unique values: 1

Column: geometry
geometry
bent_109    104
bent_120    103
linear      103
Name: count, dtype: int64
Total unique values: 3

Column: ligand_names
ligand_names
[amine, amine]                              4
[amine, bromide]                            3
[amine, choride]                            3
[amine, dimethylamine]                      3
[amine, fluoride]                           3
                                           ..
[phosphine, thiol]                          3
[phosphine, trimethylphosphine]             3
[thiol, thiol]                              3
[thiol, trimethylphosphine]                 3
[trimethylphosphine, trimethylphosphine]    3
Name: count, Length: 103, dtype: int64
T