## Targets from README.md
1. Curate metals dataset to match oxidation state. Use mendeleev to detect spin.
2. Curate ligands dataset to match denticity/charge states.

# Metals

In [2]:
import pandas as pd
import numpy as np
import mendeleev
import architector.io_ptable as io_ptable

In [None]:
# Input pickles; identical to those in ../../ligands_metal_curate.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
metal_pkl = 'metal_sample_dataframe.pkl'
ligand_pkl = 'fixed_full_ligand_sample.pkl'
metal_df, ligands_df = pd.read_pickle(metal_pkl), pd.read_pickle(ligand_pkl)

In [4]:
# Display the loaded metal table (columns: metal, ox, coreCNs, coreCN_counts_CSD).
metal_df

Unnamed: 0,metal,ox,coreCNs,coreCN_counts_CSD
0,La,3,"[9, 8, 10, 6, 7, 5, 11, 4, 12]","[121, 75, 43, 39, 39, 14, 5, 5, 1]"
1,Ce,3,"[9, 8, 7, 6, 10, 4, 5]","[64, 52, 30, 28, 15, 6, 5]"
2,Ce,4,"[8, 9, 6, 7, 10, 4, 5]","[108, 69, 46, 35, 16, 9, 7]"
3,Pr,3,"[9, 8, 7, 6, 10, 5, 12, 4, 11]","[70, 33, 16, 14, 8, 2, 1, 1, 1]"
4,Nd,3,"[9, 8, 7, 6, 10, 5, 4, 12]","[122, 81, 64, 46, 16, 11, 7, 2]"
...,...,...,...,...
86,Pt,4,"[6, 4, 5]","[1041, 72, 11]"
87,Au,1,"[4, 2, 3, 5]","[83, 32, 3, 1]"
88,Au,3,"[4, 5, 2, 6]","[1096, 26, 3, 2]"
89,Hg,1,"[4, 5, 6, 8, 7, 3, 2]","[650, 170, 96, 24, 20, 12, 11]"


In [5]:
def _count_fractions(counts):
    """Return each count divided by the sum of all counts, rounded to 3 decimals."""
    counts_arr = np.array(counts)
    return np.round(counts_arr / counts_arr.sum(), 3)


# One fraction array per metal row, aligned element-wise with that row's 'coreCNs'.
all_fracts = [_count_fractions(counts) for counts in metal_df['coreCN_counts_CSD']]
metal_df['coreCN_fracts'] = all_fracts

In [6]:
# Get rid of CNs found in less than 1 % of cases in CSD # -> Added
def _keep_common_cns(record):
    """Return a copy of `record` restricted to CNs whose fraction exceeds 0.01.

    The three per-CN columns are filtered with the same index set so they stay
    aligned, and 'total_count' is set to the sum of the counts that remain.
    """
    keep = np.where(record['coreCN_fracts'] > 0.01)[0]
    trimmed = record.copy()
    for column in ('coreCNs', 'coreCN_counts_CSD', 'coreCN_fracts'):
        trimmed[column] = np.array(record[column])[keep]
    trimmed['total_count'] = np.sum(trimmed['coreCN_counts_CSD'])
    return trimmed


# Rows whose fraction array is empty are left out of the result entirely.
newrows = [
    _keep_common_cns(record)
    for _, record in metal_df.iterrows()
    if record['coreCN_fracts'].shape[0] > 0
]
# Both names refer to the same frame; its index is renumbered from 0 in place.
gen_metal_df = subset_cn_metal_df = pd.DataFrame(newrows)
gen_metal_df.reset_index(drop=True, inplace=True)

In [7]:
# Add Pm with Sm numbers (neighbors of Sm and Nd) - both Sm and Nd have similar values anyway.
refrow = gen_metal_df[gen_metal_df.metal == 'Sm'].iloc[0]

# Skip the insert when a Pm(III) row already exists, so re-running this cell
# (or loading a dataset that already contains Pm) never produces a duplicate.
pm_present = ((gen_metal_df['metal'] == 'Pm') & (gen_metal_df['ox'] == 3)).any()
if not pm_present:
    # Append after the last existing label rather than hard-coding 90, which
    # would silently overwrite a real row if the table ever had more than 90 rows.
    pm_label = gen_metal_df.index.max() + 1
    gen_metal_df.loc[pm_label] = {'metal': 'Pm',
                                  'ox': 3,
                                  # Copy the arrays so the Pm and Sm rows do not share objects.
                                  'coreCNs': np.array(refrow['coreCNs']),
                                  'coreCN_counts_CSD': np.array(refrow['coreCN_counts_CSD']),
                                  'coreCN_fracts': np.array(refrow['coreCN_fracts']),
                                  'total_count': refrow['total_count']
                                  }

In [8]:
def _unpaired_electrons(symbol, ox):
    """Number of unpaired electrons mendeleev reports for `symbol` after ionizing its configuration `ox` times."""
    ionized_config = mendeleev.element(symbol).ec.ionize(ox)
    return ionized_config.unpaired_electrons()


# One record per metal/oxidation-state row; 'uhf' holds the unpaired-electron count.
rxn_m_swaps = [
    {
        'metal': record['metal'],
        'ox': record['ox'],
        'uhf': _unpaired_electrons(record['metal'], record['ox']),
    }
    for _, record in gen_metal_df.iterrows()
]
rxn_mswap_df = pd.DataFrame(rxn_m_swaps)

In [9]:
# Write the metal / oxidation state / uhf table as CSV into the ../2_inspect_rxns directory.
m_swap_csv = '../2_inspect_rxns/rxn_m_swap_df.csv'
rxn_mswap_df.to_csv(m_swap_csv)

# Ligands

In [10]:
# Ensure smaller, more frequent ligands for rxn complexes:
# frequency above 30, fewer than 50 atoms, and denticity below 3.
is_frequent = ligands_df['frequency'] > 30
is_small = ligands_df['natoms'] < 50
is_low_denticity = ligands_df['denticity'] < 3
f_ligand_df = ligands_df.loc[is_frequent & is_small & is_low_denticity]
f_ligand_df.shape

(233, 14)

In [11]:
# Display the filtered ligand table produced by the frequency / natoms / denticity cut.
f_ligand_df

Unnamed: 0,uid,smiles,coordList,coord_atom_symols,coord_atom_types,non_coord_atom_symbols,non_coord_atom_types,charge,denticity,metal_ox_bound,frequency,selected_coord_type,selected_non_coord_type,natoms
0,[Te-]c1ccccc10,[Te-]c1ccccc1,[0],Te,Te,"C,C,C,C,C,C,H,H,H,H,H","C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,H,H,H,H,H",-1,1,"Cu,1",59,Te,,12
1,C[Si](C)(C)[Si]([Te-])([Si](C)(C)C)[Si](C)(C)C5,C[Si](C)(C)[Si]([Te-])([Si](C)(C)C)[Si](C)(C)C,[5],Te,Te,"C,Si,C,C,Si,Si,C,C,C,Si,C,C,C,H,H,H,H,H,H,H,H,...","C.3,Si,C.3,C.3,Si,Si,C.3,C.3,C.3,Si,C.3,C.3,C....",-1,1,"Mg,2",39,Te,,41
10,c1ccncc13,c1ccncc1,[3],N,N.ar,"C,C,C,C,C,H,H,H,H,H","C.ar,C.ar,C.ar,C.ar,C.ar,H,H,H,H,H",0,1,"Cu,1",4246,N.ar,,11
11,"c1cnc2c(c1)ccc1cccnc212,12",c1cnc2c(c1)ccc1cccnc21,"[2, 12]","N,N","N.ar,N.ar","C,C,C,C,C,C,C,C,C,C,C,C,H,H,H,H,H,H,H,H","C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C...",0,2,"Cu,1",2833,N.ar,,22
12,"c1ccc(nc1)c1ccccn14,11",c1ccc(nc1)c1ccccn1,"[4, 11]","N,N","N.ar,N.ar","C,C,C,C,C,C,C,C,C,C,H,H,H,H,H,H,H,H","C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C...",0,2,"Ce,3",2541,N.ar,,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680,Fc1cc2C(=O)C(=CN(C3CC3)c2cc1N1CC[NH2+]CC1)C(=O...,Fc1cc2C(=O)C(=CN(C3CC3)c2cc1N1CC[NH2+]CC1)C(=O...,"[5, 23]","O,O","O.2,O.co2","F,C,C,C,C,C,C,N,C,C,C,C,C,C,N,C,C,N,C,C,C,O,H,...","F,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,N.ar,C.3,C.3,C...",0,2,"Cu,2",31,,N.4,42
692,c1ccc2[c-]cccc2c14,c1ccc2[c-]cccc2c1,[4],C,C.2,"C,C,C,C,C,C,C,C,C,H,H,H,H,H,H,H","C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,H...",-1,1,"Ni,2",42,,H,17
702,"c1ccc([c-]c1)c1ccc2ccccc2n14,15",c1ccc([c-]c1)c1ccc2ccccc2n1,"[4, 15]","C,N","C.2,N.ar","C,C,C,C,C,C,C,C,C,C,C,C,C,C,H,H,H,H,H,H,H,H,H,H","C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C.ar,C...",-1,2,"Ir,3",77,,C.ar,26
710,COP(OC)OC2,COP(OC)OC,[2],P,P.3,"C,O,O,C,O,C,H,H,H,H,H,H,H,H,H","C.3,O.3,O.3,C.3,O.3,C.3,H,H,H,H,H,H,H,H,H",0,1,"Rh,3",112,,O.3,16


In [12]:
# Pickle the filtered ligand table into the ../2_inspect_rxns directory.
rxn_ligands_pkl = '../2_inspect_rxns/rxn_ligands.pkl'
f_ligand_df.to_pickle(rxn_ligands_pkl)