In [1]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from molSimplify.Classes.mol3D import mol3D
from molSimplify.Classes.ligand import ligand_breakdown
from molSimplify.Informatics import autocorrelation
import json

from molSimplify.Informatics.RACassemble import get_rac155_graph_based

This notebook generates the revised autocorrelation (RAC) features for the structures in VSS-452 and CSD-76 and writes them to `vss_racs.csv` and `csd_racs.csv`. Generating these features requires the `xyz` files of the structures, which are available at https://github.com/hjkgrp/dfa_recommender/tree/main/dfa_recommender/data/optgeos; set `basepath` in the feature-generation cell to your local copy of that folder.

In [2]:
#read in the charge and oxidation state information (only needed for CSD-76, included in VSS-452 construction)
csd_chargeox = pd.read_csv('../data/csd76_chargeox.csv').set_index('Unnamed: 0')

#get the charge and oxidation state information for VSS-452
vss452 = pd.read_csv('../data/VSS-452.csv')

#fields after the oxidation state that are skipped when charges are assigned
SKIPPED_FIELDS = ('4', '5', '6')


def ligand_charge(lig):
    """Return the charge encoded in one ligand field of a VSS-452 name.

    A field whose last '-'-separated part is 'd1' is assigned -1; every other
    field is assigned 0. The charge is read from the field itself rather than
    from a table keyed by the text before the first '-', so two fields that
    share that prefix can no longer overwrite each other's charge.
    """
    return -1 if lig.split('-')[-1] == 'd1' else 0


def parse_vss_name(name):
    """Split a VSS-452 structure name into oxidation state and ligand fields.

    Names are '_'-separated. The second field is converted to the integer
    oxidation state; every later field that is not in SKIPPED_FIELDS is
    returned as a ligand field.

    Raises ValueError if the second field is not an integer and IndexError if
    the name has fewer than two fields.
    """
    comps = name.split('_')
    return int(comps[1]), [lig for lig in comps[2:] if lig not in SKIPPED_FIELDS]


#dictionary mapping each ligand field (as written in the names) to its charge
lig_dict = {}
ox_states = []
lig_charges = []

for name in vss452['name']:
    ox_state, ligs = parse_vss_name(name)
    ox_states.append(ox_state)
    charges = [ligand_charge(lig) for lig in ligs]
    lig_dict.update(zip(ligs, charges))
    #builtin sum keeps the total an integer even when a name has no ligand fields
    lig_charges.append(sum(charges))

#one row per structure, in the same order as VSS-452.csv
vss_chargeox = pd.DataFrame(
    {'ox_state': ox_states, 'tot_lig_charge': lig_charges},
    index=pd.Index(vss452['name']),
)

In [3]:
#generate RACs from the xyzs

#this path should correspond to dfa_recommender/dfa_recommender/data/optgeos in the dfa_recommender repo
basepath = '../../vss_data/geometries/'


def build_rac_frame(folder, exclude=None, tolerate_failures=False):
    """Featurize the xyz files in `folder` and collect the RACs in one dataframe.

    Parameters
    ----------
    folder : str
        Directory holding the xyz files. File names are appended by plain
        string concatenation, so it must end with a path separator.
    exclude : str, optional
        Directory entries whose name contains this substring get no row and
        are not featurized.
    tolerate_failures : bool
        If True, an exception raised while storing a structure's descriptors
        is caught and the structure is listed in a summary printed at the end;
        values stored before the exception stay in the frame. If False, the
        exception propagates.

    Returns
    -------
    pandas.DataFrame
        One row per non-hidden directory entry (labelled by the file name up
        to its first '.'), one column per descriptor name returned for the
        first featurized structure. Cells that were never filled stay NaN.
    """
    #list the directory once so rows and featurized files come from the same snapshot
    entries = [entry for entry in os.listdir(folder)
               if entry[0] != '.' and (exclude is None or exclude not in entry)]
    labels = [entry.split('.')[0] for entry in entries]

    frame = None
    failed = []
    for xyz in tqdm([entry for entry in entries if '.xyz' in entry]):
        mol = mol3D()
        mol.readfromxyz(folder + xyz)
        rac = get_rac155_graph_based(mol)
        if frame is None:
            #column labels are taken from the first structure
            #NOTE(review): assumes that structure featurized completely - confirm
            frame = pd.DataFrame(index=labels, columns=rac[0])
        label = xyz.split('.')[0]
        try:
            for i, name in enumerate(rac[0]):
                if name not in frame.columns:
                    #.loc would otherwise add the unknown descriptor as a new column
                    raise KeyError(name)
                #single-step .loc write; chained frame[name][label] = ... is not
                #guaranteed to reach the frame
                frame.loc[label, name] = rac[-1][i]
        except Exception:
            if not tolerate_failures:
                raise
            failed.append(label)

    if frame is None:
        #no xyz file found: still return a frame with the expected rows
        frame = pd.DataFrame(index=labels)
    if failed:
        print('{} structure(s) in {} could not be fully featurized: {}'.format(
            len(failed), folder, failed))
    return frame


#CSD-76: Cr structures are left out; some structures fail due to incompatible denticities
subfolder = 'CSD-76/'
csd_racs = build_rac_frame(basepath + subfolder, exclude='Cr', tolerate_failures=True)

#repeat for VSS-452 (any failure here raises)
subfolder = 'VSS-452/'
vss_racs = build_rac_frame(basepath + subfolder)

  0%|                                                    | 0/77 [00:00<?, ?it/s]
  squared_dist = np.sum((np.array(adjusted_coords) - np.array(plane_coords)) ** 2)
 10%|████▌                                       | 8/77 [00:01<00:13,  5.11it/s]

bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
Complex could not featurize to all rac155 descriptors


 17%|███████▎                                   | 13/77 [00:02<00:12,  5.18it/s]

bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
bad denticities: [4, 2]
min denticities: 2
Complex could not featurize to all rac155 descriptors


 21%|████████▉                                  | 16/77 [00:03<00:09,  6.38it/s]

Complex could not featurize to all rac155 descriptors


 26%|███████████▏                               | 20/77 [00:03<00:08,  6.64it/s]

Complex could not featurize to all rac155 descriptors


 34%|██████████████▌                            | 26/77 [00:05<00:09,  5.16it/s]

Complex could not featurize to all rac155 descriptors


 44%|██████████████████▉                        | 34/77 [00:06<00:08,  4.94it/s]

Complex could not featurize to all rac155 descriptors


 52%|██████████████████████▎                    | 40/77 [00:08<00:07,  4.96it/s]

Complex could not featurize to all rac155 descriptors


 57%|████████████████████████▌                  | 44/77 [00:09<00:06,  4.83it/s]

Complex could not featurize to all rac155 descriptors
Complex could not featurize to all rac155 descriptors


 73%|███████████████████████████████▎           | 56/77 [00:11<00:03,  6.37it/s]

Complex could not featurize to all rac155 descriptors


 87%|█████████████████████████████████████▍     | 67/77 [00:13<00:01,  5.71it/s]

Complex could not featurize to all rac155 descriptors


  squared_dist = np.sum((np.array(adjusted_coords) - np.array(plane_coords)) ** 2)
 97%|█████████████████████████████████████████▉ | 75/77 [00:15<00:00,  4.58it/s]

Complex could not featurize to all rac155 descriptors


 99%|██████████████████████████████████████████▍| 76/77 [00:15<00:00,  5.41it/s]

Complex could not featurize to all rac155 descriptors


100%|███████████████████████████████████████████| 77/77 [00:16<00:00,  4.78it/s]
  0%|                                                   | 0/453 [00:00<?, ?it/s]
100%|█████████████████████████████████████████| 453/453 [01:27<00:00,  5.20it/s]


In [4]:
#CSD-76: put the charge/oxidation-state columns next to the RACs (rows aligned on the index) and save
csd_df = pd.concat((csd_chargeox, csd_racs), axis='columns')
csd_df.to_csv(path_or_buf='csd_racs.csv')

#VSS-452: same layout, written to its own file
vss_df = pd.concat((vss_chargeox, vss_racs), axis='columns')
vss_df.to_csv(path_or_buf='vss_racs.csv')

To train RACs models, see the RACs subfolder of the ML training directory for example scripts.