This notebook is a tutorial for generating all molecular descriptors used in the paper **"Targeting the AI Design of Asymmetric Hydrogenation of Olefins: Database, Machine Learning and Relation Reasoning"**

# Load dependencies

In [1]:
import glob
import os

import numpy as np
import pandas as pd
from ase import Atoms
from dscribe.descriptors import MBTR
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

from gendesc import generate2Ddesc,generate3Ddesc,getusidx,getmorganfp

# read data file

For demonstration, we operate on only five data entries here.

In [2]:
# Read the small demonstration dataset and show it as the cell output.
demo_csv_path = './data/data_demo.csv'
df = pd.read_csv(demo_csv_path)
df

Unnamed: 0,Reactant SMILES,Product SMILES,Solvent SMILES,Additive SMILES,Metal,Ligand SMILES,Catalyst SMILES(RDKit),Pressure/atm,Temperature/C,S/C,ddG,ee,Scaffold type,Multi-scaffold type,Olefin Type
0,CCOC(=O)/C=C(\C)c1ccccc1,CCOC(=O)C[C@@H](C)c1ccccc1,ClCCl,,Ir,Cc1ccccc1P(c1ccccc1C)N1[C@H]2CC[C@H](C2)[C@@H]...,Cc1ccccc1P(c1ccccc1C)8->N1[C@H]2CC[C@H](C2)[C@...,49,25,200,2.720953,0.98,"P,N type","['P,N type']",tri-sub
1,C/C(=C\c1ccccc1)C(=O)Cc1ccccc1,C[C@@H](Cc1ccccc1)C(=O)Cc1ccccc1,ClCCl,,Ir,Cc1ccccc1P(c1ccccc1C)N1[C@H]2CC[C@H](C2)[C@@H]...,Cc1ccccc1P(c1ccccc1C)8->N1[C@H]2CC[C@H](C2)[C@...,49,25,200,2.477863,0.97,"P,N type","['P,N type']",tri-sub
2,CC(=O)NC1=C(C)CCCC1,CC(=O)N[C@H]1CCCC[C@H]1C,CCOC(C)=O,,Rh,CC(C)[C@@H](Oc1cccc2c1P(C(C)(C)C)[C@H]([C@@H]1...,CC(C)[C@@H](Oc1cccc2c1P(C(C)(C)C)8->[C@H]([C@@...,34,25,100,1.881852,0.92,double P type,['double P type'],tetra-sub
3,CC(=O)NC(C)=C(C)C,CC(=O)N[C@@H](C)C(C)C,CO,,Rh,CC(C)[C@@H](Oc1cccc2c1P(C(C)(C)C)[C@H]([C@@H]1...,CC(C)[C@@H](Oc1cccc2c1P(C(C)(C)C)8->[C@H]([C@@...,34,25,100,0.715489,0.54,double P type,['double P type'],tetra-sub
4,COC(=O)/C(=C\c1ccccc1)NC(C)=O,COC(=O)[C@@H](Cc1ccccc1)NC(C)=O,ClCCl,,Rh,Cc1ccccc1OP(Oc1ccccc1C)N(c1ccccc1C)N(c1ccccc1C...,Cc1ccccc1OP(Oc1ccccc1C)8->N(c1ccccc1C)N(c1cccc...,5,25,100,0.083035,0.07,binol type,"['binol type', 'double P type']",tri-sub


# generate 2D/3D descriptor map

The *generate2Ddesc* and *generate3Ddesc* modules in *gendesc* can be used to generate 2D descriptors, including **MolecularFingerprint (MF)**, the **200 molecular descriptors built into RDKit**, and the **molecular descriptors built into Mordred**, as well as 3D descriptors such as **ACSF**, **MBTR**, **SOAP** and **LMBTR**. In addition, the parameters of these modules can be modified.

In [3]:
# Build the 2D descriptor generator from the demo dataframe, then compute the
# RDKit-descriptor map and the molecular-fingerprint (MF) map.
# NOTE(review): mf_desc_map is indexed by SMILES strings later in this notebook,
# so it is presumably dict-like and keyed by SMILES — confirm in gendesc.
gen2d = generate2Ddesc(df)
rdkit_desc_map = gen2d.calc_rdkit_desc()
mf_desc_map = gen2d.calc_mf_desc()

In [4]:
# Build the 3D descriptor generator from the demo geometry directory, then
# compute the ACSF, SOAP, LMBTR and MBTR descriptor maps.
# Only mbtr_desc_map is consumed by the reaction-descriptor cell below; the
# other maps are generated here for demonstration.
# NOTE(review): presumably each map is keyed by SMILES — confirm in gendesc.
gen3d = generate3Ddesc('./data/geoms_demo/')
acsf_desc_map = gen3d.calc_acsf_desc()
soap_desc_map = gen3d.calc_soap_desc()
lmbtr_desc_map = gen3d.calc_lmbtr_desc()
mbtr_desc_map  =gen3d.calc_mbtr_desc()

# generate reaction descriptor

In this tutorial, we generate only the MBTR+MF reaction descriptor for demonstration.

In [5]:
def _stack_descriptors(desc_map, smiles):
    """Look up every SMILES in ``desc_map`` and stack the results into one array."""
    return np.array([desc_map[smi] for smi in smiles])


# SMILES columns for each reaction component.
re_smi, pr_smi, sol_smi, cat_smi = (
    df[col].to_numpy()
    for col in ('Reactant SMILES', 'Product SMILES',
                'Solvent SMILES', 'Catalyst SMILES(RDKit)')
)
# Numeric reaction conditions and the ddG target, each as a column vector.
press, temp, s_c, tag = (
    df[col].to_numpy().reshape(-1, 1)
    for col in ('Pressure/atm', 'Temperature/C', 'S/C', 'ddG')
)

# Suffix _1 holds the MBTR descriptor, suffix _2 the MF descriptor.
re_desc_1 = _stack_descriptors(mbtr_desc_map, re_smi)
re_desc_2 = _stack_descriptors(mf_desc_map, re_smi)
pr_desc_1 = _stack_descriptors(mbtr_desc_map, pr_smi)
pr_desc_2 = _stack_descriptors(mf_desc_map, pr_smi)
sol_desc_1 = _stack_descriptors(mbtr_desc_map, sol_smi)
sol_desc_2 = _stack_descriptors(mf_desc_map, sol_smi)
cat_desc_1 = _stack_descriptors(mbtr_desc_map, cat_smi)
cat_desc_2 = _stack_descriptors(mf_desc_map, cat_smi)

# One row per reaction: component descriptors followed by the conditions.
react_desc = np.concatenate(
    [
        re_desc_1, re_desc_2,
        pr_desc_1, pr_desc_2,
        sol_desc_1, sol_desc_2,
        cat_desc_1, cat_desc_2,
        press, temp, s_c,
    ],
    axis=1,
)

# Keep only the columns selected by getusidx.
# NOTE(review): presumably these are the non-constant columns, which is what
# keeps the min-max division below away from zero — confirm in gendesc.
usidx = getusidx(react_desc)
react_desc = react_desc[:, usidx]

# Column-wise min-max scaling.
col_min = react_desc.min(axis=0)
col_range = react_desc.max(axis=0) - col_min
react_desc = (react_desc - col_min) / col_range

In [6]:
# Inspect the dimensions of the filtered, min-max-scaled reaction-descriptor matrix.
react_desc.shape

(5, 3330)

In addition, **MBTR** and **MF** can be calculated with the original APIs in **dscribe** and **rdkit**. Here, we use the *target substrate set* and the *test set* from the paper for demonstration.
To calculate **MBTR** and **MF**, we first need to define some parameters.

In [7]:
# MBTR term settings. Each term names a geometry function and a discretisation
# grid (min, max, number of points n, broadening sigma); k2 and k3 additionally
# apply an exponential weighting with the given scale and cutoff.
# NOTE(review): the k1/k2/k3 keyword form presumably targets a dscribe release
# that still accepts these arguments — confirm the pinned dscribe version.
k1={
        "geometry": {"function": "atomic_number"},
        "grid": {"min": 0, "max": 8, "n": 50, "sigma": 0.1},
    }
k2={
        "geometry": {"function": "inverse_distance"},
        "grid": {"min": 0, "max": 4, "n": 50, "sigma": 0.1},
        "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-3},
    }
# NOTE(review): cosine values lie in [-1, 1], yet this grid extends to 4 —
# confirm the range is intentional before changing it, since it alters the
# descriptor values.
k3={
        "geometry": {"function": "cosine"},
        "grid": {"min": -1, "max": 4, "n": 50, "sigma": 0.1},
        "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-3},
    }
# Non-periodic MBTR over the listed element set, with "l2_each" normalization.
mbtr = MBTR(
    species=["H","B","C","N","O","F","P","S","Fe","Rh"],
    k1=k1,
    k2=k2,
    k3=k3,
    periodic=False,
    normalization="l2_each",
)

In [8]:
# Load the target-substrate/test-set table, locate the catalyst xtb
# trajectories, and build the lookups that tie geometry folders to SMILES.
df_2 = pd.read_csv('./data/hl_target_substrate_test_set_demo.csv')
# NOTE: glob returns matches in arbitrary (filesystem-dependent) order.
cat_trj_files = glob.glob('./data/hl_cat_geoms_demo/*/xtb.trj')
cat_name_smi_df = pd.read_csv('./data/hl_cat_deom_name_smi_map.csv')
# Convert each column to a list once; the previous comprehension rebuilt both
# lists on every iteration.
cat_smi_ = cat_name_smi_df['SMILES'].to_list()
# Maps the 'file' column (matched against trajectory folder names below) to
# the catalyst SMILES.
cat_fn_smi_map = dict(zip(cat_name_smi_df['file'].to_list(), cat_smi_))
cat_smiles = df_2['Catalyst SMILES(RDKit)'].to_list()

Calculate MBTR descriptors based on 3D structures of molecules

In [9]:
# Build one MBTR vector per catalyst by averaging the descriptor over every
# frame of its xtb trajectory, then drop constant columns and min-max scale.
if not cat_trj_files:
    # Fail early with a clear message instead of an opaque indexing error below.
    raise FileNotFoundError('cat_trj_files is empty: no xtb.trj trajectories were found')

file_fn = []
all_mbtr = []
for tmp_file in cat_trj_files:
    # The parent folder name identifies the catalyst; os.path handles
    # OS-specific separators that a plain split('/') would miss.
    file_fn.append(os.path.basename(os.path.dirname(tmp_file)))
    with open(tmp_file, 'r') as fr:
        lines = fr.readlines()
    # Each frame is: an atom-count line, one further line that is skipped,
    # then atom_num lines of "symbol x y z".
    # Parsed with int/float rather than eval, so file contents are never executed.
    atom_num = int(lines[0].strip())
    frame_len = atom_num + 2
    mol_num = len(lines) // frame_len
    tmp_desc = []
    for frame_idx in range(mol_num):
        atom_lines = lines[frame_len * frame_idx + 2:frame_len * (frame_idx + 1)]
        fields = [atom_line.strip().split() for atom_line in atom_lines]
        syms = [field[0] for field in fields]
        coord = np.array([[float(value) for value in field[1:]] for field in fields])
        tmp_atom = Atoms(symbols=syms, positions=coord)
        tmp_desc.append(mbtr.create(tmp_atom).reshape(-1,))
    # Average over all frames of the trajectory.
    all_mbtr.append(np.mean(tmp_desc, axis=0))
all_mbtr = np.array(all_mbtr)

# Remove columns that never vary, so the min-max division cannot hit zero.
col_min = all_mbtr.min(axis=0)
col_range = all_mbtr.max(axis=0) - col_min
varying_cols = np.where(col_range != 0)[0]
all_mbtr = (all_mbtr[:, varying_cols] - col_min[varying_cols]) / col_range[varying_cols]

file_mbtr_desc_map = dict(zip(file_fn, all_mbtr))

Calculate MF descriptors based on 2D topological structures of molecules

In [10]:
# Parse each catalyst SMILES and compute its Morgan fingerprint.
cat_mols = list(map(Chem.MolFromSmiles, cat_smi_))
# NOTE(review): 6 and 2048 are presumably the radius and bit length expected
# by getmorganfp — confirm in gendesc.
cat_fp = np.array([getmorganfp(mol, 6, 2048) for mol in cat_mols])

# Keep only the bits that differ between catalysts, then min-max scale them.
fp_min = cat_fp.min(axis=0)
fp_range = cat_fp.max(axis=0) - fp_min
varying_bits = np.where(fp_range != 0)[0]
cat_fp = (cat_fp[:, varying_bits] - fp_min[varying_bits]) / fp_range[varying_bits]

Save these descriptors; we will use them in the following tutorials.

In [11]:
# Key both descriptor sets by catalyst SMILES.
cat_smi_mf_map = dict(zip(cat_smi_, cat_fp))
# MBTR vectors are keyed by folder name; translate each key to its SMILES.
cat_smi_mbtr_map = {
    cat_fn_smi_map[folder]: desc
    for folder, desc in file_mbtr_desc_map.items()
}

# One row per catalyst SMILES, one column per descriptor component.
cat_smi_mf_df = pd.DataFrame.from_dict(cat_smi_mf_map).T
cat_smi_mbtr_df = pd.DataFrame.from_dict(cat_smi_mbtr_map).T

# Write both tables to CSV for the later tutorials.
cat_smi_mf_df.to_csv('./data/cat_mf_desc.csv')
cat_smi_mbtr_df.to_csv('./data/cat_mbtr_desc.csv')

In [12]:
# Display the scaled MF descriptor table (rows indexed by catalyst SMILES).
cat_smi_mf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1591,1592,1593,1594,1595,1596,1597,1598,1599,1600
C=Cc1ccc(Np8->2oc3ccc4ccccc4c3c3c(ccc4ccccc43)o2)cc1.[Rh+]8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CC(C)(C)[C@H](Op9->1oc2ccc3ccccc3c2c2c(ccc3ccccc32)o1)P(c1ccccc1)8->c1ccccc1.[Rh+]89,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
CC(C)N(C(C)C)P8->1OC(c2ccccc2)(c2ccccc2)[C@@H]2OC3(CCCC3)O[C@H]2C(c2ccccc2)(c2ccccc2)O1.[Rh+]8,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CC(C)N(C(C)C)p18->oc2c(P(c3ccccc3)c3ccccc3)cc3ccccc3c2c2c(o1)c(P(c1ccccc1)c1ccccc1)cc1ccccc12.[Rh+]8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CC(C)N(CCCN(C(C)C)p8->1oc2ccc3ccccc3c2c2c(ccc3ccccc32)o1)p9->1oc2ccc3ccccc3c2c2c(ccc3ccccc32)o1.[Rh+]89,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C[C@@H](C1=C[C@H]([Fe]C2C=CC=C2)C=C1P(c1ccccc1)8->c1ccccc1)n1nnc(-c2ccccc2F)c1P(c1ccccc1)9->c1ccccc1.[Rh+]89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
C[C@H](C1=C(P(c2ccccc2)8->c2ccccc2)C=C[C@@H]1[Fe]C1C=CC=C1)N(C)P(c1ccccc1)9->c1ccccc1.[Rh+]89,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
C[C@H](C1=C(P(c2ccccc2)8->c2ccccc2)C=C[C@@H]1[Fe]C1C=CC=C1)N(CC(C)(C)C)P(c1ccccc1)9->c1ccccc1.[Rh+]89,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cc1cc(C)cc(P(c2cc(C)cc(C)c2)8->N(C)[C@H](C)C2=C(P(c3ccccc3)9->c3ccccc3)C=C[C@@H]2[Fe]C2C=CC=C2)c1.[Rh+]89,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Display the scaled MBTR descriptor table (rows indexed by catalyst SMILES).
cat_smi_mbtr_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7790,7791,7792,7793,7794,7795,7796,7797,7798,7799
COC(=O)[C@@H](Np8->1oc2ccc3ccccc3c2c2c(ccc3ccccc32)o1)C(C)C.[Rh+]8,0.343847,0.343847,0.343847,0.343847,0.343847,0.343847,0.343847,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CP(c1cc2c(cc1P(C)9->C(C)(C)C)OCO2)8->C(C)(C)C.[Rh+]89,0.824281,0.824281,0.824281,0.824281,0.824281,0.824281,0.824281,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O=C(Nc1cccc(COp8->2oc3ccc4ccccc4c3c3c(ccc4ccccc43)o2)c1)c1ccccc1.[Rh+]8,0.074911,0.074911,0.074911,0.074912,0.074912,0.074911,0.074911,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CN(C)P8->1Oc2cccc3c2C2(CC3)CCc3cccc(c32)O1.[Rh+]8,0.474662,0.474661,0.474661,0.474661,0.474661,0.474661,0.474661,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c1ccc(CNp8->2oc3ccc4ccccc4c3c3c(ccc4ccccc43)o2)n9->c1.[Rh+]89,0.111940,0.111939,0.111939,0.111939,0.111939,0.111939,0.111939,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c1ccc(COc2ccccc2P(CCP(c2ccccc2)9->c2ccccc2OCc2ccccc2)8->c2ccccc2)cc1.[Rh+]89,0.329040,0.329040,0.329040,0.329040,0.329040,0.329040,0.329040,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CP(c1nc2ccccc2nc1P(C)8->C(C)(C)C)9->C(C)(C)C.[Rh+]89,0.787718,0.787718,0.787718,0.787718,0.787718,0.787718,0.787718,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CC1CCCC(C)N1p8->1oc2ccc3ccccc3c2c2c(ccc3ccccc32)o1.[Rh+]8,0.393363,0.393363,0.393363,0.393363,0.393363,0.393363,0.393363,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
COc1ccccc1Np8->1oc2ccc3ccccc3c2c2c(ccc3ccccc32)o1.[Rh+]8,0.125296,0.125296,0.125296,0.125296,0.125296,0.125296,0.125296,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
