This notebook is a tutorial for generating all molecular descriptors used in the paper *Towards Data-driven Design of Asymmetric Hydrogenation of Olefins: Database and Hierarchical Learning*

# Load dependencies

In [1]:
import numpy as np
import pandas as pd
import glob,os
from ase import Atoms
from gendesc import generate2Ddesc,generate3Ddesc,getusidx,getmorganfp
from mlutils import process_desc,molformatconversion,maxminscale
from dscribe.descriptors import MBTR
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

# Read data file

For demonstration, we operate on only 5 data entries here.

In [2]:
# Load the demo reaction table; the first CSV column becomes the row index.
df = pd.read_csv('./data/demo_data.csv', index_col=0)

# SMILES columns, one array per reaction component (reactant, product, solvent, catalyst).
re_smi, pr_smi, sol_smi, cat_smi = (
    df[column].to_numpy()
    for column in ('Reactant SMILES', 'Product SMILES',
                   'Solvent SMILES', 'Catalyst SMILES(RDKit)')
)

# Numeric columns, each reshaped into an (n_rows, 1) column vector.
press, temp, s_c, tag = (
    df[column].to_numpy().reshape(-1, 1)
    for column in ('Pressure/atm', 'Temperature/C', 'S/C', 'ddG')
)

# Display the table as the cell output.
df

Unnamed: 0,Reactant SMILES,Product SMILES,Solvent SMILES,Additive SMILES,Metal,Ligand SMILES,Catalyst SMILES(RDKit),Axial tag,Pressure/atm,Temperature/C,S/C,ddG,ee
1,CCOC(=O)/C=C(\C)c1ccccc1,CCOC(=O)C[C@@H](C)c1ccccc1,ClCCl,,Ir,Cc1ccccc1P(c1ccccc1C)N1[C@H]2CC[C@H](C2)[C@@H]...,Cc1ccccc1P(c1ccccc1C)8->N1[C@H]2CC[C@H](C2)[C@...,0,49,25,200,2.720953,0.98
2,CON(C)C(=O)/C=C(\C)c1ccccc1,CON(C)C(=O)C[C@@H](C)c1ccccc1,CO,,Ir,CC(C)(C)[C@H]1COC(C2=CCC[C@@]23CCC=C3P(c2ccccc...,CC(C)(C)[C@H]1COC(C2=CCC[C@@]23CCC=C3P(c2ccccc...,0,50,25,100,0.240092,0.2
3,CON(C)C(=O)/C=C(\C)c1ccccc1,CON(C)C(=O)C[C@@H](C)c1ccccc1,ClCCl,,Ir,CC(C)C[C@H]1COC(C2=CCC[C@@]23CCC=C3P(c2ccccc2)...,CC(C)C[C@H]1COC(C2=CCC[C@@]23CCC=C3P(c2ccccc2)...,0,50,25,50,1.369995,0.82
4,CON(C)C(=O)/C=C(\C)c1ccc(C)cc1,CON(C)C(=O)C[C@@H](C)c1ccc(C)cc1,ClCCl,,Ir,C1=C(C2=N[C@@H](Cc3ccccc3)CO2)[C@@]2(CC1)CCC=C...,C1=C(C2=N9->[C@@H](Cc3ccccc3)CO2)[C@@]2(CC1)CC...,0,50,25,50,2.477863,0.97
5,C=C(CO)c1ccc(Cl)cc1,C[C@@H](CO)c1ccc(Cl)cc1,ClCCl,,Ir,C[C@@H]1OC=N[C@@H]1C(Cc1ccccc1)(Cc1ccccc1)OP(C...,C[C@@H]1OC=N9->[C@@H]1C(Cc1ccccc1)(Cc1ccccc1)O...,0,10,-20,200,1.383259,0.88


# Generate 3D geometries

We need to generate 3D geometries for the 3D descriptors. For convenience, we have stored GFN0-xTB-generated geometries in "demo_geoms.csv". We convert them into ".sdf" and ".xyz" files.

In [3]:
# Geometry table: judging by how it is consumed below, each column is keyed by a SMILES string.
demo_smi_geoms_map = pd.read_csv(
    filepath_or_buffer='./data/demo_geoms.csv',
    index_col=0,
)

In [4]:
# For every SMILES column of the geometry table: rebuild the .xyz text, tag its
# comment line (second line) with "<SMILES> <role>", write it to disk and convert
# the .xyz file to .sdf.
geom_dir = './data/geoms_demo/'
os.makedirs(geom_dir, exist_ok=True)  # no-op when the directory already exists

smiles_keys = list(demo_smi_geoms_map.keys())
for idx, tmp_key in enumerate(smiles_keys):
    # SECURITY NOTE(review): eval() executes whatever text is stored in the CSV
    # cell. Only use this with the trusted demo file; if the cells are plain
    # Python literals, ast.literal_eval would be the safer choice -- TODO confirm.
    # .iloc[0] takes the first row by position, independent of the index labels.
    geom_record = eval(demo_smi_geoms_map[tmp_key].iloc[0])
    raw_geom = geom_record[1]
    if isinstance(raw_geom, str):
        # Geometry stored as a single newline-separated string.
        geoms_list = raw_geom.split('\n')
    else:
        # Geometry stored as a sequence of lines; strip surrounding whitespace.
        geoms_list = [tmp_item.strip() for tmp_item in raw_geom]

    # Role of the molecule in the reaction; the first matching role wins.
    if tmp_key in re_smi:
        tmp_type = 'react'
    elif tmp_key in pr_smi:
        tmp_type = 'prod'
    elif tmp_key in sol_smi:
        tmp_type = 'sol'
    elif tmp_key in cat_smi:
        tmp_type = 'cat'
    else:
        # Previously this case reused the role of the preceding molecule
        # (or raised NameError on the first iteration).
        raise ValueError('SMILES %r is not a reactant, product, solvent or '
                         'catalyst of the loaded data' % tmp_key)

    # The second line of an .xyz file is its free-form comment line.
    geoms_list[1] = '%s %s'%(tmp_key,tmp_type)
    geoms_string = '\n'.join(geoms_list)

    xyz_path = '%sgeom_%d.xyz' % (geom_dir, idx)
    sdf_path = '%sgeom_%d.sdf' % (geom_dir, idx)
    with open(xyz_path, 'w') as fw:
        fw.write(geoms_string)
    molformatconversion(xyz_path, sdf_path, 'xyz', 'sdf')

1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted
1 molecules converted


# Generate 2D/3D descriptor map

The *generate2Ddesc* and *generate3Ddesc* modules in *gendesc* can be used to generate 2D descriptors, including **MolecularFingerprint (MF)**, the **200 (or 208) molecular descriptors built into RDKit**, and the **molecular descriptors built into Mordred**, as well as 3D descriptors such as **ACSF**, **MBTR**, **SOAP** and **LMBTR**. In addition, the parameters of these modules can be modified.

In [5]:
### Molecular fingerprint (MF) parameters ###
radius = 4
nBits = 2048

# 2D descriptor generator built from the reaction table.
gen2d = generate2Ddesc(df)

# RDKit descriptor map, then the fingerprint map with chirality enabled.
rdkit_desc_map = gen2d.calc_rdkit_desc()
mf_desc_map = gen2d.calc_mf_desc(
    radius,
    nBits=nBits,
    useChirality=True,
)

In [6]:
# 3D descriptor generator pointed at the directory of geometry files written earlier.
gen3d = generate3Ddesc('./data/geoms_demo/')

### ACSF Parameters ###
rcut = 6.0
g2_params = [[1, 1], [1, 2], [1, 3]]
g4_params = [
    [1, 1, 1],
    [1, 2, 1],
    [1, 1, -1],
    [1, 2, -1],
]
acsf_desc_map = gen3d.calc_acsf_desc(rcut, g2_params, g4_params)

### SOAP Parameters ###
rcut, nmax, lmax = 6.0, 4, 3
soap_desc_map = gen3d.calc_soap_desc(rcut, nmax, lmax)

### LMBTR Parameters ###
# k2 / k3 are rebound further down with the MBTR settings.
k2 = {
    "geometry": {"function": "inverse_distance"},
    "grid": {
        "min": 0,
        "max": 1,
        "n": 10,
        "sigma": 0.1,
    },
    "weighting": {
        "function": "exponential",
        "scale": 0.5,
        "cutoff": 1e-3,
    },
}
k3 = {
    "geometry": {"function": "cosine"},
    "grid": {
        "min": -1,
        "max": 1,
        "n": 10,
        "sigma": 0.1,
    },
    "weighting": {
        "function": "exponential",
        "scale": 0.5,
        "cutoff": 1e-3,
    },
}
lmbtr_desc_map = gen3d.calc_lmbtr_desc(k2, k3)

### MBTR Parameters ###
# NOTE(review): the k1 grid spans 0-8 on the atomic-number axis, while the demo
# data contains heavier elements (e.g. Cl, Ir) -- confirm this range is intended.
k1 = {
    "geometry": {"function": "atomic_number"},
    "grid": {
        "min": 0,
        "max": 8,
        "n": 10,
        "sigma": 0.1,
    },
}
k2 = {
    "geometry": {"function": "inverse_distance"},
    "grid": {
        "min": 0,
        "max": 4,
        "n": 10,
        "sigma": 0.1,
    },
    "weighting": {
        "function": "exponential",
        "scale": 0.5,
        "cutoff": 1e-3,
    },
}
# NOTE(review): a cosine never exceeds 1, yet this grid runs up to 4 (the LMBTR
# k3 grid above stops at 1) -- confirm the upper bound is intended.
k3 = {
    "geometry": {"function": "cosine"},
    "grid": {
        "min": -1,
        "max": 4,
        "n": 10,
        "sigma": 0.1,
    },
    "weighting": {
        "function": "exponential",
        "scale": 0.5,
        "cutoff": 1e-3,
    },
}
mbtr_desc_map = gen3d.calc_mbtr_desc(k1, k2, k3)

Demonstration of a descriptor map:

In [7]:
# Show the first reactant and the leading 50 components of its MBTR vector.
first_reactant_smi = re_smi[0]
first_reactant_mbtr = mbtr_desc_map[first_reactant_smi]
print('SMILES:\n %s' % first_reactant_smi)
print('Top 50 vector of MBTR:')
print(first_reactant_mbtr[:50])

SMILES:
 CCOC(=O)/C=C(\C)c1ccccc1
Top 50 vector of MBTR:
[0.0000000e+00 7.5882059e-01 3.2569974e-04 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 8.5463310e-03 6.4215040e-01 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 4.7834283e-07 1.0844851e-01
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]


# Generate reaction descriptor

In this tutorial, we generate only the MBTR+MF reaction descriptor for demonstration.

In [8]:
def _stack_descriptors(desc_map, smiles_seq):
    """Look up every SMILES of ``smiles_seq`` in ``desc_map`` and stack the results into one array."""
    return np.array([desc_map[tmp_smi] for tmp_smi in smiles_seq])


# Suffix "_1" holds the MBTR descriptor, suffix "_2" the molecular fingerprint (MF).
re_desc_1 = _stack_descriptors(mbtr_desc_map, re_smi)
re_desc_2 = _stack_descriptors(mf_desc_map, re_smi)
pr_desc_1 = _stack_descriptors(mbtr_desc_map, pr_smi)
pr_desc_2 = _stack_descriptors(mf_desc_map, pr_smi)
sol_desc_1 = _stack_descriptors(mbtr_desc_map, sol_smi)
sol_desc_2 = _stack_descriptors(mf_desc_map, sol_smi)
cat_desc_1 = _stack_descriptors(mbtr_desc_map, cat_smi)
cat_desc_2 = _stack_descriptors(mf_desc_map, cat_smi)

# Column-wise layout: reactant, product, solvent, catalyst (MBTR then MF each),
# followed by pressure, temperature and S/C.
desc_parts = [
    re_desc_1, re_desc_2,
    pr_desc_1, pr_desc_2,
    sol_desc_1, sol_desc_2,
    cat_desc_1, cat_desc_2,
    press, temp, s_c,
]
react_desc = np.concatenate(desc_parts, axis=1)

# Per the original notes: "process_desc" drops NaN values and
# "maxminscale" standardizes the descriptor.
react_desc = process_desc(react_desc)
react_desc = maxminscale(react_desc)

In [9]:
# Shape of the final reaction-descriptor array, displayed as the cell output.
react_desc.shape

(5, 1836)