# Identifying  KNIME RDKit descriptors

In [1]:
# To match the descriptors computed directly with RDKit to those computed with KNIME, we compare the results of both procedures using two molecules.

In [5]:
import pandas as pd
import numpy as np

In [2]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors



In [50]:
# Directory (relative path) that contains the input SDF files
SDF_MIN_DIR = '../../ARCHIVOS/CRISTALES/LIGS_ERK2/FOURCHES_LIGS/3d_minimized/sdf/'

# Read the molecules using RDKit: each SD file is opened with its own supplier
# and only its first record (index 0) is kept.
# NOTE(review): presumably the entry is None when RDKit cannot parse the
# record -- confirm before passing the molecules to the descriptor functions.
csar_1 = Chem.SDMolSupplier(SDF_MIN_DIR + 'CSAR_erk2_1.sdf')[0]
model_53 = Chem.SDMolSupplier(SDF_MIN_DIR + 'erk2_53.sdf')[0]

## WHIM Descriptors

In [52]:
# Names of the RDKit WHIM descriptors, generated instead of spelled out.

# Axial descriptors: for each weighting-scheme suffix (u, m, v, e, p, i, s)
# the eleven prefixes L1..L3, P1..P2, G1..G3, E1..E3, in that order.
whim_axial = [
    prefix + scheme
    for scheme in "umvepis"
    for prefix in ("L1", "L2", "L3", "P1", "P2",
                   "G1", "G2", "G3", "E1", "E2", "E3")
]

# Global descriptors: grouped by kind (T, A, G, K, D, V); every kind is listed
# for all seven schemes except G, which only appears for 'u' and 'm'.
whim_global = [
    kind + scheme
    for kind, schemes in (("T", "umvepis"), ("A", "umvepis"), ("G", "um"),
                          ("K", "umvepis"), ("D", "umvepis"), ("V", "umvepis"))
    for scheme in schemes
]

# Full list of names: axial block first, then the global block.
rdkit_whim_names = whim_axial + whim_global

In [55]:
# Compute the WHIM descriptors of csar_1 with RDKit and store them as a
# single-column frame ('csar_1') whose rows are labelled with the RDKit names.
df_rdk_csar_1 = pd.DataFrame(
    data=rdMolDescriptors.CalcWHIM(csar_1),
    index=rdkit_whim_names,
    columns=['csar_1'],
)

### Comparing KNIME Whim Descriptors against RDKit WHIM

In [175]:
# Read the table produced with KNIME using the "WHIM 3D" node
whim_knime_all = pd.read_csv('./knime/whim_rdki_knime.csv')
# For comparison purposes keep only the first two rows (molecules).
# The descriptors start at the eighth column, so the first seven are dropped.
whim_knime = whim_knime_all.iloc[:, 7:].iloc[[0, 1]]
whim_knime.index = ['csar_1', 'csar_10']

In [176]:
# Names of the WHIM descriptors in KNIME (the column labels of whim_knime)
whim_knime.columns

Index(['Unit Weights.Wlambda1', 'Unit Weights.Wlambda2',
       'Unit Weights.wlambda3', 'Unit Weights.Wnu1', 'Unit Weights.Wnu2',
       'Unit Weights.Wgamma1', 'Unit Weights.Wgamma2', 'Unit Weights.Wgamma3',
       'Unit Weights.Weta1', 'Unit Weights.Weta2', 'Unit Weights.Weta3',
       'Unit Weights.WT', 'Unit Weights.WA', 'Unit Weights.WV',
       'Unit Weights.WK', 'Unit Weights.WG', 'Unit Weights.WD',
       'Atomic Masses.Wlambda1', 'Atomic Masses.Wlambda2',
       'Atomic Masses.wlambda3', 'Atomic Masses.Wnu1', 'Atomic Masses.Wnu2',
       'Atomic Masses.Wgamma1', 'Atomic Masses.Wgamma2',
       'Atomic Masses.Wgamma3', 'Atomic Masses.Weta1', 'Atomic Masses.Weta2',
       'Atomic Masses.Weta3', 'Atomic Masses.WT', 'Atomic Masses.WA',
       'Atomic Masses.WV', 'Atomic Masses.WK', 'Atomic Masses.WG',
       'Atomic Masses.WD', 'Atomic Polarizabilities.Wlambda1',
       'Atomic Polarizabilities.Wlambda2', 'Atomic Polarizabilities.wlambda3',
       'Atomic Polarizabilities.Wnu1', 

#### WHIM KNIME has the following:
- **Weight schemes:**
    - Unit weights = unweighted (u)
    - Atomic masses = m
    - Van der Waals volumes = v
    - Atomic electronegativities = e
    - Atomic polarizabilities = p

It doesn't have the *Electrotopological state indices* or the *Topological I-state* schemes.

**Descriptors:**
Note: the order here is the inverse of the one reported by RDKit (i.e., lambda1 = L3).
    
> - **Axial:**
    1. Lambda: 3, 2, 1 => **L**
    2. Nu: 2, 1        => **P**
    3. Gamma: 3, 2, 1  => **G**
    4. Eta: 3, 2, 1    => **E**
    
> - **Global:** Are the same as RDKit
    - T, A, V, K, G, D
    
Descriptors are sorted first by weighting scheme and then by kind of descriptor.

In [177]:
# Let's convert the KNIME names to the RDKit names.
# The starting point is the set of KNIME column labels.
whim_knime_names = whim_knime.columns
# KNIME weighting-scheme label -> one-letter suffix used in the RDKit names
weight_schemes_whim = {
    'Unit Weights': 'u',
    'Atomic Masses': 'm',
    'VdW Volumes': 'v',
    'Atomic Polarizabilities': 'p',
    'Atomic Electronegativities': 'e'
}
# KNIME descriptor label (first character removed) -> RDKit descriptor prefix.
# lambda, eta and gamma are numbered in the opposite order (e.g. lambda1 -> L3).
# NOTE(review): nu1 is mapped to P1, yet the saved comparison output reports
# KNIME ~0.05 vs. RDKit ~0.6 for P1 in every scheme; 0.05 is about L3/T, so nu1
# may correspond to a third proportion that RDKit does not list -- confirm.
descriptors_whim = {
    'lambda1': 'L3', 'lambda2': 'L2', 'lambda3': 'L1',
    'nu1': 'P1', 'nu2': 'P2',
    'eta1': 'E3', 'eta2': 'E2', 'eta3': 'E1',
    'gamma1': 'G3', 'gamma2': 'G2', 'gamma3': 'G1',
    'T': 'T', 'A': 'A', 'V': 'V', 'K': 'K', 'G': 'G', 'D': 'D'
}


def convert_name(s, d):
    """Translate a KNIME WHIM column name into the matching RDKit name.

    Parameters
    ----------
    s : str
        KNIME weighting-scheme label (the text before the '.'),
        e.g. ``'Unit Weights'``.
    d : str
        KNIME descriptor label (the text after the '.'), e.g. ``'Wlambda1'``.
        Its first character (the 'W'/'w' prefix) is dropped before the lookup.

    Returns
    -------
    str
        RDKit-style name: descriptor prefix followed by the scheme suffix,
        e.g. ``'L3u'``.

    Raises
    ------
    KeyError
        If the descriptor or the weighting scheme is not in the lookup tables.
        The message contains the full, unmodified KNIME label.
    """
    # The descriptor is looked up first, then the scheme.
    descriptor_key = d[1:]
    if descriptor_key not in descriptors_whim:
        raise KeyError('Unknown KNIME WHIM descriptor: %r' % (d,))
    if s not in weight_schemes_whim:
        raise KeyError('Unknown KNIME WHIM weighting scheme: %r' % (s,))
    return descriptors_whim[descriptor_key] + weight_schemes_whim[s]

# Translate every KNIME column label ("<scheme>.<descriptor>") into its
# RDKit-style name by splitting on '.' and passing both parts to convert_name
whim_knime_names_converted = [
    convert_name(*knime_label.split('.'))
    for knime_label in whim_knime_names
]

In [178]:
# list(zip(whim_knime_names_converted, whim_knime_names))

In [165]:
# Frame for the csar_1 molecule, indexed by the converted (RDKit-style) names:
# 'knime_name' keeps the original KNIME label and 'csar_1' holds its value.
# Values that cannot be parsed as numbers become NaN (errors='coerce'), which
# is how null entries are identified.
df_knime_csar_1 = pd.DataFrame(
    data=list(zip(whim_knime_names, whim_knime.loc['csar_1'])),
    index=whim_knime_names_converted,
    columns=['knime_name', 'csar_1'],
).assign(csar_1=lambda frame: pd.to_numeric(frame['csar_1'], errors='coerce'))

#### The following list shows which WHIM descriptors have different values when computed with KNIME and RDKit

In [173]:
# Omit the rows that contain null values
names_ = df_knime_csar_1.dropna().index
# Walk over the remaining descriptors and print those where KNIME and RDKit differ
print('Desc', '\t', 'Knime', '\t\t', 'rdkit', '\t\t', 'Diff')
print('-'*50)
for name in names_:
    knime_value = df_knime_csar_1.loc[name, 'csar_1']
    rdkit_value = df_rdk_csar_1.loc[name, 'csar_1']
    abs_diff = abs(knime_value - rdkit_value)
    # Only differences of at least 0.01 are reported
    # (original note: RDKit returns values with 3 decimal positions)
    if abs_diff >= 0.01:
        print(name, '\t', round(knime_value, 2), '\t\t', round(rdkit_value, 2), '\t\t', round(abs_diff, 3))

Desc 	 Knime 		 rdkit 		 Diff
--------------------------------------------------
P1u 	 0.05 		 0.61 		 0.561
Ku 	 0.21 		 0.42 		 0.212
Du 	 1.3 		 0.43 		 0.865
P1m 	 0.05 		 0.63 		 0.573
Km 	 0.22 		 0.44 		 0.219
Dm 	 1.28 		 0.42 		 0.851
P1p 	 0.05 		 0.58 		 0.535
Kp 	 0.19 		 0.42 		 0.236
Dp 	 1.34 		 0.45 		 0.893
L3v 	 0.63 		 0.64 		 0.015
L2v 	 4.5 		 4.39 		 0.103
L1v 	 7.36 		 7.48 		 0.125
P1v 	 0.05 		 0.6 		 0.548
E2v 	 0.51 		 0.48 		 0.022
E1v 	 0.46 		 0.48 		 0.018
Tv 	 12.48 		 12.52 		 0.037
Av 	 40.54 		 40.52 		 0.02
Vv 	 73.85 		 74.22 		 0.371
Kv 	 0.19 		 0.42 		 0.231
Dv 	 1.33 		 0.44 		 0.885
P1e 	 0.05 		 0.63 		 0.573
Ae 	 38.92 		 38.94 		 0.013
Ve 	 71.87 		 71.9 		 0.022
Ke 	 0.22 		 0.44 		 0.22
De 	 1.28 		 0.43 		 0.851


### D-MOMENTS 

#### KNIME
Descriptors computed using [3D D-Moments](https://hub.knime.com/egonw/extensions/org.openscience.cdk.knime.feature/latest/org.openscience.cdk.knime.nodes.descriptors.distance3d.Distance3dNodeFactory). These descriptors are known as *'Ultra-fast shape recognition'* (**USR**), and were initially proposed by [Ballester and Graham (2007)](https://royalsocietypublishing.org/doi/10.1098/rspa.2007.1823). The intuition is the following:
> - USR is based on the idea that the shape of a molecule is determined by the relative position of its atoms.
- Therefore, molecular alignment (superposition) is not needed.
- The molecule is considered as a set of bound particles (*atoms*) instead of a solid body.
- A dimensionality reduction is applied: the three-dimensional shape information is retained in a set of one-dimensional distributions.
- These distributions are taken from all the atomic distances (euclidean distances) to the four different locations:
    - **Ctd**: *Molecular centroid.*
    - **Cst**: *Closest atom to Ctd*
    - **Fct**: *Farthest atom to Ctd*
    - **Ftf**: *Farthest atom to Fct*
- These locations are independent of molecular orientation and position.
- The mean, standard deviation and skewness of the atom distances from these locations are used, so that molecules with different numbers of atoms can be compared.

In [198]:
# Read the table produced with KNIME using the "3D D-Moments" node
moments_knime_all = pd.read_csv('./knime/3dmoments_rdki_knime.csv')
# For comparison purposes keep only the first two rows (molecules).
# The descriptors start at the eighth column, so the first seven are dropped.
moments_knime = moments_knime_all.iloc[:, 7:].iloc[[0, 1]]
moments_knime.index = ['csar_1', 'csar_10']

These descriptors are the following:

In [213]:
# Show the csar_1 row transposed, i.e. one descriptor per line
moments_knime.loc[['csar_1']].transpose()

Unnamed: 0,csar_1
Ctd-Mean,3.350091
Ctd-Sigma,1.272858
Ctd-Skewness,-0.008724
Cst-Mean,3.446493
Cst-Sigma,1.989122
Cst-Skewness,-0.585295
Fct-Mean,6.098801
Fct-Sigma,7.086076
Fct-Skewness,-0.673957
Ftf-Mean,4.936628


#### RDKIT
RDKit computes the USR descriptors through the `rdMolDescriptors.GetUSR()` function.

In [211]:
# USR values computed by RDKit for csar_1, stored in a single 'csar_1' column
# and labelled with the KNIME D-Moments descriptor names for easy comparison
rdkit_usr = pd.DataFrame(
    data=rdMolDescriptors.GetUSR(csar_1),
    index=moments_knime.columns,
    columns=['csar_1'],
)
rdkit_usr

Unnamed: 0,csar_1
Ctd-Mean,3.350091
Ctd-Sigma,1.103411
Ctd-Skewness,-0.210489
Cst-Mean,3.446493
Cst-Sigma,1.379362
Cst-Skewness,-0.855285
Fct-Mean,6.098801
Fct-Sigma,2.603456
Fct-Skewness,-0.896458
Ftf-Mean,4.936629


As we can see, the means from the KNIME and RDKit implementations are the same, but the sigmas are not. This is because RDKit reports standard deviations instead of variances:

In [217]:
# Square the RDKit values of the rows whose label contains 'Sigma'
# NOTE(review): in the saved outputs these squares are still about 4.5% below
# the KNIME Sigma values by a near-constant factor (~1.045); this could come
# from a different variance normalisation -- TODO confirm.
rdkit_usr.loc[rdkit_usr.index.str.contains('Sigma')].pow(2)

Unnamed: 0,csar_1
Ctd-Sigma,1.217516
Cst-Sigma,1.902638
Fct-Sigma,6.777985
Ftf-Sigma,6.581306
