# Identifying  KNIME RDKit descriptors

In [1]:
# In order to match descriptors directly computed from RDKit and from KNIME we are going to compare results from both procedures using two molecules

In [5]:
import pandas as pd
import numpy as np

In [2]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors



In [287]:
# Read the molecules to use rdkit
SDF_MIN_DIR = '../../ARCHIVOS/CRISTALES/LIGS_ERK2/FOURCHES_LIGS/3d_minimized/sdf/'

# Read the molecules using rdkit
csar_1 = Chem.SDMolSupplier(SDF_MIN_DIR + 'CSAR_erk2_1.sdf')[0]
csar_10 = Chem.SDMolSupplier(SDF_MIN_DIR + 'CSAR_erk2_10.sdf')[0]
model_53 = Chem.SDMolSupplier(SDF_MIN_DIR + 'erk2_53.sdf')[0]

## WHIM Descriptors

[RDKit source code](https://github.com/rdkit/rdkit/blob/master/Code/GraphMol/Descriptors/WHIM.cpp)

In [52]:
# List of RDKit descriptors

whim_axial = [ "L1u", "L2u", "L3u", "P1u", "P2u", "G1u", "G2u", "G3u", "E1u", "E2u", "E3u", 
               "L1m", "L2m", "L3m", "P1m", "P2m", "G1m", "G2m", "G3m", "E1m", "E2m", "E3m", 
               "L1v", "L2v", "L3v", "P1v", "P2v", "G1v", "G2v", "G3v", "E1v", "E2v", "E3v", 
               "L1e", "L2e", "L3e", "P1e", "P2e", "G1e", "G2e", "G3e", "E1e", "E2e", "E3e", 
               "L1p", "L2p", "L3p", "P1p", "P2p", "G1p", "G2p", "G3p", "E1p", "E2p", "E3p", 
               "L1i", "L2i", "L3i", "P1i", "P2i", "G1i", "G2i", "G3i", "E1i", "E2i", "E3i", 
               "L1s", "L2s", "L3s", "P1s", "P2s", "G1s", "G2s", "G3s", "E1s", "E2s", "E3s"]
whim_global = ["Tu", "Tm", "Tv", "Te", "Tp", "Ti", "Ts", 
               "Au", "Am", "Av", "Ae", "Ap", "Ai", "As", 
               "Gu", "Gm", 
               "Ku", "Km", "Kv", "Ke", "Kp", "Ki", "Ks", 
               "Du", "Dm", "Dv", "De", "Dp", "Di", "Ds", 
               "Vu", "Vm", "Vv", "Ve", "Vp", "Vi", "Vs"]

rdkit_whim_names = whim_axial + whim_global

In [307]:
# Compute the Whim descriptors using rdkit
# 
df_rdk_whim = pd.DataFrame(rdMolDescriptors.CalcWHIM(csar_1), rdkit_whim_names, columns=['csar_1_RDK'])
df_rdk_whim['csar_10_RDK'] = rdMolDescriptors.CalcWHIM(csar_10)

# Merge both dataframes
df_rdk_whim

Unnamed: 0,csar_1_RDK,csar_10_RDK
L1u,7.625,29.166
L2u,4.167,1.991
L3u,0.649,0.695
P1u,0.613,0.916
P2u,0.335,0.062
...,...,...
Vv,74.223,148.297
Ve,71.895,157.005
Vp,74.450,142.584
Vi,69.902,152.210


### Comparing KNIME Whim Descriptors against RDKit WHIM

In [308]:
# Load the results generated using KNIME and the node WHIM 3D
whim_knime_all = pd.read_csv('./knime/whim_rdki_knime.csv')
# For comparation purposes we'll only keep one molecule
# Descriptors start from the eigth colum
whim_knime = whim_knime_all.iloc[[0, 1], 7:]
whim_knime.index = ['csar_1_KNM', 'csar_10_KNM']

In [309]:
# Name of the whim descriptors in Knime
whim_knime.columns

Index(['Unit Weights.Wlambda1', 'Unit Weights.Wlambda2',
       'Unit Weights.wlambda3', 'Unit Weights.Wnu1', 'Unit Weights.Wnu2',
       'Unit Weights.Wgamma1', 'Unit Weights.Wgamma2', 'Unit Weights.Wgamma3',
       'Unit Weights.Weta1', 'Unit Weights.Weta2', 'Unit Weights.Weta3',
       'Unit Weights.WT', 'Unit Weights.WA', 'Unit Weights.WV',
       'Unit Weights.WK', 'Unit Weights.WG', 'Unit Weights.WD',
       'Atomic Masses.Wlambda1', 'Atomic Masses.Wlambda2',
       'Atomic Masses.wlambda3', 'Atomic Masses.Wnu1', 'Atomic Masses.Wnu2',
       'Atomic Masses.Wgamma1', 'Atomic Masses.Wgamma2',
       'Atomic Masses.Wgamma3', 'Atomic Masses.Weta1', 'Atomic Masses.Weta2',
       'Atomic Masses.Weta3', 'Atomic Masses.WT', 'Atomic Masses.WA',
       'Atomic Masses.WV', 'Atomic Masses.WK', 'Atomic Masses.WG',
       'Atomic Masses.WD', 'Atomic Polarizabilities.Wlambda1',
       'Atomic Polarizabilities.Wlambda2', 'Atomic Polarizabilities.wlambda3',
       'Atomic Polarizabilities.Wnu1', 

#### WHIM KNIME has the following:
- **Weight schemes:**
    1. Unit Weights = unweighted (u)
    - Atomic Masses = m
    - Van der Waals volumes = v
    - Atomic electronegativites = e
    - Atomic polarizabilities = p

It doesn't have *Electropological state indices* and *Topological I-state*

**Descriptors:**
Note: The order here is inverse to the reported by rdkit, i.e., lambda1 = L3)
    
> - **Axial:**
    1. Lambda: 3, 2, 1 => **L**
    2. Nu: 2, 1        => **P**
    3. Gamma: 3, 2, 1  => **G**
    4. Eta: 3, 2, 1    => **E**
    
> - **Global:** Are the same as RDKit
    - T, A, V, K, G, D
    
Descriptors are sorted first by the weighted schema and the by the kind of descriptor.

In [310]:
# Let's convert the Knime names to the RDKit names
whim_knime_names = whim_knime.columns
weight_schemes_whim = {
    'Unit Weights': 'u',
    'Atomic Masses': 'm',
    'VdW Volumes': 'v',
    'Atomic Polarizabilities': 'p',
    'Atomic Electronegativities': 'e'
}
descriptors_whim = {
    'lambda1': 'L3', 'lambda2': 'L2', 'lambda3': 'L1',
    'nu1': 'P1', 'nu2': 'P2',
    'eta1': 'E3', 'eta2': 'E2', 'eta3': 'E1',
    'gamma1': 'G3', 'gamma2': 'G2', 'gamma3': 'G1',
    'T': 'T', 'A': 'A', 'V': 'V', 'K': 'K', 'G': 'G', 'D': 'D'
}
# funtion to convert names
convert_name = lambda s, d: descriptors_whim[d[1:]] + weight_schemes_whim[s]

# Use the dictionaries to create a new list of names
whim_knime_names_converted = [convert_name(*i.split('.')) for i in whim_knime_names]

In [178]:
# list(zip(whim_knime_names_converted, whim_knime_names))

In [317]:
# Let's create a dataframe with the values of the csar_1 molecule
df_knime_csar_1 = pd.DataFrame(list(zip(whim_knime_names, whim_knime.loc['csar_1_KNM'])), 
                             whim_knime_names_converted, columns=['knime_name', 'csar_1_KNM'])
# identify null values
df_knime_csar_1['csar_1_KNM'] = pd.to_numeric(df_knime_csar_1['csar_1_KNM'], errors='coerce')

#### The following list shows which WHIM descriptors have diferente values when computed with KNIME and RDKit

In [323]:
# ommit null values
names_ = df_knime_csar_1.dropna().index
# Iterate over the index and compare KNIME and RDKIT
print('Desc', '\t', 'Knime', '\t\t', 'rdkit', '\t\t', 'Diff')
print('-'*50)
for i in names_:
    # compare
    a = df_knime_csar_1.loc[i, 'csar_1_KNM']
    b = df_rdk_whim.loc[i, 'csar_1_RDK']
    if abs(a - b) >= 0.2: # Rdkit returns values with 3 decimal positions
        print(i, '\t', round(a, 2), '\t\t', round(b, 2), '\t\t', round(abs(a - b), 3))

Desc 	 Knime 		 rdkit 		 Diff
--------------------------------------------------
P1u 	 0.05 		 0.61 		 0.561
Ku 	 0.21 		 0.42 		 0.212
Du 	 1.3 		 0.43 		 0.865
P1m 	 0.05 		 0.63 		 0.573
Km 	 0.22 		 0.44 		 0.219
Dm 	 1.28 		 0.42 		 0.851
P1p 	 0.05 		 0.58 		 0.535
Kp 	 0.19 		 0.42 		 0.236
Dp 	 1.34 		 0.45 		 0.893
P1v 	 0.05 		 0.6 		 0.548
Vv 	 73.85 		 74.22 		 0.371
Kv 	 0.19 		 0.42 		 0.231
Dv 	 1.33 		 0.44 		 0.885
P1e 	 0.05 		 0.63 		 0.573
Ke 	 0.22 		 0.44 		 0.22
De 	 1.28 		 0.43 		 0.851


## Compute WHIM descriptors (unweighted) using numpy

In [549]:
def get_mol_coords(mol, centered = True):
    num_atoms = len(mol.GetAtoms())
    coords = pd.DataFrame({
       'x': [mol.GetConformer().GetAtomPosition(i).x for i in range(num_atoms)],
       'y': [mol.GetConformer().GetAtomPosition(i).y for i in range(num_atoms)],
       'z': [mol.GetConformer().GetAtomPosition(i).z for i in range(num_atoms)]})
    if centered:
        coords = coords - coords.mean(axis =  0)
    return coords

def calc_cov(mol):
    coords = get_mol_coords(mol)
    n, m = coords.shape
    cov = np.dot(coords.T, coords) / (n)
    return cov

def calc_eigen(mol):
    eigen_values, eigen_vectors = np.linalg.eig(calc_cov(mol))
    return eigen_values, eigen_vectors

### Let's view which are the RDKit and KNIME results for the unweighted descriptors:
- Differences, RDK rounds to until the third decimal position. 

In [558]:
csar_1_KNM_whim = df_knime_csar_1[df_knime_csar_1.index.str.contains('u')]['csar_1_KNM']
csar_1_RDK_whim = df_rdk_whim[df_rdk_whim.index.str.contains('u')]['csar_1_RDK']
df_whim = pd.DataFrame([csar_1_RDK_whim, csar_1_KNM_whim])
print('WHIM descriptor values of "csar_1" molecules using KNIME and RDKit')
df_whim

WHIM descriptor values of "csar_1" molecules using KNIME and RDKit


Unnamed: 0,L1u,L2u,L3u,P1u,P2u,G1u,G2u,G3u,E1u,E2u,E3u,Tu,Au,Gu,Ku,Du,Vu
csar_1_RDK,7.625,4.167,0.649,0.613,0.335,0.181,0.181,0.181,0.505,0.438,0.355,12.441,39.425,0.181,0.422,0.433,72.494
csar_1_KNM,7.624702,4.166611,0.649312,0.052193,0.33492,,,,0.50461,0.43817,0.355002,12.440624,39.425401,,0.209666,1.297782,72.494112


### Compute L1u, L2u and L3u (Axial dimensions)
>> $ L_k = \lambda_k$

Where $k_1 = 1, 2, 3$. *L1, L2* and *L3* are the eigen values of the weighted covariance matrix.

In [554]:
def calc_Lu(mol):
    eigen_values, _ = calc_eigen(mol)
    L1, L2, L3 = eigen_values
    return np.array([L1, L2, L3])

In [555]:
mol = csar_1
calc_Lu(mol).round(6)
# The values are the same to RDKit and KNIME

array([7.624702, 4.166611, 0.649312])

In [559]:
df_whim[['L1u', 'L2u', 'L3u']]

Unnamed: 0,L1u,L2u,L3u
csar_1_RDK,7.625,4.167,0.649
csar_1_KNM,7.624702,4.166611,0.649312


### Compute T (global dimension)
>> $T = \lambda_1 + \lambda_2 + \lambda_3$

In [564]:
def calc_Tu(mol):
    L1, L2, L3 =calc_Lu(mol)
    return np.sum([L1, L2, L3])

In [561]:
calc_Tu(mol).round(6)

12.440624

In [563]:
df_whim[['Tu']]
# The values are the same to RDKit and KNIME

Unnamed: 0,Tu
csar_1_RDK,12.441
csar_1_KNM,12.440624


### Compute A (global dimension)
>> $A = \lambda_1\lambda_2  + \lambda_1\lambda_3 + \lambda_2\lambda_3$

In [565]:
def calc_Au(mol):
    L1, L2, L3 = calc_Lu(mol)
    return np.sum([L1*L2, L1*L3, L2*L3])

In [567]:
calc_Au(mol).round(6)

39.425401

In [566]:
df_whim[['Au']]
# The values are the same to RDKit and KNIME

Unnamed: 0,Au
csar_1_RDK,39.425
csar_1_KNM,39.425401


### Compute V (global dimension)
>> $V = \prod^3_{k=1} (1 + \lambda_k) -1 = T + A + \lambda_1*\lambda_2*\lambda_3$

In [788]:
def calc_Vu(mol):
    Lk = calc_Lu(mol)
    # Vu = np.array([(1 + k) for k in [Lk]]).prod() - 1
    Vu = (1 + Lk).prod() - 1
    return Vu

In [789]:
calc_Vu(mol).round(6)

72.494112

In [581]:
df_whim[['Vu']]
# The values are the same to RDKit and KNIME

Unnamed: 0,Vu
csar_1_RDK,72.494
csar_1_KNM,72.494112


### Compute P (Axial shape)
>> $P_k = \nu_k = \frac{\lambda_k}{\sum_k \lambda_k}$,  with $k$ = 1, 2, 3

These descriptors are *directional WHIM shape* and represent the eigenvalue ratios. Accordingly to , only P1 and P2 are taking into account due to P1 + P2 + P3 = 1, which means that only two of these parameters are independent.

In [786]:
def calc_Pu(mol):
    Lk = calc_Lu(mol)
    # Pu = [k / np.sum(Lk) for k in [Lk]][0]
    Pu = Lk / Lk.sum()
    return Pu

In [787]:
calc_Pu(mol)

array([0.6128874 , 0.33491976, 0.05219284])

In [629]:
df_whim[['P1u', 'P2u']]
# The values are NOT the same, RDKit reports only P for the first and second eigenvecors,
# but KNIME seems to repor P for the second and third eigenvalues 

Unnamed: 0,P1u,P2u
csar_1_RDK,0.613,0.335
csar_1_KNM,0.052193,0.33492


### Compute K (global shape)
>> $K = \frac{3}{4} \dot \sum^3_{k=1} | \frac{\lambda_k}{\sum_k \lambda_k} - \frac{1}{3}|$

This descriptor is related to the sphericity  of the molecule. For an ideal spherical molecule K = 0, and each $\lambda$ has the same value (with a ratio of $\frac{1}{3}$). A planar molecule will have a $\lambda = 0$ and K will range between 0.5 and 1. Finally, a ideal straight molecule will have $\lambda_2$ and $\lambda_3$ equal to 0, and K = 1.

In [782]:
def calc_Ku(mol):
    Pu = calc_Pu(mol)
    # Ku = (3/4) * np.sum([np.abs(k - 1/3) for k in Pu])
    Ku = (3/4) * np.abs(Pu - 1/3).sum()
    return Ku

In [781]:
calc_Ku(mol)

0.42171073372202184

In [640]:
df_whim[['Ku']]
# KNIME differs from the result

Unnamed: 0,Ku
csar_1_RDK,0.422
csar_1_KNM,0.209666


In [662]:
# Problably it is because the KNIME implementation ommits P1u
P1u, P2u, P3u = calc_Pu(mol)
(3/4) * np.sum([np.abs(k - 1/3) for k in [P2u, P3u]])

0.21204518510869136

### Compute E (Axial density)
>> $E_k = \eta_k =  \frac{\lambda_k^2 * N}{\sum{t^4}}$,

where $N$ in the number of atoms, and $t$ refers to the atomic coordinates with respect to the principal axes.

This descriptor is related to the kurtosis calculated from the $t_k$ scores (projected coordinates onto the $k$ eigenvector). And represents the quantity of unfilled space per projected atom. Low values of kurtosis are obtained when the data points (atom coordinates projections) assume opposite values with respect to the centre of the scores. When there are more extreme values in $t$ the kurtosis ($\kappa$) value increases. When $\kappa$ tends to infinity $E$ tends to 0.

In [790]:
def calc_Eu(mol):
    coords = get_mol_coords(mol).values
    eigen_vects = calc_eigen(mol)[1]
    t =  np.dot(coords, eigen_vects)
    L = calc_Lu(mol)
    N = len(coords)
    E = (L**2 * N) / (t**4).sum(axis = 0)
    return E

In [791]:
calc_Eu(mol)
# The values are the same in KNIME and RDKit implementations

array([0.50460956, 0.43817022, 0.3550019 ])

In [740]:
df_whim[['E1u', 'E2u', 'E3u']]

Unnamed: 0,E1u,E2u,E3u
csar_1_RDK,0.505,0.438,0.355
csar_1_KNM,0.50461,0.43817,0.355002


### Compute D (Global density)
As compute by KNIME:
>> $D = E_1 + E_2 + E_3 = \eta_1 + \eta_2 + \eta_3$

As compute by RDKit ([Source code](https://github.com/rdkit/rdkit/blob/master/Code/GraphMol/Descriptors/WHIM.cpp)):
>> $D = (E_1 + E_2 + E_3) / 3 = (\eta_1 + \eta_2 + \eta_3) / 3$

**D** is the total density of atoms within a molecule. However, RDKit computes the average of that density. Apparently this is what the software DRAGON 6 does.

In [800]:
def calc_Du(mol, mean = False):
    E = calc_Eu(mol)
    if mean:
        return E.mean()
    else:
        return E.sum()

In [803]:
# KNIME Result
calc_Du(mol, mean = False)

1.2977816747150561

In [802]:
# RDKit result
calc_Du(mol, mean = True)

0.4325938915716854

In [795]:
df_whim[['Du']]

Unnamed: 0,Du
csar_1_RDK,0.433
csar_1_KNM,1.297782


In [804]:
#### G

In [807]:
N = len(coords)
eigen_vects = calc_eigen(mol)[1]
t =  np.dot(coords, eigen_vects)
t.round(3).T # Round to three as a tolerance value

array([[ 4.091,  3.728,  2.416,  1.908,  0.542, -0.573, -1.561, -1.099,
         0.25 , -2.854, -3.515, -3.231, -4.522, -5.272, -4.326,  3.014,
         3.319,  1.797,  0.784,  0.263,  0.766,  1.784,  2.29 ],
       [-1.142,  0.172,  0.197, -1.15 , -1.577, -0.949, -1.514, -2.613,
        -2.688, -0.955, -1.156, -0.27 ,  0.397, -0.   ,  1.905, -1.937,
        -3.379,  1.496,  1.639,  2.9  ,  4.041,  3.923,  2.661],
       [-0.684, -0.567, -0.262, -0.234,  0.078, -0.332,  0.435,  1.123,
         0.898,  0.542,  1.551, -0.588, -0.666, -1.933, -0.584, -0.467,
        -0.523,  0.041,  0.997,  1.312,  0.69 , -0.251, -0.577]])

In [808]:
df_whim[['G1u', 'G2u', 'G3u']]

Unnamed: 0,G1u,G2u,G3u
csar_1_RDK,0.181,0.181,0.181
csar_1_KNM,,,


***
### USR or 3D D-MOMENTS 

Descriptors computed using [3D D-Moments](https://hub.knime.com/egonw/extensions/org.openscience.cdk.knime.feature/latest/org.openscience.cdk.knime.nodes.descriptors.distance3d.Distance3dNodeFactory). These descriptors are known as *'Ultra-fast shape recognition'* (**USR**), and were initially purposed by [Ballester and Graham (2007)](https://royalsocietypublishing.org/doi/10.1098/rspa.2007.1823). The intuition is the following:
> - USR is based on the idea that the shape of a molecule is determined by the relative position of its atoms.
- Therefore, molecular alignment (superposition) is not needed.
- The molecule is considered as a bound of particles (*atoms*) instead of a solid body.
- A dimensionality reduction is applied: The three dimensional shape information is retain by a set of one-dimensional distributions.
- These distributions are taken from all the atomic distances (euclidean distances) to the four different locations:
    - **Ctd**: *Molecular centroid.*
    - **Cst**: *Closest atom to Ctd*
    - **Fct**: *Farthest atom to Ctd*
    - **Ftf**: *Farthest atom to Fct*
- These locations are independent of molecular orientation and position.
- Mean, standard deviation and skewness are used over the atom distances from these locations to let to compare molecules with different number of atoms.

#### KNIME

In [477]:
# Load the results generated using KNIME and the node WHIM 3D
moments_knime_all = pd.read_csv('./knime/3dmoments_rdki_knime.csv')
# For comparation purposes we'll only keep one molecule
# Descriptors start from the eigth colum
moments_knime = moments_knime_all.iloc[[0, 1], 7:]
moments_knime.index = ['csar_1', 'csar_10']
moments_knime = moments_knime.T

These descriptors are the following:

In [499]:
moments_knime.loc[:, ['csar_1']]

Unnamed: 0,csar_1
Ctd-Mean,3.350091
Ctd-Sigma,1.272858
Ctd-Skewness,-0.008724
Cst-Mean,3.446493
Cst-Sigma,1.989122
Cst-Skewness,-0.585295
Fct-Mean,6.098801
Fct-Sigma,7.086076
Fct-Skewness,-0.673957
Ftf-Mean,4.936628


### RDKIT
RDKit computes the USR descriptors through `rdMolDescriptors.GetUSR()` class. Nevertheless, rdkit reports standard deviation and cubic root of skewness instead of variance and skewness. Apparently because this way the 'moments' (descriptors) will have the same units (Angstroms) ([Source](https://github.com/rdkit/rdkit/pull/1417/files)).

Moreover, **variance and skewness are calculated using the sample variance formula which uses the Bessel's correction (n - 1), while RDKit implementation computes the variance without that correction.**

In [509]:
rdkit_usr = pd.DataFrame(rdMolDescriptors.GetUSR(csar_1),
                         columns=['csar_1'], index=moments_knime.index)

In [493]:
# Join two dataframes to compare
df_usr_csar_1 = pd.concat([rdkit_usr, moments_knime['csar_1']], axis=1)
df_usr_csar_1.columns = ['csar_1_RDK', 'csar_1_KNIME']

#### Comparing means

In [494]:
df_usr_csar_1.loc[df_usr_csar_1.index.str.contains('Mean')]

Unnamed: 0,csar_1_RDK,csar_1_KNIME
Ctd-Mean,3.350091,3.350091
Cst-Mean,3.446493,3.446493
Fct-Mean,6.098801,6.098801
Ftf-Mean,4.936629,4.936628


#### Comparing Sigmas

In [502]:
# Let's get the square of these values:
df_usr_csar_1.loc[df_usr_csar_1.index.str.contains('Sigma')]

Unnamed: 0,csar_1_RDK,csar_1_KNIME
Ctd-Sigma,1.103411,1.272858
Cst-Sigma,1.379362,1.989122
Fct-Sigma,2.603456,7.086076
Ftf-Sigma,2.565406,6.880456


RDKit reports standard deviation instead of variances. Let's convert stds from RDKit to variance:

In [506]:
df_usr_csar_1.loc[df_usr_csar_1.index.str.contains('Sigma')][['csar_1_RDK']] ** 2

Unnamed: 0,csar_1_RDK
Ctd-Sigma,1.217516
Cst-Sigma,1.902638
Fct-Sigma,6.777985
Ftf-Sigma,6.581306


However, these still are not the same values. This is due to KNIME computes variance using the sample variance formula:
$$\sigma^2_{KNIME} = \sum_{i=1}^N \frac{(x - \bar{x})^2}{N -1}$$

#### Comparing Skewness

First of all, RDKit computes the cubic root of skewness.

In [507]:
# Let's get the square of these values:
df_usr_csar_1.loc[df_usr_csar_1.index.str.contains('Skewness')]

Unnamed: 0,csar_1_RDK,csar_1_KNIME
Ctd-Skewness,-0.210489,-0.008724
Cst-Skewness,-0.855285,-0.585295
Fct-Skewness,-0.896458,-0.673957
Ftf-Skewness,-0.188228,-0.006238


In [513]:
# Let's convert cubic root of skewness to skewness fot the RDKit values
df_usr_csar_1.loc[df_usr_csar_1.index.str.contains('Skewness')][['csar_1_RDK']] ** 3
# values are now in the same order of magnitude, but are not the same:

Unnamed: 0,csar_1_RDK
Ctd-Skewness,-0.009326
Cst-Skewness,-0.625653
Fct-Skewness,-0.720427
Ftf-Skewness,-0.006669


For some reason KNIME computes skewness using the following formula:

$$ Skewness_{knime} = \frac{\sum^N_{i=1} (x - \bar{x})^3} {N * \sigma^3_{n-1 dof}} $$

### Directly compute the CDT descriptors
#### I was able to find these results by directly computing *Molecular centroid* (Ctd) using numpy:

In [470]:
from scipy.stats import skew, kurtosis

def get_mol_coords(mol, centered = True):
    num_atoms = len(mol.GetAtoms())
    coords = pd.DataFrame({
       'x': [mol.GetConformer().GetAtomPosition(i).x for i in range(num_atoms)],
       'y': [mol.GetConformer().GetAtomPosition(i).y for i in range(num_atoms)],
       'z': [mol.GetConformer().GetAtomPosition(i).z for i in range(num_atoms)]},
            index = [mol.GetAtomWithIdx(i).GetSymbol() for i in range(num_atoms)])
    if centered:
        coords = coords - coords.mean(axis =  0)
    return coords

def get_molecular_centroid(mol):
    coords = get_mol_coords(mol)
    return coords.mean(axis=0)

def get_distances_from_centroid(mol):
    coords = get_mol_coords(mol)
    centroid = get_molecular_centroid(mol)
    distances = np.linalg.norm(coords - centroid, axis = 1)
    return distances

def get_ctd_mean(mol):
    distances = get_distances_from_centroid(mol)
    return distances.mean()

def get_ctd_var(mol, ddof = 0):
    distances = get_distances_from_centroid(mol)
    return distances.var(ddof = ddof)

def get_ctd_skew(mol, ddof_N = 0, ddof_std = 1):
    distances = get_distances_from_centroid(mol)
    N = len(distances)
    u = distances.mean()
    #skew_ = skew(distances, bias = False)
    skew_ = ( np.sum( (distances - u) ** 3 )  ) / ( (N - ddof_N) * (distances.std(ddof = ddof_std) ** 3 ))
    return skew_

In [533]:
# Calculate 'Manually' molecular centroid moments:
# Define the molecule (and rdkit instance)
mol = csar_1

# Compute cdt - mean (like RDKit)
print('Rdkit descriptors:')
print('*'*50)
print('Cdt-mean:\t', get_ctd_mean(mol).round(6))
print('Cdt-sigma:\t', (get_ctd_var(mol, ddof = 1)**0.5).round(6))
print('Cdt-skewness:\t', np.cbrt(get_ctd_skew(mol, ddof_N = 0, ddof_std = 0)).round(6))

print('\n')

# Compute cdt - mean (Like KNIME)
print('KNIME descriptors:')
print('*'*50)
print('Cdt-mean:\t', get_ctd_mean(mol).round(6))
print('Cdt-sigma:\t', (get_ctd_var(mol, ddof = 1)).round(6))
print('Cdt-skewness:\t', get_ctd_skew(mol, ddof_N = 0, ddof_std = 1).round(6))

Rdkit descriptors:
**************************************************
Cdt-mean:	 3.350091
Cdt-sigma:	 1.12821
Cdt-skewness:	 -0.210489


KNIME descriptors:
**************************************************
Cdt-mean:	 3.350091
Cdt-sigma:	 1.272858
Cdt-skewness:	 -0.008724
