# Feature selection:
**Selecting molecular descriptors following the Ash & Fourches (2017) procedure** (it is assumed that these steps were made independently for each set of descriptors):
1. **Low variance filter:** Features in the lower variance quartile were discarded.
2. **Correlation filters:** For any pair of descriptors with $|r| > 0.9$ the descriptor with largest mean of $|r|$ was removed.

#### Additionaly they did the following analysis (not necessarily to drop features):
1. **Pearson correlation between each descriptor and pKi values.**  
2. **Paired t-test between active and inactive ligands using each set of descriptors.**

In [6]:
import pandas as pd
import numpy as np
import pickle

### Load the data

In [80]:
file_ = './main_table_of_Fourches_ligs_ERK2.pkl'
with open(file_, 'rb') as f:
    df_erk2_mols = pickle.load(f)
df_erk2_mols = df_erk2_mols.set_index('Name')

# MACC Keys

In [301]:
#  Let's extract the MACCS Keys as a Data frame
s = df_erk2_mols.maccs.map(lambda x: list(map(np.float, x)))
df_maccs_all = pd.DataFrame.from_dict(dict(zip(s.index, s.values))).T
# We have to clarify that MACCS Keys from rdkit includes a dummy key at the begining due to the 0 indexing
# Let's drop it
df_maccs_all = df_maccs_all.drop([0], axis=1)
df_maccs_all.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,157,158,159,160,161,162,163,164,165,166
CSAR_erk2_18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CSAR_erk2_20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CSAR_erk2_17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CSAR_erk2_16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CSAR_erk2_15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


#### Variance Threshold

In [302]:
from sklearn.feature_selection import VarianceThreshold

In [303]:
# How many bits have only zeros
(df_maccs_all.sum().values == 0).sum()
# We can start by droping these 37 features

37

Use the VarianceThreshold class:

In [304]:
sel_var = VarianceThreshold(0)
df_maccs_filt1 = sel_var.fit_transform(df_maccs_all)
df_maccs_filt1.shape

(86, 120)

In [305]:
n = 166
p = df_maccs_all.sum()/n
var = p*(1-p)*n
var.values
q = np.quantile(var.values, 0.25)
df_maccs_filt2 = df_maccs_all.iloc[:, var.values > q]
df_maccs_filt2.shape

(86, 124)

## RDKit 2D Descriptors

In [349]:
x = pd.read_csv('knime/2d_rdki_knime.csv')
x = x.iloc[[0], 6:]
x

Unnamed: 0,SlogP,SMR,LabuteASA,TPSA,AMW,ExactMW,NumLipinskiHBA,NumLipinskiHBD,NumRotatableBonds,NumHBD,...,MQN33,MQN34,MQN35,MQN36,MQN37,MQN38,MQN39,MQN40,MQN41,MQN42
0,3.47962,86.9627,133.654253,72.95,310.357,310.142976,6,1,3,1,...,0,0,2,1,0,0,0,0,0,0


In [334]:
x['NumLipinskiHBA']

0      6
1      9
2      7
3     10
4      7
      ..
81     7
82     7
83     8
84     7
85     9
Name: NumLipinskiHBA, Length: 86, dtype: int64

In [335]:
x['NumHBA']

0     5
1     6
2     7
3     7
4     5
     ..
81    5
82    5
83    6
84    5
85    7
Name: NumHBA, Length: 86, dtype: int64

In [337]:
df_rdkit_all['NumHAcceptors']

CSAR_erk2_18    5.0
CSAR_erk2_20    6.0
CSAR_erk2_17    7.0
CSAR_erk2_16    7.0
CSAR_erk2_15    5.0
               ... 
erk2_05         7.0
erk2_04         9.0
erk2_03         8.0
erk2_02         5.0
erk2_01         5.0
Name: NumHAcceptors, Length: 86, dtype: float64

In [336]:
from rdkit.Chem import Descriptors
# Get the descriptor names but ommit all descriptors related to fragments inside molecules (all of them start with 'fr_')
names_of_all_rdkit_descriptors = [x[0] for x in Descriptors._descList if x[0][:3] != 'fr_']
np.array(names_of_all_rdkit_descriptors)

array(['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex',
       'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt',
       'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons',
       'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge',
       'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2',
       'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n',
       'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n',
       'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1',
       'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10',
       'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
       'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6',
       'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10',
       'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6',
       'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10',
       'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 

In [332]:
s = df_erk2_mols['2d_rdkit'].map(lambda x: x)
df_rdkit_all = pd.DataFrame.from_dict(dict(zip(s.index, s.values))).T
df_rdkit_all.columns = names_of_all_rdkit_descriptors
df_rdkit_all

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR
CSAR_erk2_18,12.715656,-0.475203,12.715656,0.182531,0.492792,393.491,366.275,393.216475,152.0,0.0,...,5.0,4.0,7.0,7.0,0.0,0.0,0.0,3.0,3.37224,113.7864
CSAR_erk2_20,12.732601,-0.584261,12.732601,0.017665,0.346052,443.935,417.727,443.172417,164.0,0.0,...,6.0,5.0,9.0,9.0,0.0,0.0,0.0,3.0,3.07972,120.0882
CSAR_erk2_17,12.673357,-3.485752,12.673357,0.055118,0.318735,464.935,443.767,464.103352,164.0,0.0,...,7.0,5.0,12.0,8.0,0.0,0.0,0.0,3.0,1.77302,116.9029
CSAR_erk2_16,12.773795,-0.577508,12.773795,0.254042,0.319141,468.970,447.802,468.113523,164.0,0.0,...,7.0,4.0,10.0,7.0,0.0,0.0,0.0,4.0,4.40544,125.4344
CSAR_erk2_15,12.672667,-0.471951,12.672667,0.180730,0.505335,379.464,354.264,379.200825,146.0,0.0,...,5.0,4.0,7.0,7.0,0.0,0.0,0.0,3.0,2.98374,109.1914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
erk2_05,12.390456,-0.323510,12.390456,0.047547,0.779551,356.382,336.222,356.148455,136.0,0.0,...,7.0,2.0,8.0,7.0,0.0,0.0,0.0,2.0,2.13258,96.4131
erk2_04,12.554039,-3.563305,12.554039,0.020421,0.629308,434.474,412.298,434.126005,160.0,0.0,...,9.0,2.0,11.0,8.0,0.0,0.0,0.0,2.0,1.53608,109.5229
erk2_03,12.497493,-0.602134,12.497493,0.004419,0.606162,447.879,425.703,447.130946,164.0,0.0,...,8.0,3.0,11.0,4.0,0.0,0.0,0.0,3.0,3.34838,112.8792
erk2_02,14.174431,-0.549608,14.174431,0.002430,0.506804,422.387,406.259,422.107813,156.0,0.0,...,5.0,1.0,8.0,3.0,0.0,0.0,0.0,5.0,4.22340,108.0335


In [262]:
(sel_var.variances_ == 0).sum()

46

In [267]:
# Variance for a Bernoulli random variable


42

In [273]:
x = df_maccs_all.var(ddof=0).values
q = np.quantile(x, 0.25)
(x <= q).sum()

46

In [274]:
(x == 0).sum()

46