#### Some machine learning algorithms (such as SVM) are sensitive to feature scales and outliers, so descriptors are normally preprocessed before developing any QSAR model

# Scaling (normalization)

In [4]:
## min-max scaling
from sklearn.preprocessing import MinMaxScaler

# Toy matrix: four samples, two features on different scales.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

# fit() returns the fitted estimator itself, so it can be bound directly
# and printed afterwards to show its configuration.
scaler = MinMaxScaler().fit(data)
print(scaler)

# Per-feature maximum learned during fitting.
print(scaler.data_max_)

# The fitted data is mapped onto the [0, 1] range feature by feature.
print(scaler.transform(data))

# A new sample is rescaled with the fitted min/max; values beyond the fitted
# range land outside [0, 1] (first feature: 2 -> 1.5).
print(scaler.transform([[2, 2]]))

MinMaxScaler(copy=True, feature_range=(0, 1))
[ 1. 18.]
[[0.   0.  ]
 [0.25 0.25]
 [0.5  0.5 ]
 [1.   1.  ]]
[[1.5 0. ]]


# Standardization

In [5]:
from sklearn.preprocessing import StandardScaler

# Toy matrix: two identical features taking the values 0 and 1.
data = [[0, 0], [0, 0], [1, 1], [1, 1]]

# fit() returns the fitted estimator itself, so it can be bound directly
# and printed afterwards to show its configuration.
scaler = StandardScaler().fit(data)
print(scaler)

# Per-feature mean learned during fitting.
print(scaler.mean_)

# Each feature is centred on its mean and divided by its standard deviation.
print(scaler.transform(data))

# A new sample is standardized with the fitted statistics:
# (2 - 0.5) / 0.5 = 3 for both features.
print(scaler.transform([[2, 2]]))

StandardScaler(copy=True, with_mean=True, with_std=True)
[0.5 0.5]
[[-1. -1.]
 [-1. -1.]
 [ 1.  1.]
 [ 1.  1.]]
[[3. 3.]]


# Pipeline for preprocessing of descriptors

In [6]:
# Import the class directly: the previous `from sklearn import pipeline`
# followed by `pipeline = pipeline.Pipeline(...)` rebound the module name to
# the Pipeline instance, so running this cell a second time raised
# AttributeError. With the class import the cell can be re-run safely.
from sklearn.pipeline import Pipeline

# Preprocessing pipeline applied to the descriptor matrix: the steps run in
# the listed order (min-max scaling first, then standardization).
# NOTE(review): for non-constant features, standardizing after a per-feature
# linear rescaling gives the same result as standardizing alone - confirm
# whether both steps are really wanted.
pipeline = Pipeline([
        ('scaling', MinMaxScaler()),
        ('std_scaler', StandardScaler()),
    ])

In [1]:
from rdkit import Chem
import pandas as pd
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
import os

# Locate the example dataset relative to the current working directory.
currentDirectory = os.getcwd()
d = os.path.join(currentDirectory, "Datasets", "example.csv")
dataset = pd.read_csv(d, index_col = 0)

# Parse every SMILES string of the dataset into an RDKit molecule.
molecules = [Chem.MolFromSmiles(mol) for mol in dataset.SMILES]

# Chem.MolFromSmiles returns None for a SMILES string it cannot parse. A None
# molecule cannot yield meaningful descriptors, so stop here with a message
# that names the offending rows instead of passing None to the calculator.
invalid_rows = [idx for idx, mol in zip(dataset.index, molecules) if mol is None]
if invalid_rows:
    raise ValueError(
        "Could not parse the SMILES of {} molecule(s); offending index labels: {}".format(
            len(invalid_rows), invalid_rows))

# Calculator covering every descriptor name listed in Descriptors.descList.
calculator = MoleculeDescriptors.MolecularDescriptorCalculator([desc[0] for desc in Descriptors.descList])

# Descriptor matrix: one row per molecule (labelled with the dataset index),
# one column per descriptor name reported by the calculator.
X = pd.DataFrame([list(calculator.CalcDescriptors(mol)) for mol in molecules],
                     index=dataset.index,
                     columns=list(calculator.GetDescriptorNames()))

# Hold out 20% of the molecules as a test set; the fixed random_state makes
# the split reproducible.
train_set_X, test_set_X = train_test_split(X, test_size=0.2, random_state=42)

In [11]:
# Modeling (training) set: fit_transform is used, so the preprocessing steps of
# the pipeline are fitted on train_set_X and the same call returns the
# transformed training descriptors.
train_X_prepared = pipeline.fit_transform(train_set_X)

In [12]:
# Test set: only transform is used, so the descriptors are rescaled with the
# parameters already fitted on the training set; the pipeline is not refitted
# here.
test_X_prepared = pipeline.transform(test_set_X)