# Normalize
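Rescale each variable (column) independently to the range [0, 1] using min-max scaling: $x' = \dfrac{x - x_{\min}}{x_{\max} - x_{\min}}$. Here two variables with very different scales are converted.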

In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler


# Toy dataset: five samples (rows) by two features (columns).
# The features live on very different scales, which is what makes
# the effect of each scaler easy to see.
large_scale_feature = [100, 8, 50, 88, 4]
small_scale_feature = [0.001, 0.05, 0.005, 0.07, 0.1]

# Stack the two 1-D feature lists side by side into a (5, 2) float array.
data = np.column_stack((large_scale_feature, small_scale_feature))
print(data)

# Min-max normalization: learn each column's min and max from `data`,
# then map every column onto [0, 1] with those statistics.
scaler = MinMaxScaler().fit(data)
scaled = scaler.transform(data)
print(scaled)

[[1.0e+02 1.0e-03]
 [8.0e+00 5.0e-02]
 [5.0e+01 5.0e-03]
 [8.8e+01 7.0e-02]
 [4.0e+00 1.0e-01]]
[[1.         0.        ]
 [0.04166667 0.49494949]
 [0.47916667 0.04040404]
 [0.875      0.6969697 ]
 [0.         1.        ]]


# Standardize
Rescale each variable so that its observed values have a mean of 0 and a standard deviation of 1: $z = \dfrac{x - \mu}{\sigma}$.

In [2]:
# Standardization: learn each column's mean and standard deviation from
# `data`, then centre and scale every column with those statistics.
scaler = StandardScaler().fit(data)
scaled = scaler.transform(data)
print(scaled)

[[ 1.26398112 -1.16389967]
 [-1.06174414  0.12639634]
 [ 0.         -1.05856939]
 [ 0.96062565  0.65304778]
 [-1.16286263  1.44302493]]


After standardization, the mean of each column maps to 0.0 (for example, 50 in the first column, which happens to be present in the data), and the values are centered around 0.0, taking both positive and negative values.

# Q&A:
Q: Should I normalize or standardize?
A: If the quantity is (approximately) normally distributed, standardize it; otherwise, normalize it.

# Robust Scaler
- Subtracts the median and scales the data according to a quantile range (defaults to the IQR: interquartile range). The IQR is the range between the 1st quartile (25th percentile) and the 3rd quartile (75th percentile)

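- The resulting variable has a median of zero and an interquartile range of 1 (its mean is not necessarily 0 and its standard deviation is not necessarily 1). Because the median and IQR are not skewed by outliers, the scaling is robust to them; the outliers are still present, with the same relative relationships to the other values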

In [3]:
# Robust scaling: learn each column's median and interquartile range from
# `data`, then subtract the median and divide by the IQR.
scaler = RobustScaler().fit(data)
scaled = scaler.transform(data)
print(scaled)

[[ 0.625      -0.75384615]
 [-0.525       0.        ]
 [ 0.         -0.69230769]
 [ 0.475       0.30769231]
 [-0.575       0.76923077]]


In [4]:
# Median over ALL entries of the scaled array (axis=None flattens it first).
# NOTE(review): a per-feature check would use np.median(scaled, axis=0).
np.median(scaled, axis=None)

0.0