In [None]:
""" What? Rescaling-Standardisation-Normalisation-Binarisation

Many ML algorithms make assumptions about your data. 
It is often a very good idea to prepare your data in such way to 
best expose the structure of the problem to the machine learning 
algorithms that you intend to use.
"""

In [74]:
# Import python modules
from pandas import read_csv
from numpy import set_printoptions
from IPython.display import Markdown, display
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer

In [None]:
# Additional cosmetic function
def printInBold(string, c = "blue"):    
    colorstr = "<span style='color:{}'>{}</span>".format(c, '**'+ string + '**' )    
    display(Markdown(colorstr))

In [46]:
# Reading-in the data
filename = './datasetCollections/pima-indians-diabetes.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
dataframe = read_csv(filename, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]

In [66]:
# [1] RESCALING

"""
When your data is comprised of attributes with varying scales
you may want to rescale your inputs. 
"""
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
# summarize transformed data
set_printoptions(precision = 3)

# Comparison
printInBold("Original input data:")
print(X)
printInBold("RESCALED input data:")
print(rescaledX)

<span style='color:blue'>**Original input data:**</span>

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


<span style='color:blue'>**RESCALED input data:**</span>

[[0.353 0.744 0.59  ... 0.501 0.234 0.483]
 [0.059 0.427 0.541 ... 0.396 0.117 0.167]
 [0.471 0.92  0.525 ... 0.347 0.254 0.183]
 ...
 [0.294 0.608 0.59  ... 0.39  0.071 0.15 ]
 [0.059 0.633 0.492 ... 0.449 0.116 0.433]
 [0.059 0.467 0.574 ... 0.453 0.101 0.033]]


In [67]:
# [2] STANDARDISATION

"""
Standardization is a useful technique to transform attributes with a Gaussian distribution 
and differing means and standard deviations to a standard Gaussian distribution 
with a mean of 0 and a standard deviation of 1. 
"""

scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
# summarize transformed data
set_printoptions(precision=3)

# Comparison
printInBold("Original input data:")
print(X)
printInBold("STANDARDISED input data:")
print(rescaledX)

<span style='color:blue'>**Original input data:**</span>

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


<span style='color:blue'>**STANDARDISED input data:**</span>

[[ 0.64   0.848  0.15  ...  0.204  0.468  1.426]
 [-0.845 -1.123 -0.161 ... -0.684 -0.365 -0.191]
 [ 1.234  1.944 -0.264 ... -1.103  0.604 -0.106]
 ...
 [ 0.343  0.003  0.15  ... -0.735 -0.685 -0.276]
 [-0.845  0.16  -0.471 ... -0.24  -0.371  1.171]
 [-0.845 -0.873  0.046 ... -0.202 -0.474 -0.871]]


In [68]:
# [3] NORMALISATION

"""
Normalizing refers to rescaling each observation (row) to have 
a length of 1 (called a unit norm or a vector with the length 
of 1 in linear algebra). This pre-processing method can be useful 
for sparse datasets (lots of zeros) with attributes of varying scales
"""

scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
# summarize transformed data
set_printoptions(precision=3)

# Comparison
printInBold("Original input data:")
print(X)
printInBold("NORMALISED input data:")
print(normalizedX)

<span style='color:blue'>**Original input data:**</span>

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


<span style='color:blue'>**NORMALISED input data:**</span>

[[0.034 0.828 0.403 ... 0.188 0.004 0.28 ]
 [0.008 0.716 0.556 ... 0.224 0.003 0.261]
 [0.04  0.924 0.323 ... 0.118 0.003 0.162]
 ...
 [0.027 0.651 0.388 ... 0.141 0.001 0.161]
 [0.007 0.838 0.399 ... 0.2   0.002 0.313]
 [0.008 0.736 0.554 ... 0.241 0.002 0.182]]


In [72]:
# [4] DIFFERENCE between standardisation and normalisation

rescaledX = # ?scaler.transform(X)
normalizedX = #?scaler.transform(X)

"""
As you can see there is no difference. Are they always the same?
"""

print(rescaledX - normalizedX)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [76]:
# [5] BINARISATION

"""
You can transform your data using a binary threshold. 
All values above the threshold are marked 1 and all equal 
to or below are marked as 0. This is called binarizing your 
data or thresholding your data. It can be useful when you have
probabilities that you want to make crisp values.
"""

binarizer = Binarizer(threshold = 0.0).fit(X)
binaryX = binarizer.transform(X)

# Comparison
printInBold("Original input data:")
print(X)
printInBold("NORMALISED input data:")
print(binaryX)

<span style='color:blue'>**Original input data:**</span>

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


<span style='color:blue'>**NORMALISED input data:**</span>

[[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]
