In [1]:
# Normalization examples

import numpy as np
import statistics

In [2]:
# min-max normalization
# v_norm = [(v - min_v)/(max_v - min_v)] * (new_max - new_min) + new_min

def min_max(data, new_min, new_max):
    """Rescale ``data`` linearly so that its values span ``[new_min, new_max]``.

    Each value is mapped with
    ``(v - min_v) / (max_v - min_v) * (new_max - new_min) + new_min``.

    Args:
        data: Iterable of numbers. It is copied into a list first, so
            one-shot iterables such as generators are supported.
        new_min: Value the smallest input is mapped to.
        new_max: Value the largest input is mapped to.

    Returns:
        A list with one rescaled value per input, in the original order.
        An empty input yields an empty list. When all inputs are equal the
        input range is zero and every value is mapped to ``float(new_min)``.
    """
    # Materialize once: min(), max() and the final pass all need the values.
    values = list(data)
    if not values:
        return []

    min_v = min(values)
    max_v = max(values)
    span = max_v - min_v

    # A constant feature has no spread; avoid dividing by zero and pin every
    # value to the lower bound of the target range.
    if span == 0:
        return [float(new_min) for _ in values]

    return [((v - min_v) / span) * (new_max - new_min) + new_min for v in values]


In [3]:
# Sample values rescaled into the [0, 1] range with the min-max helper.
data = [
    73600, 15000, 12000, 98000, 73600, 81000,
    14300, 20100, 31200, 25000, 76430, 67900,
]

norm_d_min_max = min_max(data, new_min=0, new_max=1)
norm_d_min_max

[0.7162790697674418,
 0.03488372093023256,
 0.0,
 1.0,
 0.7162790697674418,
 0.8023255813953488,
 0.026744186046511628,
 0.0941860465116279,
 0.22325581395348837,
 0.1511627906976744,
 0.7491860465116279,
 0.65]

In [4]:
# z_score normalization
# v_norm = (v - mean)/std

def z_score(data):
    """Standardize ``data`` as ``(v - mean) / std``.

    ``std`` is computed with ``statistics.stdev``, i.e. the sample standard
    deviation.

    Args:
        data: Iterable of numbers. It is copied into a list first, so
            one-shot iterables such as generators are supported.

    Returns:
        A list with one standardized value per input, in the original order.
        When the standard deviation is zero (all inputs equal) every value is
        mapped to ``0.0``.

    Raises:
        statistics.StatisticsError: Propagated from ``statistics.mean`` /
            ``statistics.stdev`` when there are too few data points.
    """
    # Materialize once: mean(), stdev() and the final pass all need the values.
    values = list(data)
    mean = statistics.mean(values)
    std = statistics.stdev(values)

    # No spread means every value sits exactly on the mean; avoid dividing
    # by zero and report a z-score of 0 for each of them.
    if std == 0:
        return [0.0 for _ in values]

    return [(v - mean) / std for v in values]


In [5]:
# Same sample values, this time standardized with the z-score helper.
data = [
    73600, 15000, 12000, 98000, 73600, 81000,
    14300, 20100, 31200, 25000, 76430, 67900,
]

norm_d_z_score = z_score(data)
norm_d_z_score

[0.7705324699346557,
 -1.065772247108927,
 -1.159781021019008,
 1.5351371644033147,
 0.7705324699346557,
 1.0024207789128556,
 -1.087707627687946,
 -0.9059573314617893,
 -0.5581248679944895,
 -0.7524096674086569,
 0.8592140799898321,
 0.5919157995055018]

In [6]:
# Decimal scale normalization
# v_norm = v / 10^j, where j is the smallest integer such that max(|v_norm|) < 1.

def dec_scale(data):
    """Normalize ``data`` by decimal scaling: ``v / 10**j``.

    ``j`` is the smallest non-negative integer for which the largest absolute
    value, divided by ``10**j``, is strictly below 1. The power of ten is
    derived numerically from the magnitude, so a minus sign or a fractional
    part in the largest value does not inflate the divisor.

    Args:
        data: Iterable of numbers (ints or floats, any sign). It is copied
            into a list first, so one-shot iterables are supported.

    Returns:
        A list with one scaled value per input, in the original order.
        An empty input yields an empty list. Inputs whose magnitudes are all
        already below 1 are returned unscaled (``j == 0``).

    Raises:
        ValueError: If the largest magnitude is infinite, since no power of
            ten can bring it below 1.
    """
    values = list(data)
    if not values:
        return []

    max_abs = abs(max(values, key=abs))
    if max_abs == float("inf"):
        raise ValueError("dec_scale requires finite values")

    # Grow the divisor by powers of ten until it strictly exceeds the largest
    # magnitude; `>=` ensures exact powers of ten (e.g. 100) map below 1.
    div = 1
    while max_abs >= div:
        div *= 10

    return [v / div for v in values]

In [7]:
# Same sample values, normalized by decimal scaling.
data = [
    73600, 15000, 12000, 98000, 73600, 81000,
    14300, 20100, 31200, 25000, 76430, 67900,
]

norm_d_dec_scale = dec_scale(data)
norm_d_dec_scale

[0.736,
 0.15,
 0.12,
 0.98,
 0.736,
 0.81,
 0.143,
 0.201,
 0.312,
 0.25,
 0.7643,
 0.679]

In [9]:
# Other more direct ways

# Use the MinMax scaler from sklearn
from sklearn.preprocessing import MinMaxScaler

# sklearn assumes that we have an array of observations of "n" features. Here we assume 1 feature.
# So we need to transform the data as follows:
# [[73600], 
#  [15000], 
#  [12000], 
#  [98000], 
#  [73600], 
#  [81000], 
#  [14300],
#  [20100], 
#  [31200], 
#  [25000], 
#  [76430], 
#  [67900]]

# Single feature: turn the flat list into one column (one row per observation).
data = [
    73600, 15000, 12000, 98000, 73600, 81000,
    14300, 20100, 31200, 25000, 76430, 67900,
]
data = np.asarray(data).reshape(-1, 1)

# Learn the per-column min/max first, then apply the scaling to the same data.
scaler = MinMaxScaler()
scaler.fit(data)
normalized_data = scaler.transform(data)
print(normalized_data)


[[0.71627907]
 [0.03488372]
 [0.        ]
 [1.        ]
 [0.71627907]
 [0.80232558]
 [0.02674419]
 [0.09418605]
 [0.22325581]
 [0.15116279]
 [0.74918605]
 [0.65      ]]


In [10]:
# If we had more attributes, we would scale all attributes
# e.g. in the following example with 2 attributes:

# One row per observation, one column per attribute.
data = [
    [73600, 1500], [15000, 2500], [12000, 3200],
    [98000, 9800], [73600, 5670], [81000, 4890],
    [14300, 2980], [20100, 5900], [31200, 3650],
    [25000, 6700], [76430, 7160], [67900, 3980],
]

# Learn the per-column min/max first, then apply the scaling to the same data.
scaler = MinMaxScaler()
scaler.fit(data)
normalized_data = scaler.transform(data)
print(normalized_data)

[[0.71627907 0.        ]
 [0.03488372 0.12048193]
 [0.         0.20481928]
 [1.         1.        ]
 [0.71627907 0.50240964]
 [0.80232558 0.40843373]
 [0.02674419 0.17831325]
 [0.09418605 0.53012048]
 [0.22325581 0.25903614]
 [0.15116279 0.62650602]
 [0.74918605 0.68192771]
 [0.65       0.29879518]]
