In [None]:
import numpy as np
import part010_splitting_the_data as sd
from part010_splitting_the_data import Axis

Retrieve the training and test data from `sd` (part 010).

In [None]:
# unpack the four items returned by `sd.main()` (part 010)
trainX, train_y, testX, test_y = sd.main()

First, we compute the column parameters: the mean of each feature column and the absolute maximum of its centered values.

In [None]:
def findFeatureParameters(features):
    r'''
     Computes the centering and scaling parameters of a feature array.
     @syntax (means, absmaxa) = findFeatureParameters(features)
     @param features : np.ndarray = from which to get parameters
     @return tuple of the column means and the column maxima of the
         absolute deviations from those means
     '''
    col_axis = Axis.COLS.value
    # average along the column axis
    means = features.mean(axis=col_axis)
    # distance of every entry from its column mean
    deviations = np.absolute(features - means)
    # largest such distance along the column axis
    absmaxa = np.amax(deviations, axis=col_axis)
    return (means, absmaxa)
# def findFeatureParameters(features)

# show the centering and scaling parameters found from `trainX`
if __name__ == "__main__":
    means, absmaxa = findFeatureParameters(trainX)
    print({'means': means})
    print({'absmaxa': absmaxa})

{'means': array([5.47672012e+02, 3.10334798e+04, 3.66883112e+00, 1.83375558e+00,
       8.34361908e-01, 7.39856681e+03, 6.22472995e+00, 3.86359666e+00,
       7.73835741e-02, 4.85538246e-01])}
{'absmaxa': array([7.46632176e+04, 2.13231023e+07, 7.09305949e+01, 2.54186664e+01,
       1.60425851e+01, 1.07416390e+07, 4.68245000e+01, 2.52164073e+01,
       1.57149043e+00, 8.82286754e-01])}


With these parameters, we can normalize the features.

In [None]:
def normalizeFeatures(unnormal_features, means, absmaxa):
    r'''
     Normalizes the given features, centering to `means` and scaling to
     `absmaxa`, then prepends a column of ones to the result.
     A column whose `absmaxa` is 0 (a constant column) is only centered:
     it is scaled by 1 instead of 0 so that no NaN or inf is produced.
     @syntax normalized = normalizeFeatures(unnormal_features, means, absmaxa)
     @param unnormal_features : np.ndarray = to normalize
     @param means : np.ndarray = to which to center features
     @param absmaxa : np.ndarray = by which to scale features
     @return the given features normalized and 1-padded, with one more
         column than `unnormal_features`
     '''
    # center the data
    centered = (unnormal_features - means)
    # dividing by a 0 maximum would yield NaN (0/0) or inf (x/0), so
    # replace every 0 scale by 1; nonzero scales are left untouched
    scale = np.asarray(absmaxa)
    safe_scale = np.where(scale == 0, 1, scale)
    # scale the data
    scaled = (centered / safe_scale)
    # create a column of 1 padding with as many rows
    num_examples = scaled.shape[0]
    one_pad = np.ones((num_examples, 1))
    # 1-pad each row by placing the column of ones before the features
    padded = np.concatenate((one_pad, scaled), axis=Axis.ROWS.value)
    return padded
# def normalizeFeatures(unnormal_features, means, absmaxa)

# check `normalizeFeatures` by re-measuring the normalized training features
if __name__ == "__main__":
    normtrainX = normalizeFeatures(trainX, means, absmaxa)
    # column 0 is the padding of ones, so its mean should be 1 and its
    # absolute maximum should be 0 (1 - mean = 1 - 1 = 0);
    # every other column was centered and scaled, so its mean should be
    # (approximately) 0 and its absolute maximum should be 1
    normmeans, normabsmaxa = findFeatureParameters(normtrainX)
    print({'means': normmeans})
    print({'absmaxa': normabsmaxa})

{'means': array([ 1.00000000e+00, -1.19944604e-18,  1.16735593e-17,  2.64985076e-17,
        2.33387028e-16, -4.95275683e-17,  1.64279375e-18, -2.08396926e-16,
       -4.29615490e-16, -1.27845822e-16, -8.83806355e-16])}
{'absmaxa': array([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}


Now we can normalize the features of any dataset, including the training and test data.

In [None]:
def main():
    r'''
     Loads the split data from `sd` and passes both feature sets through
     `normalizeFeatures`, using parameters found from the training
     features only.
     @syntax (means, absmaxa, trainX, train_y, testX, test_y) = main()
     @return tuple of the means, the absolute maxima, the normalized
         training features, `train_y`, the normalized test features and
         `test_y` (`train_y` and `test_y` are returned as given by `sd`)
     '''
    # pick up where sd (part 010) left off
    (raw_trainX, train_y, raw_testX, test_y) = sd.main()
    # the parameters come from the training features alone
    (means, absmaxa) = findFeatureParameters(raw_trainX)
    # apply the same centering and scaling to each feature set, in order
    (trainX, testX) = (
        normalizeFeatures(raw, means, absmaxa)
        for raw in (raw_trainX, raw_testX)
    )
    return (means, absmaxa, trainX, train_y, testX, test_y)

# when run as the main module, show the shape of every item returned by `main`
if __name__ == "__main__":
    print([item.shape for item in main()])

[(10,), (10,), (18304, 11), (18304,), (4580, 10), (4580,)]
