# Test of `standardize.py` functions from the `PCA` library

### Test:

- take out samples from original data
- scale data globally
- take out the same samples from scaled data
- uncenter and unscale the samples using global centers and scales
- make sure that the samples are now the same as the ones taken from original data

The test is performed on the *Iris* flower data set.

In [45]:
import pandas as pd
import numpy as np
import PCA.PCA as P
import PCA.post_processing as pp
import PCA.regression.standardize as stand
import PCA.regression.training_data_generation as tdg
from sklearn.datasets import load_iris

# Import data: load the Iris measurements into a DataFrame and keep a
# NumPy array of the same values for the numerical work in later cells.
# (A mid-cell `Xdf.head(10)` was dropped: only the last expression of a
# cell is displayed, so its result was silently discarded.)
Xdf = pd.DataFrame(load_iris().data)
X = Xdf.to_numpy()

# Split the observations into two sample sets (train and test).
# `train_perc` is the percentage of observations placed in the train set.
# NOTE(review): the split is random; check whether
# `train_test_split_random` accepts a seed if reproducibility is needed.
train_perc = 20
(n_obs, n_vars) = np.shape(X)
(idx_train, idx_test) = tdg.train_test_split_random(n_obs, train_perc, verbose=True)


Selected 30 training samples (20%) and 120 test samples (80%).



In [46]:
# Pull the train/test rows out of the original (unscaled) data so they can
# serve as the reference for the round-trip comparison.
X_train, X_test = X[idx_train], X[idx_test]

In [47]:
# Scale the complete data set at once; the returned centers and scales are
# the global ones that are later used to undo the transformation.
Xs, centers, scales = stand.z_score(X)

# Report the centers, the scales and the extreme values of the scaled data,
# one item per line.
print(centers, scales, np.min(Xs), np.max(Xs), sep="\n")

# Preview the first rows of the scaled data.
pd.DataFrame(Xs).head(n=5)

[5.84333333 3.05733333 3.758      1.19933333]
[0.82530129 0.43441097 1.75940407 0.75969263]
-2.43394714190809
3.0907752482994253


Unnamed: 0,0,1,2,3
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [48]:
# Select the very same train/test rows, this time from the globally scaled data.
Xs_train, Xs_test = Xs[idx_train], Xs[idx_test]

In [49]:
# Undo the scaling of both sample sets using the global centers and scales
# (train first, then test).
Xs_train_uu, Xs_test_uu = (
    stand.uncenter_unscale(scaled_subset, centers, scales)
    for scaled_subset in (Xs_train, Xs_test)
)

In [50]:
# Make sure that the uncentered/unscaled samples are now the same as the ones
# taken from the original data.
#
# The error metrics are printed for inspection, and an explicit element-wise
# assertion is added so that the notebook actually fails when the round trip
# does not reproduce the original samples (printing alone never fails).
nrmse_train = pp.nrmse(X_train, Xs_train_uu)
r2_train = pp.r2(X_train, Xs_train_uu)
print(f"NRMSE train: {nrmse_train!s}")
print(f"R2 train: {r2_train!s}")

nrmse_test = pp.nrmse(X_test, Xs_test_uu)
r2_test = pp.r2(X_test, Xs_test_uu)
print(f"NRMSE test: {nrmse_test!s}")
print(f"R2 test: {r2_test!s}")

# Hard check: recovered samples must match the originals up to floating-point
# round-off (default relative tolerance of `assert_allclose`).
np.testing.assert_allclose(Xs_train_uu, X_train)
np.testing.assert_allclose(Xs_test_uu, X_test)

NRMSE train: 8.6705696683714e-18
R2 train: 1.0
NRMSE test: 9.842153398177984e-18
R2 test: 1.0


### Print a few rows to visualise:

In [55]:
# First rows of the train samples after uncentering and unscaling.
pd.DataFrame(Xs_train_uu).head(n=5)

Unnamed: 0,0,1,2,3
0,5.0,3.6,1.4,0.2
1,4.9,3.1,1.5,0.1
2,4.3,3.0,1.1,0.1
3,5.1,3.5,1.4,0.3
4,5.1,3.3,1.7,0.5


In [56]:
# First rows of the train samples taken directly from the original data.
pd.DataFrame(X_train).head(n=5)

Unnamed: 0,0,1,2,3
0,5.0,3.6,1.4,0.2
1,4.9,3.1,1.5,0.1
2,4.3,3.0,1.1,0.1
3,5.1,3.5,1.4,0.3
4,5.1,3.3,1.7,0.5


In [57]:
# First rows of the test samples after uncentering and unscaling.
pd.DataFrame(Xs_test_uu).head(n=5)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.4,3.9,1.7,0.4


In [58]:
# First rows of the test samples taken directly from the original data.
pd.DataFrame(X_test).head(n=5)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.4,3.9,1.7,0.4
