# <span style="color: steelblue;">Normalization using scaLR</span>

Key points

1. This notebook is a tutorial on using the normalization modules of the scaLR library.
2. The results are also compared against standard libraries such as sklearn (standard scaling) and scanpy (sample normalization).
3. The scaLR modules are built to handle very large datasets (say, lakhs — hundreds of thousands — of samples) under tight resource constraints, which standard libraries cannot process all at once.

# <span style="color: steelblue;">Normalization</span>

### <span style="color: steelblue;">Imports</span>

In [1]:
import sys
sys.path.append('/path/to/scaLR')

In [2]:
import pandas as pd
import numpy as np
import anndata

# scalr library normalization modules.
from scalr.data.preprocess import standard_scale, sample_norm
from scalr.utils.file_utils import read_data, write_data, write_chunkwise_data

# Scanpy library for sample-norm
import scanpy as sc
# Sklearn library for standard scaler object
from sklearn.preprocessing import StandardScaler
from os import path

%reload_ext autoreload
%autoreload 2

### <span style="color: steelblue;">Data generation</span>

In [3]:
# Fix NumPy's global random state so the randomly generated data is reproducible.
SEED = 0
np.random.seed(SEED)

In [4]:
# The pipeline normalization functions require an AnnData object, so wrap a
# random 100 x 50 matrix in one.
train_matrix = np.random.rand(100, 50)
train_adata = anndata.AnnData(X=train_matrix)

# Attach one random 'dummy' column as the obs table (one value per row of X).
n_train_samples = train_adata.shape[0]
train_obs = pd.DataFrame(np.random.rand(n_train_samples), columns=['dummy'])
train_adata.obs = train_obs
train_adata

AnnData object with n_obs × n_vars = 100 × 50
    obs: 'dummy'

In [None]:
# Write the training AnnData to a single file with scalr's `write_data`.
train_h5ad_path = './train.h5ad'
write_data(train_adata, train_h5ad_path)

# Re-write that file chunk-wise into ./data/train with a sample chunk size of 10.
write_chunkwise_data(
    datapath=train_h5ad_path,
    sample_chunksize=10,
    dirpath='./data/train',
)

In [7]:
# The pipeline normalization functions require an AnnData object, so wrap a
# random 20 x 50 matrix in one.
val_matrix = np.random.rand(20, 50)
val_adata = anndata.AnnData(X=val_matrix)

# Attach one random 'dummy' column as the obs table (one value per row of X).
n_val_samples = val_adata.shape[0]
val_obs = pd.DataFrame(np.random.rand(n_val_samples), columns=['dummy'])
val_adata.obs = val_obs
val_adata

AnnData object with n_obs × n_vars = 20 × 50
    obs: 'dummy'

In [None]:
# Write the validation AnnData to a single file with scalr's `write_data`.
val_h5ad_path = './val.h5ad'
write_data(val_adata, val_h5ad_path)

# Re-write that file chunk-wise into ./data/val with a sample chunk size of 10.
write_chunkwise_data(
    datapath=val_h5ad_path,
    sample_chunksize=10,
    dirpath='./data/val',
)

In [9]:
# The pipeline normalization functions require an AnnData object, so wrap a
# random 20 x 50 matrix in one.
test_matrix = np.random.rand(20, 50)
test_adata = anndata.AnnData(X=test_matrix)

# Attach one random 'dummy' column as the obs table (one value per row of X).
n_test_samples = test_adata.shape[0]
test_obs = pd.DataFrame(np.random.rand(n_test_samples), columns=['dummy'])
test_adata.obs = test_obs
test_adata

AnnData object with n_obs × n_vars = 20 × 50
    obs: 'dummy'

In [None]:
# Write the test AnnData to a single file with scalr's `write_data`.
test_h5ad_path = './test.h5ad'
write_data(test_adata, test_h5ad_path)

# Re-write that file chunk-wise into ./data/test with a sample chunk size of 10.
write_chunkwise_data(
    datapath=test_h5ad_path,
    sample_chunksize=10,
    dirpath='./data/test',
)

In [11]:
# Preview the first 10 rows and first 10 columns of the training matrix.
train_adata.X[:10, :10]

array([[0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ,
        0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [0.57019677, 0.43860151, 0.98837384, 0.10204481, 0.20887676,
        0.16130952, 0.65310833, 0.2532916 , 0.46631077, 0.24442559],
       [0.67781654, 0.27000797, 0.73519402, 0.96218855, 0.24875314,
        0.57615733, 0.59204193, 0.57225191, 0.22308163, 0.95274901],
       [0.1494483 , 0.86812606, 0.16249293, 0.61555956, 0.12381998,
        0.84800823, 0.80731896, 0.56910074, 0.4071833 , 0.069167  ],
       [0.31179588, 0.69634349, 0.37775184, 0.17960368, 0.02467873,
        0.06724963, 0.67939277, 0.45369684, 0.53657921, 0.89667129],
       [0.35561274, 0.94043195, 0.76532525, 0.74866362, 0.90371974,
        0.08342244, 0.55219247, 0.58447607, 0.96193638, 0.29214753],
       [0.9065555 , 0.77404733, 0.33314515, 0.08110139, 0.40724117,
        0.23223414, 0.13248763, 0.05342718, 0.72559436, 0.01142746],
       [0.64557024, 0.03536244, 0.4304024

In [12]:
# Preview the first 10 rows and first 10 columns of the validation matrix.
val_adata.X[:10, :10]

array([[0.77037589, 0.38012993, 0.62219227, 0.89340603, 0.15839147,
        0.00890616, 0.25675661, 0.79747194, 0.56682564, 0.94616086],
       [0.56175023, 0.21120622, 0.62191259, 0.33469926, 0.13247703,
        0.98440186, 0.24994142, 0.06197715, 0.27883662, 0.25170006],
       [0.62892557, 0.23211187, 0.26718466, 0.18380395, 0.57241143,
        0.44671866, 0.19356411, 0.93264093, 0.23922315, 0.7787389 ],
       [0.01133226, 0.49481513, 0.26726704, 0.16443269, 0.66570567,
        0.83554564, 0.54440575, 0.19379416, 0.75128533, 0.52133267],
       [0.75611165, 0.2240344 , 0.21169364, 0.68427305, 0.78063916,
        0.88636479, 0.44831369, 0.71629609, 0.17094955, 0.31475925],
       [0.92530067, 0.47039993, 0.05079764, 0.61044325, 0.45600704,
        0.08858767, 0.34796249, 0.61442608, 0.62770854, 0.88690303],
       [0.74273847, 0.99055173, 0.46307262, 0.29491718, 0.34666684,
        0.09193108, 0.45011323, 0.62712862, 0.62357741, 0.01276903],
       [0.90169098, 0.82538828, 0.4767652

In [13]:
# Preview the first 10 rows and first 10 columns of the test matrix.
test_adata.X[:10, :10]

array([[0.39300922, 0.14791115, 0.16613857, 0.62739702, 0.85816427,
        0.77896724, 0.33649381, 0.87521597, 0.84134592, 0.5242117 ],
       [0.02924269, 0.76688868, 0.53768803, 0.79992882, 0.40886803,
        0.66550279, 0.19880421, 0.19184457, 0.34026241, 0.51944051],
       [0.1279081 , 0.38554568, 0.48996963, 0.18237171, 0.70428385,
        0.84137876, 0.0044646 , 0.13322339, 0.28346284, 0.98411023],
       [0.7905697 , 0.88564525, 0.05074183, 0.98221686, 0.42646149,
        0.19298211, 0.97002278, 0.82518232, 0.92972355, 0.52315373],
       [0.52305469, 0.84693774, 0.10625275, 0.82951664, 0.6509179 ,
        0.21492954, 0.73690562, 0.74133329, 0.10800158, 0.26644681],
       [0.53085633, 0.57597739, 0.52906692, 0.01068652, 0.34873891,
        0.79046226, 0.34689564, 0.5093795 , 0.34663208, 0.19923791],
       [0.08791557, 0.90621296, 0.91842522, 0.99770599, 0.77788857,
        0.26263784, 0.26455887, 0.68821688, 0.67973266, 0.6603292 ],
       [0.96742008, 0.09691402, 0.9283330

## <span style="color: steelblue;">1. StandardScaler</span>

### <span style="color: steelblue;">scalr package - how to use it?</span>

In [14]:
# Build scalr's standard-scaling normalization object (constructed with `with_mean=False`).
scalr_std_scaler = standard_scale.StandardScaler(with_mean=False)

# Show the type annotations of `fit()` and `transform()` to see what each expects.
fit_annotations = scalr_std_scaler.fit.__annotations__
transform_annotations = scalr_std_scaler.transform.__annotations__
print('\n1. `fit()` function parameters :', fit_annotations)
print('\n2. `transform()` function parameters :', transform_annotations)


1. `fit()` function parameters : {'data': typing.Union[anndata._core.anndata.AnnData, anndata.experimental.multi_files._anncollection.AnnCollection], 'sample_chunksize': <class 'int'>, 'return': None}

2. `transform()` function parameters : {'data': <class 'numpy.ndarray'>, 'return': <class 'numpy.ndarray'>}


In [15]:
# Directory holding the original (unprocessed) chunked data splits.
datapath = './data'

# Directory in which the standard-scaled (processed) data will be stored.
processed_datapath = './processed_data_ss'

In [16]:
# Chunk size used to process the data in chunks while extracting the required
# parameters from it. Pick a value that fits in memory
# (e.g. 2k, 3k, 5k, 10k for real datasets).
sample_chunksize = 10

# Fit the scaler on the train data.
train_data = read_data(path.join(datapath, 'train'))
scalr_std_scaler.fit(train_data, sample_chunksize)

# Transform the test data with the fitted scaler and store the result under
# `processed_datapath`.
test_input_dir = path.join(datapath, 'test')
test_output_dir = path.join(processed_datapath, 'test')
scalr_std_scaler.process_data(test_input_dir, sample_chunksize, test_output_dir)

In [17]:
# Load the transformed test data from the processed-data directory.
processed_test_dir = path.join(processed_datapath, 'test')
test_adata_pipeline = read_data(processed_test_dir)

### <span style="color: steelblue;">sklearn package for standard scaling</span>
- Developers can ignore this section

In [18]:
# Reference implementation: fit sklearn's StandardScaler (with_mean=False) on the
# in-memory train matrix; `fit` returns the fitted scaler, so it can be chained.
sklearn_std_scaler = StandardScaler(with_mean=False).fit(train_adata.X)

# Scale the in-memory test matrix and preview its first 10 rows and columns.
test_adata_sklearn = sklearn_std_scaler.transform(test_adata.X)
test_adata_sklearn[:10, :10]

array([[1.41158441, 0.50525939, 0.60092889, 2.16354557, 2.71034692,
        2.64936446, 1.16691805, 3.114875  , 2.89359187, 1.7837114 ],
       [0.10503197, 2.61966535, 1.94483601, 2.75851238, 1.29133111,
        2.26345776, 0.68942789, 0.68277076, 1.17024462, 1.76747668],
       [0.45941182, 1.31701077, 1.77223692, 0.62889924, 2.2243452 ,
        2.8616338 , 0.01548265, 0.4741392 , 0.97489718, 3.3485873 ],
       [2.83951575, 3.02533371, 0.18353494, 3.38712306, 1.34689667,
        0.65635615, 3.36391656, 2.93680632, 3.1975439 , 1.78011149],
       [1.87867309, 2.89311019, 0.38431983, 2.86054441, 2.0557991 ,
        0.73100211, 2.55549565, 2.63838939, 0.37144354, 0.9066265 ],
       [1.9066945 , 1.96751893, 1.91365313, 0.0368519 , 1.10142484,
        2.68846046, 1.2029903 , 1.8128708 , 1.19215146, 0.6779378 ],
       [0.31576929, 3.09559228, 3.32197545, 3.44053652, 2.45681155,
        0.89326394, 0.91745676, 2.44934922, 2.33776483, 2.24687226],
       [3.47471519, 0.33105495, 3.3578125

### <span style="color: steelblue;">Comparing scalr library results with sklearn's library results</span>

In [19]:
# Check that the per-feature mean and standard deviation computed by scalr
# match sklearn's to within an absolute error of 1e-15.
#
# `np.testing.assert_allclose` replaces the hand-rolled "count the True values"
# check: it verifies that the shapes agree and, on failure, reports how many
# elements mismatch and the largest absolute difference.
#   - rtol=0 keeps this a purely absolute-error comparison.
#   - equal_nan=False makes NaNs fail the check, as they did before.
#   - scalr's statistics are flattened first, mirroring the `.flatten()` of the
#     original check, so they line up with sklearn's 1-D `mean_` / `scale_`.
# NOTE(review): only the fitted statistics are compared in this cell; the
# transformed arrays (`test_adata_pipeline` vs `test_adata_sklearn`) are not.
abs_tolerance = 1e-15

np.testing.assert_allclose(
    np.asarray(scalr_std_scaler.train_mean).ravel(),
    sklearn_std_scaler.mean_,
    rtol=0,
    atol=abs_tolerance,
    equal_nan=False,
    err_msg="Train data mean is not correctly calculated...",
)

np.testing.assert_allclose(
    np.asarray(scalr_std_scaler.train_std).ravel(),
    sklearn_std_scaler.scale_,
    rtol=0,
    atol=abs_tolerance,
    equal_nan=False,
    err_msg="Train data standard deviation is not correctly calculated...",
)

## <span style="color: steelblue;">2. SampleNorm</span>

### <span style="color: steelblue;">scalr package - how to use it?</span>

In [20]:
# Build scalr's sample-norm object used by the pipeline.
scalr_sample_norm = sample_norm.SampleNorm()

# Show the type annotations of `transform()` to see what it expects and returns.
transform_annotations = scalr_sample_norm.transform.__annotations__
print('\n1. `transform()` function parameters :', transform_annotations)


1. `transform()` function parameters : {'data': <class 'numpy.ndarray'>, 'return': <class 'numpy.ndarray'>}


In [21]:
# Directory holding the original (unprocessed) chunked data splits.
datapath = './data'

# Directory in which the sample-normalized (processed) data will be stored.
processed_datapath = './processed_data_sn'

In [22]:
# Sample-norm does not need to be fitted on the train data, so only the chunk
# size has to be chosen.
sample_chunksize = 10

# Transform the test data and store the result under `processed_datapath`.
test_input_dir = path.join(datapath, 'test')
test_output_dir = path.join(processed_datapath, 'test')
scalr_sample_norm.process_data(test_input_dir, sample_chunksize, test_output_dir)

In [23]:
# Load the sample-normalized test data from the processed-data directory.
processed_test_dir = path.join(processed_datapath, 'test')
test_data_sample_norm_pipeline = read_data(processed_test_dir)

### <span style="color: steelblue;">Scanpy package for sample-norm</span>
- Developers can ignore this section

In [24]:
# Re-read the original test data (with `backed=None`) and convert the full
# `[:, :]` selection into an AnnData object via `to_adata()`.
test_adata = read_data(path.join(datapath, 'test'), backed=None)[:, :].to_adata()
test_adata

AnnData object with n_obs × n_vars = 20 × 50
    obs: 'dummy'

In [25]:
# Reference implementation: sample norm using the scanpy package, called with
# `target_sum=1` and `inplace=False` so that the result is returned.
test_data_sample_norm_sc = sc.pp.normalize_total(test_adata, target_sum=1, inplace=False)
# The returned object is indexed by 'X' to get the normalized matrix; preview
# its first 10 rows and first 10 columns.
test_data_sample_norm_sc['X'][:10, :10]

array([[0.01660122, 0.00624796, 0.00701791, 0.02650207, 0.03624998,
        0.03290459, 0.01421394, 0.03697027, 0.03553955, 0.02214339],
       [0.0011924 , 0.03127078, 0.02192486, 0.03261803, 0.01667207,
        0.02713665, 0.00810647, 0.00782269, 0.0138746 , 0.02118079],
       [0.00499638, 0.01506028, 0.01913932, 0.00712385, 0.02751091,
        0.03286615, 0.0001744 , 0.00520401, 0.0110727 , 0.03844156],
       [0.02966363, 0.03323104, 0.00190393, 0.03685458, 0.01600162,
        0.00724104, 0.03639704, 0.03096236, 0.03488494, 0.01962969],
       [0.02123093, 0.03437743, 0.00431283, 0.03367031, 0.02642093,
        0.00872405, 0.0299112 , 0.03009092, 0.00438381, 0.01081515],
       [0.0225382 , 0.02445387, 0.02246222, 0.00045371, 0.01480616,
        0.0335601 , 0.01472791, 0.02162637, 0.01471672, 0.0084589 ],
       [0.00330591, 0.03407657, 0.03453579, 0.03751701, 0.02925115,
        0.00987604, 0.00994828, 0.0258792 , 0.02556017, 0.02483054],
       [0.04212857, 0.00422035, 0.0404264

### <span style="color: steelblue;">Comparing scalr library results with scanpy library results</span>

In [26]:
# Element-wise check that the scanpy and scalr sample-norm results differ by
# less than 1e-15.
sample_norm_within_tol = abs(test_data_sample_norm_sc['X'] -
                             test_data_sample_norm_pipeline[:, :].X) < 1e-15

# Assert over the WHOLE matrix so a mismatch fails the notebook; previously only
# a 10 x 10 corner was displayed and nothing was actually enforced.
assert np.all(sample_norm_within_tol), (
    "Sample-norm results do not match scanpy's results...")

# Display the first 10 rows and first 10 columns of the comparison mask.
sample_norm_within_tol[:10, :10]

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]])