Keypoints

1. This notebook is a tutorial on using the normalization modules of the scalr library.
2. The results are also compared against standard libraries such as sklearn and scanpy.
3. These packages are built to handle very large datasets (e.g. hundreds of thousands of samples) under tight resource constraints, which standard libraries cannot process all at once.

# Imports

In [1]:
import sys
# Add a local repository checkout to the import path, presumably so that the
# `_scalr` imports below resolve.
# NOTE(review): hard-coded absolute path to one developer's checkout — change
# it to the location of your own clone before running.
sys.path.append('/home/anand/bioc_repo/single_cell_classification/')

In [2]:
import pandas as pd
import numpy as np
import anndata

# scalr library normalization modules.
from _scalr.data.preprocess import standard_scale, sample_norm
from _scalr.utils.file_utils import read_data, write_data, write_chunkwise_data

# Scanpy library for sample-norm
import scanpy as sc
# Sklearn library for standard scaler object
from sklearn.preprocessing import StandardScaler
from os import path

%reload_ext autoreload
%autoreload 2

# Data generation

In [3]:
# Seed NumPy's global random generator so the synthetic data generated below
# is identical on every run.
SEED = 0
np.random.seed(SEED)

In [4]:
# Synthetic training set: a 100 x 50 matrix of random values, wrapped in an
# AnnData object because the pipeline normalization functions require AnnData.
train_shape = (100, 50)
train_adata = anndata.AnnData(X=np.random.rand(*train_shape))

# Attach a single placeholder column of per-sample metadata.
train_dummy_obs = np.random.rand(train_adata.shape[0])
train_adata.obs = pd.DataFrame(train_dummy_obs, columns=['dummy'])
train_adata

AnnData object with n_obs × n_vars = 100 × 50
    obs: 'dummy'

In [5]:
# Write the training AnnData to a single .h5ad file, then write it chunk-wise
# (sample_chunksize=10) into the `data/train` directory.
# NOTE(review): hard-coded absolute paths — adjust to your environment.
train_h5ad_path = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/train.h5ad'
train_chunk_dir = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/data/train'

write_data(train_adata, train_h5ad_path)
write_chunkwise_data(datapath=train_h5ad_path,
                     sample_chunksize=10,
                     dirpath=train_chunk_dir)



In [6]:
# Synthetic validation set: a 20 x 50 matrix of random values, wrapped in an
# AnnData object because the pipeline normalization functions require AnnData.
val_shape = (20, 50)
val_adata = anndata.AnnData(X=np.random.rand(*val_shape))

# Attach a single placeholder column of per-sample metadata.
val_dummy_obs = np.random.rand(val_adata.shape[0])
val_adata.obs = pd.DataFrame(val_dummy_obs, columns=['dummy'])
val_adata

AnnData object with n_obs × n_vars = 20 × 50
    obs: 'dummy'

In [7]:
# Write the validation AnnData to a single .h5ad file, then write it chunk-wise
# (sample_chunksize=10) into the `data/val` directory.
# NOTE(review): hard-coded absolute paths — adjust to your environment.
val_h5ad_path = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/val.h5ad'
val_chunk_dir = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/data/val'

write_data(val_adata, val_h5ad_path)
write_chunkwise_data(datapath=val_h5ad_path,
                     sample_chunksize=10,
                     dirpath=val_chunk_dir)



In [8]:
# Synthetic test set: a 20 x 50 matrix of random values, wrapped in an
# AnnData object because the pipeline normalization functions require AnnData.
test_shape = (20, 50)
test_adata = anndata.AnnData(X=np.random.rand(*test_shape))

# Attach a single placeholder column of per-sample metadata.
test_dummy_obs = np.random.rand(test_adata.shape[0])
test_adata.obs = pd.DataFrame(test_dummy_obs, columns=['dummy'])
test_adata

AnnData object with n_obs × n_vars = 20 × 50
    obs: 'dummy'

In [9]:
# Write the test AnnData to a single .h5ad file, then write it chunk-wise
# (sample_chunksize=10) into the `data/test` directory.
# NOTE(review): hard-coded absolute paths — adjust to your environment.
test_h5ad_path = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/test.h5ad'
test_chunk_dir = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/data/test'

write_data(test_adata, test_h5ad_path)
write_chunkwise_data(datapath=test_h5ad_path,
                     sample_chunksize=10,
                     dirpath=test_chunk_dir)



In [10]:
# Preview the top-left 10 x 10 corner of the training matrix
# (first 10 observations, first 10 variables).
train_adata.X[:10, :10]

array([[0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ,
        0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152],
       [0.57019677, 0.43860151, 0.98837384, 0.10204481, 0.20887676,
        0.16130952, 0.65310833, 0.2532916 , 0.46631077, 0.24442559],
       [0.67781654, 0.27000797, 0.73519402, 0.96218855, 0.24875314,
        0.57615733, 0.59204193, 0.57225191, 0.22308163, 0.95274901],
       [0.1494483 , 0.86812606, 0.16249293, 0.61555956, 0.12381998,
        0.84800823, 0.80731896, 0.56910074, 0.4071833 , 0.069167  ],
       [0.31179588, 0.69634349, 0.37775184, 0.17960368, 0.02467873,
        0.06724963, 0.67939277, 0.45369684, 0.53657921, 0.89667129],
       [0.35561274, 0.94043195, 0.76532525, 0.74866362, 0.90371974,
        0.08342244, 0.55219247, 0.58447607, 0.96193638, 0.29214753],
       [0.9065555 , 0.77404733, 0.33314515, 0.08110139, 0.40724117,
        0.23223414, 0.13248763, 0.05342718, 0.72559436, 0.01142746],
       [0.64557024, 0.03536244, 0.4304024

In [11]:
# Preview the top-left 10 x 10 corner of the validation matrix
# (first 10 observations, first 10 variables).
val_adata.X[:10, :10]

array([[0.77037589, 0.38012993, 0.62219227, 0.89340603, 0.15839147,
        0.00890616, 0.25675661, 0.79747194, 0.56682564, 0.94616086],
       [0.56175023, 0.21120622, 0.62191259, 0.33469926, 0.13247703,
        0.98440186, 0.24994142, 0.06197715, 0.27883662, 0.25170006],
       [0.62892557, 0.23211187, 0.26718466, 0.18380395, 0.57241143,
        0.44671866, 0.19356411, 0.93264093, 0.23922315, 0.7787389 ],
       [0.01133226, 0.49481513, 0.26726704, 0.16443269, 0.66570567,
        0.83554564, 0.54440575, 0.19379416, 0.75128533, 0.52133267],
       [0.75611165, 0.2240344 , 0.21169364, 0.68427305, 0.78063916,
        0.88636479, 0.44831369, 0.71629609, 0.17094955, 0.31475925],
       [0.92530067, 0.47039993, 0.05079764, 0.61044325, 0.45600704,
        0.08858767, 0.34796249, 0.61442608, 0.62770854, 0.88690303],
       [0.74273847, 0.99055173, 0.46307262, 0.29491718, 0.34666684,
        0.09193108, 0.45011323, 0.62712862, 0.62357741, 0.01276903],
       [0.90169098, 0.82538828, 0.4767652

In [12]:
# Preview the top-left 10 x 10 corner of the test matrix
# (first 10 observations, first 10 variables).
test_adata.X[:10, :10]

array([[0.39300922, 0.14791115, 0.16613857, 0.62739702, 0.85816427,
        0.77896724, 0.33649381, 0.87521597, 0.84134592, 0.5242117 ],
       [0.02924269, 0.76688868, 0.53768803, 0.79992882, 0.40886803,
        0.66550279, 0.19880421, 0.19184457, 0.34026241, 0.51944051],
       [0.1279081 , 0.38554568, 0.48996963, 0.18237171, 0.70428385,
        0.84137876, 0.0044646 , 0.13322339, 0.28346284, 0.98411023],
       [0.7905697 , 0.88564525, 0.05074183, 0.98221686, 0.42646149,
        0.19298211, 0.97002278, 0.82518232, 0.92972355, 0.52315373],
       [0.52305469, 0.84693774, 0.10625275, 0.82951664, 0.6509179 ,
        0.21492954, 0.73690562, 0.74133329, 0.10800158, 0.26644681],
       [0.53085633, 0.57597739, 0.52906692, 0.01068652, 0.34873891,
        0.79046226, 0.34689564, 0.5093795 , 0.34663208, 0.19923791],
       [0.08791557, 0.90621296, 0.91842522, 0.99770599, 0.77788857,
        0.26263784, 0.26455887, 0.68821688, 0.67973266, 0.6603292 ],
       [0.96742008, 0.09691402, 0.9283330

# 1. StandardScaler

## scalr package - how to use it?

In [13]:
# Build the scalr standard-scaling normalizer. `with_mean=False` matches the
# configuration used for the sklearn scaler later in this notebook.
scalr_std_scaler = standard_scale.StandardScaler(with_mean=False)

# Show the type annotations of the two main entry points so the expected
# argument and return types are visible.
fit_annotations = scalr_std_scaler.fit.__annotations__
transform_annotations = scalr_std_scaler.transform.__annotations__
print('\n1. `fit()` function parameters :', fit_annotations)
print('\n2. `transform()` function parameters :', transform_annotations)

INFO:absl:Applying Standard Scaler normalization on data.



1. `fit()` function parameters : {'data': typing.Union[anndata._core.anndata.AnnData, anndata.experimental.multi_files._anncollection.AnnCollection], 'sample_chunksize': <class 'int'>, 'return': None}

2. `transform()` function parameters : {'data': <class 'numpy.ndarray'>, 'return': <class 'numpy.ndarray'>}


In [14]:
# Directory holding the original split data (the chunked train/val/test
# directories written earlier in this notebook).
# NOTE(review): hard-coded absolute path — adjust to your environment.
datapath = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/data'

# Directory in which the standard-scaled data is stored.
processed_datapath = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/processed_data_ss'

In [15]:
# Chunk size used to process the data piecewise while extracting the required
# parameters. Enter a value that fits in your memory; it can be 2k, 3k, 5k,
# 10k etc. for real datasets.
sample_chunksize = 10

# Fit the scaler on the training data.
train_data = read_data(path.join(datapath, 'train'))
scalr_std_scaler.fit(train_data, sample_chunksize)

# Transform the test data with the fitted scaler and store the result under
# `processed_datapath`.
test_input_dir = path.join(datapath, 'test')
test_output_dir = path.join(processed_datapath, 'test')
scalr_std_scaler.process_data(test_input_dir, sample_chunksize, test_output_dir)

INFO:absl:Calculating mean of data...
INFO:absl:`train_mean` will be set to zero during `transform()`, as `with_mean` is set to False!
INFO:absl:Calculating standard deviation of data...


In [16]:
# Load the transformed test data that was stored under `processed_datapath`.
scaled_test_dir = path.join(processed_datapath, 'test')
test_adata_pipeline = read_data(scaled_test_dir)

## sklearn package for standardscaling
- Developers can ignore this section

In [17]:
# Reference implementation: sklearn's StandardScaler with the same
# `with_mean=False` setting, fitted on the in-memory training matrix.
# (`fit` returns the fitted scaler itself, so it can be chained.)
sklearn_std_scaler = StandardScaler(with_mean=False).fit(train_adata.X)

# Apply the fitted scaler to the in-memory test matrix and display the result.
test_adata_sklearn = sklearn_std_scaler.transform(test_adata.X)
test_adata_sklearn

array([[1.41158441e+00, 5.05259385e-01, 6.00928887e-01, 2.16354557e+00,
        2.71034692e+00, 2.64936446e+00, 1.16691805e+00, 3.11487500e+00,
        2.89359187e+00, 1.78371140e+00, 1.68876233e+00, 2.17117486e-01,
        5.31501800e-01, 2.83467754e-01, 1.79832689e+00, 2.42814788e+00,
        2.24645582e+00, 3.05534450e+00, 2.26390656e+00, 2.97262030e+00,
        4.80383130e-01, 5.64986279e-01, 3.08674476e+00, 1.20756048e+00,
        6.92900894e-01, 3.19969601e+00, 3.34413723e+00, 2.34214862e+00,
        1.30912749e+00, 2.49992423e+00, 5.33703891e-02, 5.04840204e-01,
        1.28747038e+00, 2.77672984e+00, 1.03075073e+00, 2.35046654e-01,
        1.32045375e+00, 8.38129186e-01, 1.88717009e-01, 1.44761814e+00,
        1.18779429e+00, 2.52889227e+00, 2.11705459e-01, 1.25629711e+00,
        1.19896931e+00, 2.37440367e+00, 3.42723773e+00, 5.11247478e-01,
        2.66015880e+00, 2.19147346e+00],
       [1.05031966e-01, 2.61966535e+00, 1.94483601e+00, 2.75851238e+00,
        1.29133111e+00,

## Comparing scalr library results with sklearn's library results 

In [18]:
# Compare the statistics fitted by scalr with those fitted by sklearn: every
# per-feature absolute difference must be below 1e-15.
# Only the fitted mean and standard deviation are checked in this cell; the
# transformed arrays are not compared here.
tolerance = 1e-15
expected_feature_count = train_adata.shape[1]

mean_within_tolerance = abs(
    scalr_std_scaler.train_mean - sklearn_std_scaler.mean_).flatten() < tolerance
assert sum(mean_within_tolerance) == expected_feature_count, \
    "Train data mean is not correctly calculated..."

std_within_tolerance = abs(
    scalr_std_scaler.train_std - sklearn_std_scaler.scale_).flatten() < tolerance
assert sum(std_within_tolerance) == expected_feature_count, \
    "Train data standard deviation is not correctly calculated..."

# 2. SampleNorm

## scalr package - how to use it?

In [19]:
# Build the scalr sample-wise normalizer used by the pipeline.
scalr_sample_norm = sample_norm.SampleNorm()

# Show the type annotations of `transform()` so the expected argument and
# return types are visible.
transform_annotations = scalr_sample_norm.transform.__annotations__
print('\n1. `transform()` function parameters :', transform_annotations)

INFO:absl:Applying Sample-wise normalization on data.



1. `transform()` function parameters : {'data': <class 'numpy.ndarray'>, 'return': <class 'numpy.ndarray'>}


In [20]:
# Directory holding the original split data (the chunked train/val/test
# directories written earlier in this notebook).
# NOTE(review): hard-coded absolute path — adjust to your environment.
datapath = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/data'

# Directory in which the sample-normalized data is stored.
processed_datapath = '/home/anand/bioc_repo/single_cell_classification/tutorials/preprocessing/processed_data_sn'

In [21]:
# Sample-norm does not need to be fitted on the training data, so only the
# chunk size has to be chosen before transforming.
sample_chunksize = 10

# Transform the test data and store the result under `processed_datapath`.
test_input_dir = path.join(datapath, 'test')
test_output_dir = path.join(processed_datapath, 'test')
scalr_sample_norm.process_data(test_input_dir, sample_chunksize, test_output_dir)

In [22]:
# Load the transformed test data that was stored under `processed_datapath`.
sample_normed_test_dir = path.join(processed_datapath, 'test')
test_data_sample_norm_pipeline = read_data(sample_normed_test_dir)

## Scanpy package for sample-norm
- Developers can ignore this section

In [23]:
# Re-read the original (un-normalized) test data from disk and convert the
# full `[:, :]` selection to an AnnData object for scanpy.
# NOTE: this rebinds `test_adata`, replacing the object created during data
# generation.
original_test_dir = path.join(datapath, 'test')
test_adata = read_data(original_test_dir, backed=None)[:, :].to_adata()
test_adata

AnnData object with n_obs × n_vars = 20 × 50
    obs: 'dummy'

In [24]:
# Sample norm using the scanpy package.
# `target_sum=1` normalizes every sample so that its values sum to 1.
# With `inplace=False` scanpy returns the result (a dict whose 'X' entry holds
# the normalized matrix) instead of updating `test_adata`.
test_data_sample_norm_sc = sc.pp.normalize_total(test_adata, target_sum=1, inplace=False)
test_data_sample_norm_sc

{'X': array([[1.66012225e-02, 6.24795992e-03, 7.01791007e-03, 2.65020695e-02,
         3.62499792e-02, 3.29045934e-02, 1.42139378e-02, 3.69702652e-02,
         3.55395502e-02, 2.21433865e-02, 2.01016262e-02, 2.46952890e-03,
         6.35837548e-03, 3.60796507e-03, 2.22515583e-02, 2.85820936e-02,
         2.61381601e-02, 3.68718124e-02, 2.78244496e-02, 3.50479362e-02,
         5.93174315e-03, 7.25063879e-03, 3.87949338e-02, 1.49831889e-02,
         9.16631124e-03, 3.85269954e-02, 4.09784818e-02, 2.69402226e-02,
         1.64039296e-02, 2.90783131e-02, 6.57185034e-04, 6.11283595e-03,
         1.55749796e-02, 3.51542496e-02, 1.28547012e-02, 2.44458427e-03,
         1.59982725e-02, 1.01908904e-02, 2.36804498e-03, 1.68510815e-02,
         1.48444984e-02, 3.09983590e-02, 2.66985418e-03, 1.60117547e-02,
         1.44648391e-02, 2.73225858e-02, 3.75069327e-02, 6.04007544e-03,
         3.44796242e-02, 2.57555127e-02],
        [1.19240492e-03, 3.12707794e-02, 2.19248560e-02, 3.26180298e-02,
    

## Comparing scalr library results with scanpy library results

In [25]:
# Element-wise check that scanpy's and scalr's sample-normalized values differ
# by less than 1e-15 (True means the entry agrees within that tolerance).
scanpy_normed = test_data_sample_norm_sc['X']
scalr_normed = test_data_sample_norm_pipeline[:, :].X
abs(scanpy_normed - scalr_normed) < 1e-15

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  Tr