## sc.pp.normalize_total标准化

### 来自于sc.pp.normalize_total测试数据

In [None]:
import numpy as np
from anndata import AnnData
import scanpy as sc

In [None]:
# Toy count matrix: AnnData treats rows as observations (cells) and columns
# as variables (genes). The third row contains one dominant entry (22).
arr = np.array(
    [[3, 3, 3, 6, 6], [1, 1, 1, 2, 2], [1, 22, 1, 2, 2]],
    dtype=np.float32,
)
adata = AnnData(arr)

In [None]:
# Scale each cell (row) so that its counts sum to target_sum=1; with
# inplace=True the AnnData object itself is updated.
sc.pp.normalize_total(adata, target_sum=1, inplace=True)
# Last expression of the cell: display the AnnData summary.
adata

In [None]:
# Same normalization, but very highly expressed genes (threshold set via
# max_fraction=0.2) are left out when computing each cell's size factor.
# inplace=False makes the call return a dict; ['X'] picks the normalized matrix.
sc.pp.normalize_total(adata, target_sum=1, exclude_highly_expressed=True, max_fraction=0.2, inplace=False)['X']

### scanpy教程

In [None]:
from __future__ import annotations

import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc

# Read the 10x matrix directory (the one holding the `.mtx` file). Gene symbols
# are used as the variable names, and a cache file is written so that later
# reads are faster.
adata = sc.read_10x_mtx(
    "data/filtered_gene_bc_matrices/hg19/",
    var_names="gene_symbols",
    cache=True,
)
# Not needed when reading with `var_names='gene_ids'` in `sc.read_10x_mtx`.
adata.var_names_make_unique()

# Basic filtering; the cell filter removes nothing for this specific dataset.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Annotate the group of mitochondrial genes as "mt", then compute QC metrics
# that are stored on the AnnData object (inplace=True).
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=["mt"],
    percent_top=None,
    log1p=False,
    inplace=True,
)

# Keep cells with 200 < n_genes_by_counts < 2500 and pct_counts_mt < 5.
genes_per_cell = adata.obs.n_genes_by_counts
keep_cells = (genes_per_cell < 2500) & (genes_per_cell > 200) & (adata.obs.pct_counts_mt < 5)
adata = adata[keep_cells, :].copy()

# Store a copy of the matrix as it is before normalization in the "counts" layer.
adata.layers["counts"] = adata.X.copy()

# Normalize every cell to a total of 1e4 counts, then display the object.
sc.pp.normalize_total(adata, target_sum=1e4)
adata

In [None]:
# Sanity check: sum each cell's values across genes (axis=1). After
# normalize_total(adata, target_sum=1e4) every row is expected to total 1e4.
np.sum(adata.X, axis=1)

In [None]:
# Display the per-cell annotation table (e.g. the n_genes_by_counts and
# pct_counts_mt columns used for the QC filter earlier in this notebook).
adata.obs

### 源码解析

对每一个细胞的计数进行标准化。

对于每一个细胞：

- 标准化值 = 单个基因的计数 / 细胞总的计数 * 目标值

先计算单个细胞总的计数，再用该细胞每个基因的计数除以这个总计数，最后乘以一个目标数（`target_sum`）。如果目标数是 1 百万（1e6），则是 CPM 标准化。

问题：单细胞标准化为什么不考虑基因长度，而只考虑细胞总的计数？

如果细胞内某些基因的表达量过高，可以设置 `exclude_highly_expressed=True`，在计算细胞总计数（即标准化因子 size factor）时将这些基因排除在外，因为它们会强烈影响其他所有基因标准化后的值。

一个细胞，类似于bulk的一个样本，因此标准化时以细胞为单位进行标准化。

一个细胞，是一个反应体系，每个细胞所测得的reads数不同，因此要按细胞进行标准化。

In [None]:
def normalize_total(  # noqa: PLR0912
    adata: AnnData,
    *,
    target_sum: float | None = None,
    exclude_highly_expressed: bool = False,
    max_fraction: float = 0.05,
    key_added: str | None = None,
    layer: str | None = None,
    inplace: bool = True,
    copy: bool = False,
) -> AnnData | dict[str, np.ndarray] | None:
    """Normalize counts per cell.

    Normalize each cell by total counts over all genes,
    so that every cell has the same total count after normalization.
    If choosing `target_sum=1e6`, this is CPM normalization.

    If `exclude_highly_expressed=True`, very highly expressed genes are excluded
    from the computation of the normalization factor (size factor) for each
    cell. This is meaningful as these can strongly influence the resulting
    normalized values for all other genes :cite:p:`Weinreb2017`.

    Similar functions are used, for example, by Seurat :cite:p:`Satija2015`, Cell Ranger
    :cite:p:`Zheng2017` or SPRING :cite:p:`Weinreb2017`.
    """