## 计算质控指标

In [None]:
import scanpy as sc
import seaborn as sns

In [None]:
# Load scanpy's `pbmc3k` example dataset
pbmc = sc.datasets.pbmc3k()
# Boolean column in .var: True for genes whose name starts with "MT-"
# (presumably the mitochondrial genes)
pbmc.var["mito"] = pbmc.var_names.str.startswith("MT-")
# Compute QC metrics in place, using `mito` as a gene subset; the top-N
# percentage columns (percent_top=None) and log1p columns are switched off
sc.pp.calculate_qc_metrics(pbmc, qc_vars=["mito"], percent_top=None, log1p=False, inplace=True)

In [None]:
# Show the per-gene table, including the `mito` flag and the QC columns
# that the in-place call above wrote into it
pbmc.var

## 源码解析

### def _choose_mtx_rep()

选择矩阵：

- layer当中的，adata.layers[layer]
- raw当中的，adata.raw.X
- 否则就用默认的 adata.X

注意 layer 和 raw 不能同时提供。

In [None]:
def _choose_mtx_rep(adata, *, use_raw: bool = False, layer: str | None = None):
    is_layer = layer is not None
    if use_raw and is_layer:
        msg = (
            "Cannot use expression from both layer and raw. You provided:"
            f"{use_raw=!r} and {layer=!r}"
        )
        raise ValueError(msg)
    if is_layer:
        return adata.layers[layer]
    elif use_raw:
        return adata.raw.X
    else:
        return adata.X

### def describe_obs()

描述观察：为每一个细胞计算相应指标
- 细胞中表达的基因数
- 细胞中所有计数总和
- 一组基因的子集的计数，以及子集计数占细胞总计数的百分比

此外，对非百分比形式的指标（表达的基因数、细胞总计数、基因子集的计数）还会额外计算 log1p 变换后的值。

In [None]:
def describe_obs(  # noqa: PLR0913
    adata: AnnData,
    *,
    expr_type: str = "counts",
    var_type: str = "genes",
    qc_vars: Collection[str] = (),
    percent_top: Collection[int] | None = (50, 100, 200, 500),
    layer: str | None = None,
    use_raw: bool = False,
    log1p: bool | None = True,
    inplace: bool = False,
    x=None,
    parallel=None,
) -> pd.DataFrame | None:
    """Compute per-observation (per-cell) QC metrics.

    A scratch DataFrame indexed by ``adata.obs_names`` is filled column by
    column and then either returned or written into ``adata.obs``.

    NOTE(review): the upstream docstring says the first call can be slow
    because helper code is compiled and cached to disk; that cannot be
    confirmed from this function alone.

    Params
    ------
    adata
        Annotated data object. ``obs_names`` and ``var`` are read here, and
        ``obs`` is written when ``inplace`` is true.
    expr_type
        Label for the matrix values; only used to build column names.
    var_type
        Label for the variables; only used to build column names.
    qc_vars
        Names of columns in ``adata.var``. Each column's values are used to
        select columns of the matrix, so each is expected to be a boolean
        mask over the variables.
    percent_top
        Values of ``n`` that are sorted and handed to
        ``top_segment_proportions``. ``None`` or an empty collection skips
        this step.
    layer, use_raw
        Forwarded to ``_choose_mtx_rep``; ignored when ``x`` is supplied.
    log1p
        Add ``log1p`` transformed versions of the count-like metrics.
    inplace
        Whether to place calculated metrics in ``adata.obs``.
    x
        Matrix to calculate values on. Meant for internal usage. When it is
        given, no matrix is selected from ``adata`` and no sparse clean-up
        is performed on it.
    parallel
        Deprecated. Any value other than ``None`` only emits a
        ``FutureWarning``.

    Returns
    -------
    ``None`` if ``inplace``, otherwise a DataFrame with, in this order:

    ``n_{var_type}_by_{expr_type}``
        Number of non-zero entries in each row of the matrix.
    ``total_{expr_type}``
        Sum of each row of the matrix.
    ``pct_{expr_type}_in_top_{n}_{var_type}``
        One column per ``n`` in ``percent_top``: the matching column of the
        ``top_segment_proportions`` result multiplied by 100.
    ``total_{expr_type}_{qc_var}``, ``pct_{expr_type}_{qc_var}``
        Row sums restricted to the ``qc_var`` subset, and that sum as a
        percentage of ``total_{expr_type}``.

    With ``log1p`` enabled, every ``n_*`` and ``total_*`` column is followed
    by a ``log1p_``-prefixed companion.
    """
    if parallel is not None:
        warn(
            "Argument `parallel` is deprecated, and currently has no effect.",
            FutureWarning,
            stacklevel=2,
        )

    # Select the matrix ourselves only when the caller did not hand one in
    if x is None:
        x = _choose_mtx_rep(adata, use_raw=use_raw, layer=layer)
        if isinstance(x, sparse.coo_matrix):
            # COO matrices cannot be subscripted, which the subset step needs
            x = sparse.csr_matrix(x)  # noqa: TID251
        if isinstance(x, CSBase):
            # Drop explicitly stored zeros (in place) before counting entries
            x.eliminate_zeros()

    n_expressed_col = f"n_{var_type}_by_{expr_type}"
    total_col = f"total_{expr_type}"

    # Scratch table; the index mirrors the observations of `adata`
    per_cell = pd.DataFrame(index=adata.obs_names)

    # Number of non-zero entries per row, forced into a plain ndarray
    nonzero_per_row = axis_nnz(x, axis=1)
    per_cell[n_expressed_col] = materialize_as_ndarray(nonzero_per_row)
    if log1p:
        per_cell[f"log1p_n_{var_type}_by_{expr_type}"] = np.log1p(
            per_cell[n_expressed_col]
        )

    # Row sums; np.ravel flattens a matrix-shaped result into a 1-D array
    per_cell[total_col] = np.ravel(axis_sum(x, axis=1))
    if log1p:
        per_cell[f"log1p_total_{expr_type}"] = np.log1p(per_cell[total_col])

    # Percentage columns, one per requested n
    if percent_top:
        top_ns = sorted(percent_top)
        top_fractions = top_segment_proportions(x, top_ns)
        for col_idx, n_top in enumerate(top_ns):
            per_cell[f"pct_{expr_type}_in_top_{n_top}_{var_type}"] = (
                top_fractions[:, col_idx] * 100
            )

    # Totals and percentages for each variable subset named in `qc_vars`
    for qc_var in qc_vars:
        subset_total_col = f"total_{expr_type}_{qc_var}"
        subset_mask = adata.var[qc_var].values
        per_cell[subset_total_col] = np.ravel(axis_sum(x[:, subset_mask], axis=1))
        if log1p:
            per_cell[f"log1p_total_{expr_type}_{qc_var}"] = np.log1p(
                per_cell[subset_total_col]
            )
        # Subset total divided by the row total, expressed as a percentage
        per_cell[f"pct_{expr_type}_{qc_var}"] = (
            per_cell[subset_total_col] / per_cell[total_col] * 100
        )

    if not inplace:
        return per_cell
    # Write every computed column into the observation table
    adata.obs[per_cell.columns] = per_cell
    return None

### def describe_var()

描述变量：为每一个基因计算相应指标

主要计算以下 4 列值：n_cells_by_counts、mean_counts、pct_dropout_by_counts、total_counts（log1p=True 时还会额外生成 log1p_mean_counts 和 log1p_total_counts）

- n_cells_by_counts，基因在多少细胞中表达
- mean_counts，基因在所有细胞中平均表达量
- pct_dropout_by_counts，基因未表达的细胞所占的百分比 = (1 - 表达的细胞数 / 细胞总数) × 100
- total_counts，基因总的表达计数

In [None]:
def describe_var(
    adata: AnnData,
    *,
    expr_type: str = "counts",
    var_type: str = "genes",
    layer: str | None = None,
    use_raw: bool = False,
    inplace: bool = False,
    log1p: bool = True,
    x: CSBase | sparse.coo_matrix | np.ndarray | None = None,
) -> pd.DataFrame | None:
    """Compute per-variable (per-gene) QC metrics.

    A scratch DataFrame indexed by ``adata.var_names`` is filled column by
    column and then either returned or written into ``adata.var``.

    Params
    ------
    adata
        Annotated data object. ``var_names`` is read here, and ``var`` is
        written when ``inplace`` is true.
    expr_type
        Label for the matrix values; only used to build column names.
    var_type
        Accepted for symmetry with ``describe_obs``; not used in this
        function.
    layer, use_raw
        Forwarded to ``_choose_mtx_rep``; ignored when ``x`` is supplied.
    inplace
        Whether to place calculated metrics in ``adata.var``.
    log1p
        Add ``log1p`` transformed versions of the mean and the total.
    x
        Matrix to calculate values on. Meant for internal usage. When it is
        given, no matrix is selected from ``adata`` and no sparse clean-up
        is performed on it.

    Returns
    -------
    ``None`` if ``inplace``, otherwise a DataFrame with, in this order:

    ``n_cells_by_{expr_type}``
        Number of non-zero entries in each column of the matrix.
    ``mean_{expr_type}``
        First element of ``_get_mean_var(x, axis=0)``.
    ``log1p_mean_{expr_type}``
        Only when ``log1p`` is true.
    ``pct_dropout_by_{expr_type}``
        ``(1 - n_cells / number_of_rows) * 100``.
    ``total_{expr_type}``
        Sum of each column of the matrix.
    ``log1p_total_{expr_type}``
        Only when ``log1p`` is true.
    """
    # Select the matrix ourselves only when the caller did not hand one in
    if x is None:
        x = _choose_mtx_rep(adata, use_raw=use_raw, layer=layer)
        if isinstance(x, sparse.coo_matrix):
            # COO matrices cannot be subscripted
            x = sparse.csr_matrix(x)  # noqa: TID251
        if isinstance(x, CSBase):
            # Drop explicitly stored zeros (in place) before counting entries
            x.eliminate_zeros()

    n_cells_col = f"n_cells_by_{expr_type}"
    mean_col = f"mean_{expr_type}"
    total_col = f"total_{expr_type}"

    # Scratch table; the index mirrors the variables of `adata`
    per_gene = pd.DataFrame(index=adata.var_names)

    # Non-zero count and mean per column, materialised together in one call
    n_expressing, mean_expr = materialize_as_ndarray(
        (axis_nnz(x, axis=0), _get_mean_var(x, axis=0)[0])
    )
    per_gene[n_cells_col] = n_expressing
    per_gene[mean_col] = mean_expr
    if log1p:
        per_gene[f"log1p_mean_{expr_type}"] = np.log1p(per_gene[mean_col])

    # Share of rows with no expression of the variable, as a percentage
    per_gene[f"pct_dropout_by_{expr_type}"] = (
        1 - per_gene[n_cells_col] / x.shape[0]
    ) * 100

    # Column sums; np.ravel flattens a matrix-shaped result into a 1-D array
    per_gene[total_col] = np.ravel(axis_sum(x, axis=0))
    if log1p:
        per_gene[f"log1p_total_{expr_type}"] = np.log1p(per_gene[total_col])

    if not inplace:
        return per_gene
    # Write every computed column into the variable table
    adata.var[per_gene.columns] = per_gene
    return None

### def calculate_qc_metrics

输入参数：
- adata，AnnData对象
- expr_type，表达类型，指矩阵中值的类型，通常为 counts 计数
- var_type，变量类型，指矩阵中列的类型，通常为 genes
- qc_vars：QC 指标，adata.var表中的列名，放到一个列表中
- percent_top：整数列表；对其中每个 n，生成一列 pct_counts_in_top_n_genes（表达量最高的 n 个基因的计数占该细胞总计数的百分比）；为 None 时跳过该计算
- layer，层名
- use_raw，使用 raw.X，与 layer 不能同时出现
- inplace，原地修改
- log1p，为计数类指标（非百分比形式的指标）额外计算 log1p 值


In [None]:
def calculate_qc_metrics(
    adata: AnnData,
    *,
    expr_type: str = "counts",
    var_type: str = "genes",
    qc_vars: Collection[str] | str = (),
    percent_top: Collection[int] | None = (50, 100, 200, 500),
    layer: str | None = None,
    use_raw: bool = False,
    inplace: bool = False,
    log1p: bool = True,
    parallel: bool | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame] | None:
    """Calculate quality control metrics.

    Computes a set of QC metrics for an AnnData object by selecting the
    expression matrix once and passing it to ``describe_obs`` and
    ``describe_var``. Largely based on `calculateQCMetrics` from scater
    :cite:p:`McCarthy2017`.

    NOTE(review): the upstream docstring says this is most efficient on a
    sparse CSR or dense matrix and that the first call can be slow because
    helper code is compiled and cached to disk; neither can be confirmed
    from this function alone.

    Parameters
    ----------
    adata
        Annotated data object.
    expr_type
        Label for the matrix values, forwarded for column naming.
    var_type
        Label for the variables, forwarded for column naming.
    qc_vars
        Name, or collection of names, of columns in ``adata.var``. A single
        string is wrapped into a one-element list.
    percent_top
        Forwarded to ``describe_obs``.
    layer, use_raw
        Forwarded to ``_choose_mtx_rep`` to select the matrix.
    inplace
        Whether to place calculated metrics in `adata`'s `.obs` and `.var`.
    log1p
        Set to `False` to skip computing `log1p` transformed annotations.
    parallel
        Deprecated. Any value other than ``None`` only emits a
        ``FutureWarning``.

    Returns
    -------
    ``None`` if ``inplace``; otherwise the pair
    ``(obs_metrics, var_metrics)`` as returned by ``describe_obs`` and
    ``describe_var``.

    Example
    -------
    Calculate qc metrics for visualization.

    .. plot::
        :context: close-figs

        import scanpy as sc
        import seaborn as sns

        pbmc = sc.datasets.pbmc3k()
        pbmc.var["mito"] = pbmc.var_names.str.startswith("MT-")
        sc.pp.calculate_qc_metrics(pbmc, qc_vars=["mito"], inplace=True)
        sns.jointplot(
            data=pbmc.obs,
            x="log1p_total_counts",
            y="log1p_n_genes_by_counts",
            kind="hex",
        )

    .. plot::
        :context: close-figs

        sns.histplot(pbmc.obs["pct_counts_mito"])

    """
    if parallel is not None:
        warn(
            "Argument `parallel` is deprecated, and currently has no effect.",
            FutureWarning,
            stacklevel=2,
        )

    # Select and clean the matrix once, then share it with both helpers
    matrix = _choose_mtx_rep(adata, use_raw=use_raw, layer=layer)
    if isinstance(matrix, sparse.coo_matrix):
        # COO matrices cannot be subscripted
        matrix = sparse.csr_matrix(matrix)  # noqa: TID251
    if isinstance(matrix, CSBase):
        matrix.eliminate_zeros()

    # Accept a bare column name as shorthand for a one-element list
    qc_var_names = [qc_vars] if isinstance(qc_vars, str) else qc_vars

    # Keyword arguments common to the per-observation and per-variable passes
    shared_kwargs = {
        "expr_type": expr_type,
        "var_type": var_type,
        "inplace": inplace,
        "x": matrix,
        "log1p": log1p,
    }

    # Per-observation metrics first, then per-variable metrics
    obs_metrics = describe_obs(
        adata,
        qc_vars=qc_var_names,
        percent_top=percent_top,
        **shared_kwargs,
    )
    var_metrics = describe_var(adata, **shared_kwargs)

    # With in-place updates both helpers have already written into `adata`
    if inplace:
        return None
    return obs_metrics, var_metrics

## 背景知识

- 矩阵求和
- 矩阵求非零元素的个数

都可以求整个矩阵，也可以按行或者列求。

### 矩阵求和
#### numpy.sum()

In [64]:
import numpy as np

# 4x4 example matrix with a handful of non-zero entries
arr = np.array([
    [1, 0, 0, 4],
    [0, 0, 3, 0],
    [2, 5, 0, 0],
    [0, 0, 0, 0]
])
arr

# Sum of every element
np.sum(arr)
# One sum per row
np.sum(arr, axis=1)
# One sum per column; the output shown below the cell is this last value
np.sum(arr, axis=0)

array([3, 5, 3, 4])

### 矩阵求非零元素数量

返回数组中 非零元素的数量（nnz 即 "number of non-zero elements"）

#### numpy.count_nonzero()

In [None]:
import numpy as np

# 1. Build a small 2-D example array
#    It mixes zeros with non-zero values (positive, negative, fractional)
arr = np.array([
    [1, 0, 3, -5],
    [0, 0, 0, 7],
    [2.5, 4, 0, 0]
])

print("原始数组:")
print(arr)
print("-" * 20)

# 2. Count the non-zero entries: whole array, per row, per column.
#    axis=1 collapses the columns and yields one count per row;
#    axis=0 collapses the rows and yields one count per column.
all_nnz = np.count_nonzero(arr)
row_nnz = np.count_nonzero(arr, axis=1)
col_nnz = np.count_nonzero(arr, axis=0)
print('all_nnz', all_nnz)
print('row_nnz', row_nnz)
print('col_nnz', col_nnz)

#### csr_matrix.getnnz

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

# 1. Build a dense matrix to serve as the raw data
dense_matrix = np.array([
    [1, 0, 0, 4],
    [0, 0, 3, 0],
    [2, 5, 0, 0],
    [0, 0, 0, 0]
])

print("原始密集矩阵:")
print(dense_matrix)
print("-" * 30)

# 2. Convert the dense matrix into a CSR-format sparse matrix
csr_mat = csr_matrix(dense_matrix)
print("CSR 格式稀疏矩阵的表示:")
print(csr_mat)
print("-" * 30)

# Count stored non-zero entries: whole matrix, per row, per column.
# axis=1 yields one count per row; axis=0 yields one count per column.
all_nnz = csr_mat.getnnz()
row_nnz = csr_mat.getnnz(axis=1)
col_nnz = csr_mat.getnnz(axis=0)
print('all_nnz', all_nnz)
print('row_nnz', row_nnz)
print('col_nnz', col_nnz)