## scanpy基础过滤

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np

In [None]:
# Load the krumsiek11 example dataset that ships with scanpy.
adata = sc.datasets.krumsiek11()
# The dataset's observation names are not unique on load; deduplicate them in place.
adata.obs_names_make_unique()

In [None]:
# Keep only cells that express at least 3 genes (in place; also writes obs["n_genes"]).
sc.pp.filter_cells(adata, min_genes=3)
# Keep only genes expressed in at least 3 cells (in place; also writes var["n_cells"]).
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Show the AnnData summary after filtering.
adata

In [None]:
# View the expression matrix as a labelled table: one row per cell, one column per gene.
df = pd.DataFrame(data=adata.X, index=adata.obs_names, columns=adata.var_names)
df

## 计数统计和是否表达统计

- 原始计数 adata.X
    - 按行求和，得到每个细胞总的计数 n_counts
    - 按列求和，得到每个基因总的计数 n_counts
- 基因是否表达 adata.X > 0
    - 按行求和，得到每个细胞表达的基因数 n_genes
    - 按列求和，得到每个基因在多少细胞中表达 n_cells

In [None]:
# Total counts per cell: sum every row of the expression matrix.
n_counts = adata.X.sum(axis=1)
print(n_counts.ndim, n_counts.shape, sep="\n")
n_counts

In [None]:
# Number of expressed genes per cell: mark entries > 0 as expressed, then sum every row.
n_genes = (adata.X > 0).sum(axis=1)
print(n_genes.ndim, n_genes.shape, sep="\n")
n_genes

In [None]:
# Total counts per gene: sum every column of the expression matrix.
# NOTE: this rebinds ``n_counts``, which an earlier cell used for the per-cell totals.
n_counts = adata.X.sum(axis=0)
print(n_counts.ndim, n_counts.shape, sep="\n")
n_counts

In [None]:
# Number of cells expressing each gene: mark entries > 0 as expressed, then sum every column.
n_cells = (adata.X > 0).sum(axis=0)
print(n_cells.ndim, n_cells.shape, sep="\n")
n_cells

## 矩阵求和测试

In [None]:
# Demonstrate how the ``axis`` argument of np.sum selects the direction of the reduction.
arr = np.arange(12).reshape(3, 4)
print("原始矩阵:\n", arr)

# axis=None (the default) collapses the whole matrix into a single scalar.
print("整体求和 (axis=None):", np.sum(arr))
# axis=0 sums down the rows, producing one total per column.
print("按列求和 (axis=0):", np.sum(arr, axis=0))
# axis=1 sums across the columns, producing one total per row; this is the
# direction used for the per-cell statistics (n_counts / n_genes).
print("按行求和 (axis=1):", np.sum(arr, axis=1))

# Element-wise comparison yields a boolean mask of the same shape; summing such a
# mask counts the True entries, which is how "is expressed" gets counted.
arr > 0

## 源码解析

### filter_cells()

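副作用：原地过滤时，每次调用会根据所用的阈值给 obs 增加一列——按计数过滤（min_counts / max_counts）时增加 n_counts（总的计数），按基因数过滤（min_genes / max_genes）时增加 n_genes（表达的基因数）。两种阈值各调用一次后，obs 才会同时拥有这两列。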

In [None]:
def filter_cells(
    data: AnnData | CSBase | np.ndarray | DaskArray,
    *,
    min_counts: int | None = None,
    min_genes: int | None = None,
    max_counts: int | None = None,
    max_genes: int | None = None,
    inplace: bool = True,
    copy: bool = False,
) -> AnnData | tuple[np.ndarray, np.ndarray] | None:
    """Filter cell outliers based on counts and numbers of genes expressed.

    For instance, only keep cells with at least `min_counts` counts or
    `min_genes` genes expressed. This is to filter measurement outliers,
    i.e. “unreliable” observations.

    Two per-cell statistics can be thresholded:

    - the total counts of the cell (sum of its row), or
    - the number of genes expressed in the cell, where a gene counts as
      expressed when its value is `> 0`.

    Each statistic accepts either a lower bound (`min_*`) or an upper
    bound (`max_*`).

    Only provide one of the optional parameters `min_counts`, `min_genes`,
    `max_counts`, `max_genes` per call.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    min_counts
        Minimum number of counts required for a cell to pass filtering.
    min_genes
        Minimum number of genes expressed required for a cell to pass filtering.
    max_counts
        Maximum number of counts allowed for a cell to pass filtering.
    max_genes
        Maximum number of genes expressed allowed for a cell to pass filtering.
    inplace
        Perform computation inplace or return result.
    copy
        Deprecated, a warning is logged when it is set; use `inplace` instead.
        For `AnnData` input with `inplace=True`, the filtering is applied to
        a copy of `data` and that copy is returned.

    Returns
    -------
    Depending on `inplace`, returns the following arrays or directly subsets
    and annotates the data matrix:

    cells_subset
        Boolean index mask that does filtering. `True` means that the
        cell is kept. `False` means the cell is removed.
    number_per_cell
        Depending on what was thresholded (`counts` or `genes`),
        the array stores `n_counts` or `n_genes` per cell.

    For `AnnData` input with `inplace=True`, the statistic is written to
    `.obs["n_counts"]` or `.obs["n_genes"]`, the object is subset, and
    `None` is returned (or the filtered copy when `copy=True`).

    Raises
    ------
    ValueError
        If the number of thresholds provided is not exactly one.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.krumsiek11()
    UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
        utils.warn_names_duplicates("obs")
    >>> adata.obs_names_make_unique()
    >>> adata.n_obs
    640
    >>> adata.var_names.tolist()  # doctest: +NORMALIZE_WHITESPACE
    ['Gata2', 'Gata1', 'Fog1', 'EKLF', 'Fli1', 'SCL',
     'Cebpa', 'Pu.1', 'cJun', 'EgrNab', 'Gfi1']
    >>> # add some true zeros
    >>> adata.X[adata.X < 0.3] = 0
    >>> # simply compute the number of genes per cell
    >>> sc.pp.filter_cells(adata, min_genes=0)
    >>> adata.n_obs
    640
    >>> int(adata.obs["n_genes"].min())
    1
    >>> # filter manually
    >>> adata_copy = adata[adata.obs["n_genes"] >= 3]
    >>> adata_copy.n_obs
    554
    >>> int(adata_copy.obs["n_genes"].min())
    3
    >>> # actually do some filtering
    >>> sc.pp.filter_cells(adata, min_genes=3)
    >>> adata.n_obs
    554
    >>> int(adata.obs["n_genes"].min())
    3

    """
    if copy:
        logg.warning("`copy` is deprecated, use `inplace` instead.")

    # Count how many of the four thresholds were actually supplied.
    n_given_options = sum(
        option is not None for option in [min_genes, min_counts, max_genes, max_counts]
    )
    # Exactly one threshold must be given; anything else is an error.
    if n_given_options != 1:
        msg = (
            "Only provide one of the optional parameters `min_counts`, "
            "`min_genes`, `max_counts`, `max_genes` per call."
        )
        raise ValueError(msg)

    # AnnData input: run this same function on the raw matrix `X`, then apply
    # the resulting mask to the annotated object.
    if isinstance(data, AnnData):
        raise_not_implemented_error_if_backed_type(data.X, "filter_cells")
        adata = data.copy() if copy else data

        # `cell_subset` is the one-dimensional boolean keep-mask; `number` is the
        # per-cell statistic that was thresholded (n_counts or n_genes).
        # NOTE(review): `materialize_as_ndarray` is defined elsewhere; presumably it
        # turns the returned pair into concrete NumPy arrays without changing the
        # values — confirm against its definition.
        cell_subset, number = materialize_as_ndarray(
            filter_cells(
                adata.X,
                min_counts=min_counts,
                min_genes=min_genes,
                max_counts=max_counts,
                max_genes=max_genes,
            ),
        )
        if not inplace:
            return cell_subset, number

        # If no gene threshold was given, the threshold must have been on counts.
        if min_genes is None and max_genes is None:
            adata.obs["n_counts"] = number
        else:
            adata.obs["n_genes"] = number
        adata._inplace_subset_obs(cell_subset)
        return adata if copy else None

    # Lower bound: the gene threshold if given, otherwise the counts threshold.
    min_number = min_counts if min_genes is None else min_genes
    # Upper bound: the gene threshold if given, otherwise the counts threshold.
    max_number = max_counts if max_genes is None else max_genes
    # Row-wise reduction (axis=1) producing one value per cell.
    # NOTE(review): `axis_sum` is defined elsewhere; presumably it behaves like
    # np.sum(data, axis=1) for the supported matrix types — confirm.
    # With no gene threshold the raw values are summed (total counts per cell);
    # with a gene threshold the matrix is first turned into an "is expressed" mask.
    number_per_cell = axis_sum(
        # Key step: this expression decides whether counts or the `> 0` mask is summed.
        data if min_genes is None and max_genes is None else data > 0, axis=1
    )
    if isinstance(number_per_cell, np.matrix):
        number_per_cell = number_per_cell.A1

    # Build the boolean keep-mask from whichever bound was given
    # (exactly one of the two is set, as validated above).
    if min_number is not None:
        cell_subset = number_per_cell >= min_number
    if max_number is not None:
        cell_subset = number_per_cell <= max_number

    # Invert the mask and sum it to get the number of removed cells;
    # report only when something was actually filtered out.
    s = axis_sum(~cell_subset)
    if s > 0:
        msg = f"filtered out {s} cells that have "
        if min_genes is not None or min_counts is not None:
            msg += "less than "
            msg += (
                f"{min_genes} genes expressed"
                if min_counts is None
                else f"{min_counts} counts"
            )
        if max_genes is not None or max_counts is not None:
            msg += "more than "
            msg += (
                f"{max_genes} genes expressed"
                if max_counts is None
                else f"{max_counts} counts"
            )
        logg.info(msg)

    # Return the keep-mask and the per-cell statistic (total counts or expressed genes).
    return cell_subset, number_per_cell


### filter_genes()

In [None]:
def filter_genes(
    data: AnnData | CSBase | np.ndarray | DaskArray,
    *,
    min_counts: int | None = None,
    min_cells: int | None = None,
    max_counts: int | None = None,
    max_cells: int | None = None,
    inplace: bool = True,
    copy: bool = False,
) -> AnnData | tuple[np.ndarray, np.ndarray] | None:
    """Filter genes based on number of cells or counts.

    Keep genes that have at least `min_counts` counts or are expressed in at
    least `min_cells` cells or have at most `max_counts` counts or are expressed
    in at most `max_cells` cells.

    Two per-gene statistics can be thresholded:

    - the number of cells expressing the gene (`min_cells` / `max_cells`),
      where a gene counts as expressed in a cell when its value is `> 0`, or
    - the total counts of the gene (`min_counts` / `max_counts`).

    Only provide one of the optional parameters `min_counts`, `min_cells`,
    `max_counts`, `max_cells` per call.

    Parameters
    ----------
    data
        An annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    min_counts
        Minimum number of counts required for a gene to pass filtering.
    min_cells
        Minimum number of cells expressed required for a gene to pass filtering.
    max_counts
        Maximum number of counts allowed for a gene to pass filtering.
    max_cells
        Maximum number of cells expressed allowed for a gene to pass filtering.
    inplace
        Perform computation inplace or return result.
    copy
        Deprecated, a warning is logged when it is set; use `inplace` instead.
        For `AnnData` input with `inplace=True`, the filtering is applied to
        a copy of `data` and that copy is returned.

    Returns
    -------
    Depending on `inplace`, returns the following arrays or directly subsets
    and annotates the data matrix

    gene_subset
        Boolean index mask that does filtering. `True` means that the
        gene is kept. `False` means the gene is removed.
    number_per_gene
        Depending on what was thresholded (`counts` or `cells`), the array stores
        `n_counts` or `n_cells` per gene.

    For `AnnData` input with `inplace=True`, the statistic is written to
    `.var["n_counts"]` or `.var["n_cells"]`, the object is subset, and
    `None` is returned (or the filtered copy when `copy=True`).

    Raises
    ------
    ValueError
        If the number of thresholds provided is not exactly one.

    """
    if copy:
        logg.warning("`copy` is deprecated, use `inplace` instead.")

    # Count how many of the four thresholds were actually supplied.
    n_given_options = sum(
        option is not None for option in [min_cells, min_counts, max_cells, max_counts]
    )

    # Exactly one threshold may be passed per call.
    if n_given_options != 1:
        msg = (
            "Only provide one of the optional parameters `min_counts`, "
            "`min_cells`, `max_counts`, `max_cells` per call."
        )
        raise ValueError(msg)

    # AnnData input: run this same function on the raw matrix `X`, then apply
    # the resulting mask to the annotated object.
    if isinstance(data, AnnData):
        raise_not_implemented_error_if_backed_type(data.X, "filter_genes")
        adata = data.copy() if copy else data
        # NOTE(review): `materialize_as_ndarray` is defined elsewhere; presumably it
        # turns the returned pair into concrete NumPy arrays — confirm.
        gene_subset, number = materialize_as_ndarray(
            filter_genes(
                adata.X,
                min_cells=min_cells,
                min_counts=min_counts,
                max_cells=max_cells,
                max_counts=max_counts,
            )
        )

        # Not filtering in place: hand back the mask and the statistic unchanged.
        if not inplace:
            return gene_subset, number

        # In-place filtering. The threshold was either on cells or on total counts;
        # store the thresholded statistic as a new `var` column named accordingly.
        if min_cells is None and max_cells is None:
            adata.var["n_counts"] = number
        else:
            adata.var["n_cells"] = number

        # Drop the filtered-out genes from the object itself.
        adata._inplace_subset_var(gene_subset)
        return adata if copy else None

    # Lower bound: the cell threshold if given, otherwise the counts threshold.
    min_number = min_counts if min_cells is None else min_cells
    # Upper bound: the cell threshold if given, otherwise the counts threshold.
    max_number = max_counts if max_cells is None else max_cells
    # Column-wise reduction (axis=0) producing one value per gene.
    # NOTE(review): `axis_sum` is defined elsewhere; presumably it behaves like
    # np.sum(data, axis=0) for the supported matrix types — confirm.
    number_per_gene = axis_sum(
        # Key step: without a cell threshold the raw counts are summed; otherwise
        # `data > 0` marks every entry True/False and the column sum becomes the
        # number of cells expressing the gene.
        data if min_cells is None and max_cells is None else data > 0, axis=0
    )
    if isinstance(number_per_gene, np.matrix):
        number_per_gene = number_per_gene.A1

    # Build the boolean keep-mask for genes from whichever bound was given
    # (exactly one of the two is set, as validated above).
    if min_number is not None:
        gene_subset = number_per_gene >= min_number
    if max_number is not None:
        gene_subset = number_per_gene <= max_number

    # Count the genes that were filtered out and report only when there are any.
    s = axis_sum(~gene_subset)
    if s > 0:
        msg = f"filtered out {s} genes that are detected "
        if min_cells is not None or min_counts is not None:
            msg += "in less than "
            msg += (
                f"{min_cells} cells" if min_counts is None else f"{min_counts} counts"
            )
        if max_cells is not None or max_counts is not None:
            msg += "in more than "
            msg += (
                f"{max_cells} cells" if max_counts is None else f"{max_counts} counts"
            )
        logg.info(msg)
    # Return the keep-mask and the per-gene statistic (total counts or expressing cells).
    return gene_subset, number_per_gene

## Debug

In [None]:
import scanpy as sc

# Stop in the debugger before any scanpy call so the filter functions can be stepped into.
breakpoint()
adata = sc.datasets.krumsiek11()
adata.obs_names_make_unique()
# Choose which filter to step through by leaving exactly one of the calls below active.
#sc.pp.filter_cells(adata, min_genes=3)
sc.pp.filter_genes(adata, min_cells=3)
adata