# Leiden聚类

In [None]:
from __future__ import annotations

import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc

# Global scanpy settings for this notebook: logging level, figure defaults,
# and a printed header describing the environment.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.set_figure_params(dpi=80, facecolor="white")
sc.logging.print_header()


# Load the 10x matrix into `adata`; every later step in this notebook
# mutates or rebinds this object.
adata = sc.read_10x_mtx(
    "data/filtered_gene_bc_matrices/hg19/",  # the directory with the `.mtx` file
    var_names="gene_symbols",  # use gene symbols for the variable names (variables-axis index)
    cache=True,  # write a cache file for faster subsequent reading
)
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

# Basic filtering, driven by `min_genes` (per cell) and `min_cells` (per gene).
sc.pp.filter_cells(adata, min_genes=200)  # this does nothing, in this specific case
sc.pp.filter_genes(adata, min_cells=3)

# annotate the group of mitochondrial genes as "mt"
# (a boolean column: True where the gene name starts with "MT-")
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# Compute QC metrics in place; the columns plotted and filtered on below
# (`n_genes_by_counts`, `total_counts`, `pct_counts_mt`) are read from `adata.obs`.
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True)

# Violin plots of the three QC metrics, one panel per metric.
sc.pl.violin(
    adata,
    ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    jitter=0.4,
    multi_panel=True,
)

# Two side-by-side scatter plots of total counts against the other two metrics.
# `show=False` together with `ax=` draws into the prepared axes; the trailing
# `;` suppresses the notebook's echo of the last expression.
fig, axs = plt.subplots(1, 2, figsize=(10, 4), layout="constrained")
sc.pl.scatter(adata, x="total_counts", y="pct_counts_mt", show=False, ax=axs[0])
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts", show=False, ax=axs[1]);

# Keep only cells with 200 < n_genes_by_counts < 2500 and pct_counts_mt < 5.
# `.copy()` turns the sliced result into an independent object before it is
# modified further.
adata = adata[
    (adata.obs.n_genes_by_counts < 2500) & (adata.obs.n_genes_by_counts > 200) & (adata.obs.pct_counts_mt < 5),
    :,
].copy()
# Snapshot the current `X` into a "counts" layer before the normalization steps
# that follow; `highly_variable_genes` is later called with `layer="counts"`.
adata.layers["counts"] = adata.X.copy()

# Normalize each cell to a total of 1e4, then log1p-transform `adata.X`.
sc.pp.normalize_total(adata, target_sum=1e4)

sc.pp.log1p(adata)


# Select highly variable genes from the "counts" layer saved earlier
# (presumably because the "seurat_v3" flavor expects raw counts -- confirm).
# NOTE(review): with `n_top_genes` set, the `min_mean` / `max_mean` / `min_disp`
# cutoffs are presumably not used for the selection -- confirm against the
# scanpy documentation before relying on them.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    n_top_genes=2000,
    min_mean=0.0125,
    max_mean=3,
    min_disp=0.5,
    flavor="seurat_v3",
)

sc.pl.highly_variable_genes(adata)


# Build a dense "scaled" layer from `X` and do the regression and scaling on
# that layer (via `layer="scaled"`), so `adata.X` itself is not the target of
# these two calls.
adata.layers["scaled"] = adata.X.toarray()
sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt"], layer="scaled")
sc.pp.scale(adata, max_value=10, layer="scaled")

# PCA on the "scaled" layer, followed by three diagnostic plots.
sc.pp.pca(adata, layer="scaled", svd_solver="arpack")
sc.pl.pca(adata, annotate_var_explained=True, color="CST3")
sc.pl.pca_variance_ratio(adata, n_pcs=20)
sc.pl.pca_loadings(adata, components=(1, 2), include_lowest=True)

# Compute the neighborhood graph (10 neighbors, 40 PCs). The Leiden clustering
# later in this notebook requires this step to have been run first.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

# Notebook idiom: a bare expression as the last line displays `adata` as the cell output.
adata

## sc.tl.leiden()

In [None]:
# Run Leiden clustering on `adata` with the igraph implementation.
# The call below does not pass `key_added`, `neighbors_key`, `obsp` or
# `adjacency`, so the function's defaults apply for those.
sc.tl.leiden(
    adata,
    resolution=0.7,
    random_state=0,
    flavor="igraph",
    n_iterations=2,
    directed=False,
)
# Disabled lines kept from the original notebook (purpose not evident from this
# cell; note that no UMAP is computed in the cells shown above):
#adata.obs["leiden"] = adata.obs["leiden"].copy()
#adata.uns["leiden"] = adata.uns["leiden"].copy()
#adata.obsm["X_umap"] = adata.obsm["X_umap"].copy()

# 源码解析

## sc.tl.leiden()

`sc.tl.leiden()` 默认使用 `adata.obsp['connectivities']` 作为输入。因此聚类只依赖于 `sc.pp.neighbors` 计算出的连接关系（邻域图），不依赖 UMAP。

In [None]:
def leiden(  # noqa: PLR0912, PLR0913, PLR0915
    adata: AnnData,
    resolution: float = 1,
    *,
    restrict_to: tuple[str, Sequence[str]] | None = None,
    random_state: _LegacyRandom = 0,
    key_added: str = "leiden",
    adjacency: CSBase | None = None,
    directed: bool | None = None,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: type[MutableVertexPartition] | None = None,
    neighbors_key: str | None = None,
    obsp: str | None = None,
    copy: bool = False,
    flavor: Literal["leidenalg", "igraph"] = "leidenalg",
    **clustering_args,
) -> AnnData | None:
    """Cluster cells into subgroups :cite:p:`Traag2019`.

    Cluster cells using the Leiden algorithm :cite:p:`Traag2019`,
    an improved version of the Louvain algorithm :cite:p:`Blondel2008`.
    It was proposed for single-cell analysis by :cite:t:`Levine2015`.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    NOTE: this is an abridged, annotated excerpt used for a source-code
    walkthrough. Each bare ``pass`` statement stands in for code that was
    omitted. As shown here the body is not runnable: ``part`` is read when
    building ``groups`` but is never assigned in the visible code (presumably
    it is produced by the omitted clustering step).

    Parameters
    ----------
    resolution
        Resolution of the clustering.
    flavor
        Which package's implementation to use.
    neighbors_key
        By default, ``.obsp['connectivities']`` is used for connectivities.
    obsp
        Use ``.obsp[obsp]`` as the adjacency. Cannot be specified together
        with ``neighbors_key``.

    Returns
    -------
    ``adata`` if ``copy`` is true, otherwise ``None``. The cluster labels are
    written to ``adata.obs[key_added]`` and the parameters to
    ``adata.uns[key_added]["params"]``.

    Raises
    ------
    ValueError
        If ``flavor`` is neither ``"igraph"`` nor ``"leidenalg"``.
    """
    # Validate the requested implementation.
    if flavor not in {"igraph", "leidenalg"}:
        msg = (
            f"flavor must be either 'igraph' or 'leidenalg', but {flavor!r} was passed"
        )
        raise ValueError(msg)

    # (omitted code)
    pass

    # Key step: if the caller did not pass an adjacency matrix, fall back to
    # the graph stored on `adata`.
    if adjacency is None:
        # Observed in the debugger for the call made in this notebook:
        # (Pdb) adjacency is adata.obsp['connectivities']
        # True
        adjacency = _utils._choose_graph(adata, obsp, neighbors_key)

    # (omitted code)
    pass

    if flavor == "leidenalg":
        # (leidenalg branch omitted)
        pass
    else:
        # igraph: build the graph from the adjacency; note that `directed=False`
        # is passed as a literal here rather than the `directed` argument.
        g = _utils.get_igraph_from_adjacency(adjacency, directed=False)

    # Store the result in adata.obs as a categorical of string labels,
    # with the categories in natural sort order.
    # With the default argument, key_added = 'leiden'
    groups = np.array(part.membership)
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype("U"),
        categories=natsorted(map(str, np.unique(groups))),
    )

    # Record the parameters in adata.uns[key_added]. For this notebook's call:
    # (Pdb) adata.uns['leiden']
    # {'params': {'resolution': 0.7, 'random_state': 0, 'n_iterations': 2}}
    adata.uns[key_added] = {}
    adata.uns[key_added]["params"] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
    )

    # (omitted code)
    pass

    return adata if copy else None

## _choose_graph()

In [None]:
def _choose_graph(
    adata: AnnData, obsp: str | None, neighbors_key: str | None
) -> CSBase:
    """Pick the graph to cluster on: an explicit `obsp` entry or a neighbors result.

    Parameters
    ----------
    adata
        Object whose `.obsp` mapping (or neighbors view) holds the graph.
    obsp
        Key into `adata.obsp`; when given, that entry is returned as-is.
    neighbors_key
        Key passed on to `NeighborsView` when `obsp` is not given.

    Returns
    -------
    `adata.obsp[obsp]` if `obsp` is given, otherwise the "connectivities"
    entry of the neighbors view.

    Raises
    ------
    ValueError
        If both `obsp` and `neighbors_key` are given, or if the neighbors view
        has no "connectivities" entry.
    """
    obsp_given = obsp is not None
    if obsp_given and neighbors_key is not None:
        raise ValueError(
            "You can't specify both obsp, neighbors_key. Please select only one."
        )

    # An explicit `obsp` key takes priority.
    if obsp_given:
        return adata.obsp[obsp]

    # Otherwise resolve the graph through the neighbors view.
    view = NeighborsView(adata, neighbors_key)
    if "connectivities" in view:
        # Observed in the debugger with this notebook's default arguments:
        # (Pdb) neighbors["connectivities"] is adata.obsp['connectivities']
        # True
        return view["connectivities"]
    raise ValueError(
        "You need to run `pp.neighbors` first to compute a neighborhood graph."
    )