## 读取10x数据

### sc.read_10x_mtx()

In [5]:
from __future__ import annotations

import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc

# Load the 10x matrix directory into an AnnData object.
adata = sc.read_10x_mtx(
    path="data/filtered_gene_bc_matrices/hg19/",  # directory holding the `.mtx` file
    cache=True,  # write a cache file so later reads are faster
    var_names="gene_symbols",  # index the variables axis by gene symbol
)
# Not needed when `var_names='gene_ids'` is passed to `sc.read_10x_mtx`.
adata.var_names_make_unique()

In [6]:
# Echo the object so the notebook displays its summary.
adata

AnnData object with n_obs × n_vars = 2700 × 32738
    var: 'gene_ids'

### 源码解析

In [None]:
# scanpy/readwrite.py

def read_10x_mtx(
    path: Path | str,
    *,
    var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols",
    make_unique: bool = True,
    cache: bool = False,
    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
    gex_only: bool = True,
    prefix: str | None = None,
) -> AnnData:
    """Read a 10x-Genomics-formatted mtx directory.

    Parameters
    ----------
    path
        Path to the directory holding the `.mtx` and `.tsv` files,
        e.g. './filtered_gene_bc_matrices/hg19/'.
        A legacy directory contains three files: `matrix.mtx`,
        `barcodes.tsv` and `genes.tsv`.
    var_names
        The variables index: either gene symbols (the default) or gene IDs.
    make_unique
        Whether to make the variables index unique by appending '-1',
        '-2' etc. to repeated names, or not.
    cache
        If `False`, read from source, if `True`, read from fast 'h5ad' cache.
    cache_compression
        See the h5py :ref:`dataset_compression`.
        (Default: `settings.cache_compression`)
    gex_only
        Only keep 'Gene Expression' data and ignore other feature types,
        e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'.
        Has no effect on legacy directories, which are returned unfiltered.
    prefix
        Any prefix before `matrix.mtx`, `genes.tsv` and `barcodes.tsv`. For instance,
        if the files are named `patientA_matrix.mtx`, `patientA_genes.tsv` and
        `patientA_barcodes.tsv` the prefix is `patientA_`.
        (Default: no prefix)

    Returns
    -------
    An :class:`~anndata.AnnData` object

    """
    base_dir = Path(path)
    if prefix is None:
        prefix = ""

    # The presence of `<prefix>genes.tsv` marks the legacy layout.
    legacy_layout = (base_dir / f"{prefix}genes.tsv").is_file()
    adata = _read_10x_mtx(
        base_dir,
        var_names=var_names,
        make_unique=make_unique,
        cache=cache,
        cache_compression=cache_compression,
        prefix=prefix,
        is_legacy=legacy_layout,
    )

    # Filter only for the non-legacy layout when `gex_only` is requested;
    # legacy data, or `gex_only=False`, is returned as read.
    if gex_only and not legacy_layout:
        is_gene_expression = adata.var["feature_types"] == "Gene Expression"
        return adata[:, is_gene_expression].copy()
    return adata

In [None]:
def _read_10x_mtx(
    path: Path,
    *,
    var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols",
    make_unique: bool = True,
    cache: bool = False,
    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
    prefix: str = "",
    is_legacy: bool,
) -> AnnData:
    """Read mex from output from Cell Ranger v2- or v3+.

    With `is_legacy=True` the uncompressed `matrix.mtx`, `genes.tsv` and
    `barcodes.tsv` are read; otherwise `matrix.mtx.gz`, `features.tsv.gz`
    and `barcodes.tsv.gz` are read. `prefix` is prepended to each file name.
    """
    # Non-legacy files end in `.gz` and name the table `features`;
    # legacy files have no suffix and name it `genes`.
    if is_legacy:
        suffix = ""
        table_stem = "genes"
    else:
        suffix = ".gz"
        table_stem = "features"

    # Expression matrix, transposed so that rows are cells and columns are genes.
    matrix = read(
        path / f"{prefix}matrix.mtx{suffix}",
        cache=cache,
        cache_compression=cache_compression,
    )
    adata = matrix.T

    # Feature table: column 0 = ID, column 1 = name, column 2 (non-legacy
    # only) = feature type. Either the names or the IDs become the index of
    # `adata.var`; the other is stored as a regular column.
    feature_table = pd.read_csv(
        path / f"{prefix}{table_stem}.tsv{suffix}",
        header=None,
        sep="\t",
    )
    if var_names == "gene_symbols":
        symbol_index = pd.Index(feature_table[1].values)
        if make_unique:
            # Repeated symbols get a numeric suffix; the number of entries
            # is unchanged.
            symbol_index = anndata.utils.make_index_unique(symbol_index)
        adata.var_names = symbol_index
        adata.var["gene_ids"] = feature_table[0].values
    elif var_names == "gene_ids":
        # No de-duplication is applied here — presumably IDs are already
        # unique upstream (not verified in this file).
        adata.var_names = feature_table[0].values
        adata.var["gene_symbols"] = feature_table[1].values
    else:
        raise ValueError("`var_names` needs to be 'gene_symbols' or 'gene_ids'")

    if not is_legacy:
        adata.var["feature_types"] = feature_table[2].values

    # Barcodes become `adata.obs_names`, i.e. the index of `adata.obs`.
    barcode_table = pd.read_csv(path / f"{prefix}barcodes.tsv{suffix}", header=None)
    adata.obs_names = barcode_table[0].values
    return adata

In [None]:
# Read only the expression matrix.
adata = sc.read("data/filtered_gene_bc_matrices/hg19/matrix.mtx")
# Gene table (tab-separated, no header row).
genes = pd.read_csv(
    "data/filtered_gene_bc_matrices/hg19/genes.tsv",
    sep="\t",
    header=None,
)
# Cell barcode list (no header row).
barcodes = pd.read_csv(
    "data/filtered_gene_bc_matrices/hg19/barcodes.tsv",
    header=None,
)

### 探索AnnData结构

obs, var 是一维注释（数据框）

In [None]:
class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
    # Excerpt: only the obs/var annotation accessors and their index helpers.

    def _set_dim_index(self, value: pd.Index, attr: str):
        """Install `value` as the index of the `attr` annotation ("obs" or "var").

        DataFrames stored in the matching `_{attr}m` mapping are re-indexed
        as well.
        """
        # Assumes _prep_dim_index has been run
        if self.is_view:
            # Replace the view with an actual copy before modifying it.
            self._init_as_actual(self.copy())
        annotation = getattr(self, attr)
        annotation.index = value
        multi_dim = getattr(self, f"_{attr}m")
        for entry in multi_dim.values():
            if not isinstance(entry, pd.DataFrame):
                continue
            entry.index = value

    @property
    def obs(self) -> pd.DataFrame | Dataset2D:
        """One-dimensional annotation of observations (`pd.DataFrame`)."""
        return self._obs

    @obs.setter
    def obs(self, value: pd.DataFrame | XDataset):
        self._set_dim_df(value, "obs")

    @obs.deleter
    def obs(self):
        # Deleting resets to an empty frame that keeps the current index.
        empty = pd.DataFrame({}, index=self.obs_names)
        self.obs = empty

    @property
    def obs_names(self) -> pd.Index:
        """Names of observations (alias for `.obs.index`)."""
        return self.obs.index

    @obs_names.setter
    def obs_names(self, names: Sequence[str]):
        prepared = self._prep_dim_index(names, "obs")
        self._set_dim_index(prepared, "obs")

    @property
    def var(self) -> pd.DataFrame | Dataset2D:
        """One-dimensional annotation of variables/ features (`pd.DataFrame`)."""
        return self._var

    @var.setter
    def var(self, value: pd.DataFrame | XDataset):
        self._set_dim_df(value, "var")

    @var.deleter
    def var(self):
        # Deleting resets to an empty frame that keeps the current index.
        empty = pd.DataFrame({}, index=self.var_names)
        self.var = empty

    @property
    def var_names(self) -> pd.Index:
        """Names of variables (alias for `.var.index`)."""
        return self.var.index

    @var_names.setter
    def var_names(self, names: Sequence[str]):
        prepared = self._prep_dim_index(names, "var")
        self._set_dim_index(prepared, "var")

    def var_names_make_unique(self, join: str = "-"):
        """Make the variable names unique, using `join` as the separator."""
        # Go through the setter so that varm dataframes are re-indexed too.
        deduplicated = utils.make_index_unique(self.var.index, join)
        self.var_names = deduplicated

    def obs_names_make_unique(self, join: str = "-"):
        """Make the observation names unique, using `join` as the separator."""
        # Go through the setter so that obsm dataframes are re-indexed too.
        deduplicated = utils.make_index_unique(self.obs.index, join)
        self.obs_names = deduplicated

In [49]:
# Make the variable names unique in place, then echo the object to display its summary.
adata.var_names_make_unique()
adata

AnnData object with n_obs × n_vars = 2700 × 32738
    var: 'gene_ids'