Sanity Check

In [41]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
from scipy import io

# Sanity check of the preprocessed RNA/ATAC AnnData files and of the matrices
# exported for scAI. Everything is read from RESULTS_DIR; nothing is written.
RESULTS_DIR = Path(r"D:\HuaweiMoveData\Users\hya\Desktop\CMML\ICA2\Results")


def _read_labels(filename: str, column: str) -> list:
    """Return one column of a tab-separated label file in RESULTS_DIR as a list."""
    return pd.read_csv(RESULTS_DIR / filename, sep="\t")[column].tolist()


def _column_sums(matrix) -> np.ndarray:
    """Return the per-column sums of a sparse matrix as a flat 1-D array.

    ``sum(axis=0)`` gives an ``np.matrix`` for scipy sparse matrices and a
    plain ``ndarray`` for scipy sparse arrays. ``np.asarray(...).ravel()``
    handles both, whereas ``.A1`` exists only on ``np.matrix``.
    """
    return np.asarray(matrix.sum(axis=0)).ravel()


# 1) Inspect the preprocessed RNA AnnData
adata_rna = sc.read_h5ad(RESULTS_DIR / "RNA_preprocessed.h5ad")
print("=== RNA preprocessed AnnData ===")
print(adata_rna)

# 2) Inspect the preprocessed ATAC AnnData
adata_atac = sc.read_h5ad(RESULTS_DIR / "ATAC_preprocessed.h5ad")
print("\n=== ATAC preprocessed AnnData ===")
print(adata_atac)

# 3) Load the normalized matrices exported for scAI
#    (RNA: after log1p; ATAC: after TF-IDF).
#    mmread is documented for str / file-like sources, so pass plain strings.
rna_norm = io.mmread(str(RESULTS_DIR / "scAI_rna_counts.mtx")).tocsr()
atac_tfidf = io.mmread(str(RESULTS_DIR / "scAI_atac_counts.mtx")).tocsr()

# 4) Check dimensions against the label files.
#    Both matrices are expected as features x cells (rows = genes/peaks,
#    columns = cell barcodes), which is what the asserts below enforce.
rna_genes = _read_labels("scAI_rna_genes.tsv", "gene")
rna_cells = _read_labels("scAI_rna_barcodes.tsv", "cell")

assert rna_norm.shape == (len(rna_genes), len(rna_cells)), "RNA 矩阵形状与标签不匹配"
print("RNA 矩阵 shape:", rna_norm.shape)

atac_peaks = _read_labels("scAI_atac_peaks.tsv", "peak")
atac_cells = _read_labels("scAI_atac_barcodes.tsv", "cell")

assert atac_tfidf.shape == (len(atac_peaks), len(atac_cells)), "ATAC 矩阵形状与标签不匹配"
print("ATAC 矩阵 shape:", atac_tfidf.shape)

# 5) Non-negativity check (normalized matrices may hold floats).
#    Only the explicitly stored entries (.data) need to be inspected.
assert np.all(rna_norm.data >= 0), "RNA 矩阵存在负值"
assert np.all(atac_tfidf.data >= 0), "ATAC 矩阵存在负值"
print("✔ 非负值检查通过")

# 6) Distribution summary of the per-cell (per-column) totals
print("\n--- RNA per-cell 总和（log1p 后）---")
print(pd.Series(_column_sums(rna_norm)).describe())

print("\n--- ATAC per-cell 总和（TF–IDF 后）---")
print(pd.Series(_column_sums(atac_tfidf)).describe())

# 7) Shared-cell check.
#    NOTE(review): this only reports the size of the barcode set overlap; it
#    does not compare the column order of the two matrices — confirm the
#    ordering separately if the downstream tool needs aligned columns.
common = set(rna_cells) & set(atac_cells)
print(f"\n共享 cells：{len(common)} / RNA {len(rna_cells)} / ATAC {len(atac_cells)}")

print("\nSanity check 完成，归一化矩阵已可用于 scAI 输入！")

=== RNA preprocessed AnnData ===
AnnData object with n_obs × n_vars = 1081 × 2000
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'n_genes'
    var: 'gene_ids', 'feature_types', 'genome', 'interval', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'log1p'

=== ATAC preprocessed AnnData ===
AnnData object with n_obs × n_vars = 1081 × 77242
    obs: 'n_fragments'
    obsm: 'X_tfidf'
RNA 矩阵 shape: (2000, 1081)
ATAC 矩阵 shape: (77242, 1081)
✔ 非负值检查通过

--- RNA per-cell 总和（log1p 后）---
count    1081.000000
mean      215.878005
std        41.057185
min       118.199217
25%       189.725395
50%       207.043906
75%       229.262270
max       400.052728
dtype: float64

--- ATAC per-cell 总和（TF–IDF 后）---
count    1.081000e+03
mean     1.000000e+00
std      3.638885e-15
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%     