# GSE23385空转数据
|Sample ID|实验分组|分组代号|
|---------|-------|------|
|-|sham|sham|
|-|pMCAO 第 1 天|D1|
|-|pMCAO 第 3 天|D3|
|-|pMCAO 第 7 天|D7|


## 数据读取与转换

In [11]:
%reset -f
import pandas as pd
import numpy as np
import anndata as ad
import scipy.sparse as sp
from os.path import join
import scanpy as sc
import yaml
import os

print(os.getcwd())
sc.settings.verbosity = 3
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

def csv2hdf(folder = "", prefix = "", output = "output.h5ad"):
    # 1. 读取数据
    # 假设你在 R 中导出的文件都在当前工作目录
    expr_data_df = pd.read_csv(join(folder, f"{prefix}expression_data.csv"), index_col=0) # index_col=0 确保第一列是索引
    cell_meta_df = pd.read_csv(join(folder, f"{prefix}cell_metadata.csv"), index_col=0)
    gene_meta_df = pd.read_csv(join(folder, f"{prefix}gene_metadata.csv"), index_col=0) # 如果有的话
    # 确保数据的行和列与 AnnData 的要求一致
    # AnnData 的 X 通常是 (n_cells, n_genes)
    # Seurat 的表达矩阵通常是 (n_genes, n_cells)，所以需要转置
    # 原始数据是稀疏的，可以转换为稀疏矩阵以节省内存
    X = sp.csr_matrix(expr_data_df.T)

    # 2. 构建 AnnData 对象
    # obs 是细胞元数据 (index 必须与 X 的行名匹配)
    # var 是基因元数据 (index 必须与 X 的列名匹配)
    adata = ad.AnnData(
        X=X,
        obs=cell_meta_df,
        var=gene_meta_df # 如果有的话
    )

    # 3. 添加降维嵌入
    # adata.obsm['X_pca'] = pca_coords_df.loc[adata.obs_names].values # 确保顺序匹配
    # adata.obsm['X_umap'] = umap_coords_df.loc[adata.obs_names].values

    # 4. 添加聚类信息到 obs
    # adata.obs['seurat_clusters'] = clusters_df.loc[adata.obs_names, 'clusters']

    # 5. 保存为 .h5ad 文件
    adata.write(output)

    return adata
 
targets = pd.DataFrame({
    "id": ["GSM7437217", "GSM7437218", "GSM7437219", "GSM7437220"],
    "species": ["wt", "wt", "wt", "wt"],
    "group": ["sham", "mcao_d1", "mcao_d3", "mcao_d7"],
    "prefix": ["sham_", "d1_", "d3_", "d7_"]
})
targets.to_csv("data/spatial_targets.csv", index=None)
targets

/Volumes/MacPassport/project/bioinfo/GSE233815


Unnamed: 0,id,species,group,prefix
0,GSM7437217,wt,sham,sham_
1,GSM7437218,wt,mcao_d1,d1_
2,GSM7437219,wt,mcao_d3,d3_
3,GSM7437220,wt,mcao_d7,d7_


In [13]:
adatas={}

for _,row in targets.iterrows():
    sample_adata = csv2hdf(folder="data/spatial/GSE233813_csv", prefix=row["prefix"])
    sample_adata.obs.columns = ["location", 'nCount_Spatial', 'nFeature_Spatial', 'sample', 'condition']
    adatas[row["id"]] = sample_adata

# 合并所有样本
adata_raw = ad.concat(adatas=adatas, label="sample")
adata_raw.raw = adata_raw.copy()
adata_raw.write(f"data/{config['project_code']}_sp_raw.h5ad")
adata_raw

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 10173 × 32285
    obs: 'location', 'nCount_Spatial', 'nFeature_Spatial', 'sample', 'condition'