In [1]:
# Step 1: install the core scientific-computing packages
import sys
print(f"Python版本: {sys.version}")
print("安装核心科学计算包...")

# NOTE(review): no versions are pinned, so a re-run may resolve to different releases.
%pip install pandas numpy matplotlib seaborn scipy


Python版本: 3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)]
安装核心科学计算包...
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Step 2: install scanpy (the core single-cell analysis package)
print("安装scanpy...")
%pip install scanpy


安装scanpy...
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Step 3: smoke-test the basic imports
print("测试基本包导入...")
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import scanpy as sc
    print("✓ 基本包导入成功")
except ImportError as e:
    # The failure is only reported (not re-raised), so this cell never errors out.
    print(f"✗ 导入失败: {e}")
    print("请重启内核后重新运行前面的安装步骤")


测试基本包导入...
✓ 基本包导入成功


In [4]:
# Step 4: install the optional advanced packages (only if needed)
import subprocess
import sys

print("安装可选的高级包...")
print("注意: 这些包是可选的，如果安装失败可以跳过")

# gseapy    -> pathway enrichment analysis
# harmonypy -> batch correction
# Both are optional: a failed install is reported and the loop moves on.
for optional_package in ("gseapy", "harmonypy"):
    print(f"尝试安装 {optional_package}...")
    try:
        # sys.executable is the interpreter this kernel runs on, so pip installs
        # into the same environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", optional_package])
        print(f"✓ {optional_package} 安装成功")
    except Exception as e:
        print(f"⚠️ {optional_package} 安装失败: {e}")
        print("可以跳过，后续会使用替代方法")


安装可选的高级包...
注意: 这些包是可选的，如果安装失败可以跳过
尝试安装 gseapy...
✓ gseapy 安装成功
尝试安装 harmonypy...
✓ harmonypy 安装成功


In [5]:
# Step 5: defensive environment setup
print("开始安全的环境设置...")

# Import and check each library one at a time
import os
import warnings
warnings.filterwarnings('ignore')  # suppresses every warning from here on

print("✓ 基本库导入成功")

# Check that scanpy imports
try:
    import scanpy as sc
    print("✓ scanpy 导入成功")

    # Apply the scanpy settings defensively
    try:
        sc.settings.verbosity = 1  # lower the log level to avoid excessive output
        print("✓ scanpy 基本设置完成")
    except Exception as e:
        print(f"⚠️ scanpy 设置出现问题: {e}")
        print("使用默认设置继续")

except ImportError as e:
    print(f"✗ scanpy 导入失败: {e}")
    print("请重新安装 scanpy: pip install scanpy")
    raise  # unlike the matplotlib check below, a missing scanpy aborts the cell

# Check that matplotlib imports, then configure it
try:
    import matplotlib.pyplot as plt
    print("✓ matplotlib 导入成功")

    # Basic plotting defaults
    plt.rcParams['figure.figsize'] = (8, 6)
    plt.rcParams['figure.dpi'] = 80
    print("✓ matplotlib 设置完成")

except Exception as e:
    print(f"⚠️ matplotlib 设置出现问题: {e}")

print("✓ 环境设置完成！")
print("如果上述步骤都显示成功，现在可以开始单细胞分析了。")


开始安全的环境设置...
✓ 基本库导入成功
✓ scanpy 导入成功
✓ scanpy 基本设置完成
✓ matplotlib 导入成功
✓ matplotlib 设置完成
✓ 环境设置完成！
如果上述步骤都显示成功，现在可以开始单细胞分析了。


In [6]:
# Environment check and data-path setup
import importlib
import os  # imported here so the cell does not depend on an earlier cell having run

print("检查运行环境...")

# Importable module name -> alias it is bound to in the notebook namespace
required_packages = {
    'pandas': 'pd',
    'numpy': 'np',
    'matplotlib.pyplot': 'plt',
    'seaborn': 'sns',
    'scanpy': 'sc'
}

missing_packages = []
for package, alias in required_packages.items():
    try:
        # Same effect as `import <package> as <alias>` without exec-ing a built
        # string: import_module returns the (sub)module named by the dotted path.
        globals()[alias] = importlib.import_module(package)
        print(f"✓ {package}")
    except ImportError:
        print(f"✗ {package} - 需要安装")
        missing_packages.append(package)

if missing_packages:
    print(f"\n⚠️ 缺少以下包: {', '.join(missing_packages)}")
    print("请先安装缺少的包后再继续")
else:
    print("\n✅ 所有必要的包都已安装")

    # Data locations, relative to the notebook's working directory
    print("\n设置数据路径...")
    counts_path = '../data/raw/singlecell/UMIcounts_HGSC.tsv'
    annotation_path = '../data/raw/singlecell/annotation_HGSC.tsv'
    output_dir = '../results/single_cell_analysis/'

    # Create the output directory; exist_ok covers the case where it appears
    # between the check and the call.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"✓ 创建输出目录: {output_dir}")
    else:
        print(f"✓ 输出目录已存在: {output_dir}")

    # Report whether each input file is present (missing files are not fatal here)
    print("\n检查数据文件...")
    for file_label, file_path in (("UMI计数文件", counts_path), ("注释文件", annotation_path)):
        if os.path.exists(file_path):
            print(f"✓ {file_label}存在: {file_path}")
        else:
            print(f"✗ {file_label}不存在: {file_path}")
            print("请检查文件路径或下载数据文件")

    print("\n🚀 环境检查完成，可以开始分析！")


检查运行环境...
✓ pandas
✓ numpy
✓ matplotlib.pyplot
✓ seaborn
✓ scanpy

✅ 所有必要的包都已安装

设置数据路径...
✓ 创建输出目录: ../results/single_cell_analysis/

检查数据文件...
✓ UMI计数文件存在: ../data/raw/singlecell/UMIcounts_HGSC.tsv
✓ 注释文件存在: ../data/raw/singlecell/annotation_HGSC.tsv

🚀 环境检查完成，可以开始分析！


In [8]:
# 检查系统资源和环境
import psutil
import sys
import platform

print("=== 系统信息 ===")
print(f"操作系统: {platform.system()} {platform.release()}")
print(f"Python版本: {sys.version}")

# Sample memory once so the figures printed and the value tested below come from
# the same measurement.
memory_snapshot = psutil.virtual_memory()
bytes_per_gib = 1024**3
print(f"总内存: {memory_snapshot.total / bytes_per_gib:.2f} GB")
print(f"可用内存: {memory_snapshot.available / bytes_per_gib:.2f} GB")
print(f"CPU核心数: {psutil.cpu_count()}")

# Warn when less than 4 GB is available for the analysis
if memory_snapshot.available < 4 * bytes_per_gib:
    print("⚠️  警告: 可用内存可能不足，建议关闭其他程序或使用更小的数据集进行测试")
else:
    print("✅ 内存充足，可以进行分析")


=== 系统信息 ===
操作系统: Windows 10
Python版本: 3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)]
总内存: 15.19 GB
可用内存: 6.54 GB
CPU核心数: 16
✅ 内存充足，可以进行分析


In [1]:
# 如果您使用的是conda环境，建议先用conda安装一些基础包
# conda install -c conda-forge scanpy python-igraph leidenalg

# 然后使用pip安装其他包
%pip install scanpy gseapy scikit-misc harmony-pytorch pandas matplotlib seaborn


Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting scanpy
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/34/cd/50e2af91c8d64722958cc53ca59f3e2bdc05ae4a5b82d78672cfd2ec7b3c/scanpy-1.11.3-py3-none-any.whl (2.1 MB)
Collecting scikit-misc
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/98/7a/0ef8646dac8bcac7151c5dd3e604a33c497d6d093f4db538e25656144394/scikit_misc-0.5.1-cp311-cp311-win_amd64.whl (157 kB)
Collecting harmony-pytorch
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/75/da/42486f1c79b6f2db9140ee23161791e5b25d9369f30c1d9f67b67f3eb4bf/harmony_pytorch-0.1.8-py3-none-any.whl (8.5 kB)
Collecting anndata>=0.8 (from scanpy)
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/0a/4b/ab615fea52e34579d5c6c7dba86b4f9d7f3cdb6a170b348ec49f34cf4355/anndata-0.11.4-py3-none-any.whl (144 kB)
Collecting h5py>=3.7.0 (from scanpy)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/db/0c/6c3f879a0f8e891625817637fad902da6e764e36919ed091dc775

In [9]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


: 

In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


: 

In [None]:
# 导入所有需要的库
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): this cell is a verbatim repeat of the setup cell earlier in the
# notebook; re-running it changes nothing and it can be deleted.
# Scanpy logging and figure defaults
sc.settings.verbosity = 3  # verbosity level of scanpy's log output
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Chinese font support: preferred sans-serif fonts in fallback order, and
# unicode_minus disabled for axis tick labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

print("环境设置完成！")


: 

In [None]:
# --- Data loading ---

# File locations (adjust to your project layout).
# Assumes the notebook lives in `notebooks/` and the data in `data/raw/singlecell/`.
counts_path = '../data/raw/singlecell/UMIcounts_HGSC.tsv'
annotation_path = '../data/raw/singlecell/annotation_HGSC.tsv'
output_dir = '../results/single_cell_analysis/'

# Create the output directory when it is not there yet
output_dir_exists = os.path.exists(output_dir)
if not output_dir_exists:
    os.makedirs(output_dir)

# Echo the configured locations, one per line
print(
    f"数据路径:",
    f"  UMI计数文件: {counts_path}",
    f"  注释文件: {annotation_path}",
    f"  输出目录: {output_dir}",
    sep="\n",
)

# Report whether each input file is present
print("✓ UMI计数文件存在" if os.path.exists(counts_path) else "✗ UMI计数文件不存在，请检查路径")
print("✓ 注释文件存在" if os.path.exists(annotation_path) else "✗ 注释文件不存在，请检查路径")


In [None]:
# Read the UMI count matrix (first column is used as the row index)
print("正在读取UMI计数矩阵...")
counts_df = pd.read_csv(counts_path, sep='\t', index_col=0)
print(f"原始计数矩阵形状: {counts_df.shape} (基因 x 细胞)")

# Transpose into the cells x genes layout that scanpy expects
adata = sc.AnnData(counts_df.T)
print(f"AnnData对象形状: {adata.shape} (细胞 x 基因)")

# Read the per-cell annotation table
print("\n正在读取细胞注释信息...")
annotation_df = pd.read_csv(annotation_path, sep='\t', index_col=0)
print(f"注释信息形状: {annotation_df.shape}")
print(f"注释列名: {list(annotation_df.columns)}")

# The join below is a left join on the cell identifier, so cells missing from
# the annotation table would silently get NaN metadata. Report them up front,
# because the later group-bys on 'Sample' / 'Type' skip NaN keys.
unannotated_cells = adata.obs_names.difference(annotation_df.index)
if len(unannotated_cells) > 0:
    print(f"⚠️ {len(unannotated_cells)} 个细胞在注释文件中没有对应记录，其注释列将为缺失值")

# Attach the annotation columns to the AnnData object
adata.obs = adata.obs.join(annotation_df)

print("\nAnnData对象创建成功:")
print(adata)

# Show the annotation of the first few cells
print("\n前5个细胞的注释信息:")
print(adata.obs.head())


In [None]:
# --- Quality control (QC) ---

# Flag mitochondrial genes (names starting with 'MT-')
adata.var['mt'] = adata.var_names.str.startswith('MT-')
print(f"识别出 {adata.var['mt'].sum()} 个线粒体基因")

# Flag ribosomal genes (names starting with 'RPS' or 'RPL')
adata.var['ribo'] = adata.var_names.str.startswith(('RPS', 'RPL'))
print(f"识别出 {adata.var['ribo'].sum()} 个核糖体基因")

# Compute the QC metrics. A single call with both gene flags produces the general
# per-cell / per-gene metrics plus the mt and ribo columns, so the matrix is not
# scanned three times.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo'], percent_top=None, log1p=False, inplace=True)

print("\n质量控制指标计算完成")
print(f"QC指标列: {[col for col in adata.obs.columns if 'qc' in col.lower() or 'count' in col or 'gene' in col]}")

# Summary of the QC metrics
print("\n基本QC统计:")
print(f"细胞数: {adata.n_obs}")
print(f"基因数: {adata.n_vars}")
print(f"每个细胞平均基因数: {adata.obs['n_genes_by_counts'].mean():.1f}")
print(f"每个细胞平均UMI数: {adata.obs['total_counts'].mean():.1f}")
print(f"平均线粒体基因比例: {adata.obs['pct_counts_mt'].mean():.1f}%")


In [None]:
# Plot the QC metrics to help choose the filtering thresholds
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# One entry per panel: obs column, x-axis label, title, and the threshold
# markers to draw as (x position, legend label).
qc_panels = [
    ('n_genes_by_counts', '检测到的基因数', '每个细胞检测到的基因数分布',
     [(200, '最小阈值=200'), (5000, '最大阈值=5000')]),
    ('total_counts', '总UMI数', '每个细胞总UMI数分布',
     [(1000, '最小阈值=1000')]),
    ('pct_counts_mt', '线粒体基因比例 (%)', '线粒体基因比例分布',
     [(20, '最大阈值=20%')]),
]

for panel_ax, (obs_column, x_label, panel_title, cutoffs) in zip(axes, qc_panels):
    panel_ax.hist(adata.obs[obs_column], bins=50, alpha=0.7)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('细胞数')
    panel_ax.set_title(panel_title)
    for cutoff, cutoff_label in cutoffs:
        panel_ax.axvline(x=cutoff, color='red', linestyle='--', label=cutoff_label)
    panel_ax.legend()

plt.tight_layout()
qc_figure_path = os.path.join(output_dir, 'qc_metrics_before_filtering.png')
plt.savefig(qc_figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("QC指标可视化完成，请根据分布情况调整过滤阈值")


In [None]:
# Filter on the QC metrics
print("过滤前统计:")
print(f"细胞数: {adata.n_obs}")
print(f"基因数: {adata.n_vars}")

# Drop low-quality cells and rarely detected genes
sc.pp.filter_cells(adata, min_genes=200)  # cells with fewer than 200 detected genes
sc.pp.filter_genes(adata, min_cells=3)    # genes detected in fewer than 3 cells

# Per-cell criteria, combined into one mask:
#   - mitochondrial fraction below 20 %
#   - fewer than 5000 detected genes (higher counts may be doublets)
#   - more than 1000 UMIs in total
keep_cell = (
    (adata.obs['pct_counts_mt'] < 20)
    & (adata.obs['n_genes_by_counts'] < 5000)
    & (adata.obs['total_counts'] > 1000)
)

# .copy() turns the selection into a real AnnData object instead of a view, so
# the in-place normalisation steps that follow do not operate on a view.
adata = adata[keep_cell.to_numpy(), :].copy()

print("\n过滤后统计:")
print(f"细胞数: {adata.n_obs}")
print(f"基因数: {adata.n_vars}")

# Cells per sample and group; observed=True lists only combinations that occur
print("\n各组样本的细胞数:")
print(adata.obs.groupby(['Sample', 'Type'], observed=True).size())

print("\n质量控制和过滤完成！")


In [None]:
# --- Normalisation and feature selection ---

# Normalise every cell to the same total count, then apply log1p
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

print("数据标准化完成")

# Identify highly variable genes (HVGs)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

print(f"识别出 {adata.var.highly_variable.sum()} 个高变异基因")

# Plot the highly variable genes
sc.pl.highly_variable_genes(adata, save='_highly_variable_genes.png')

# Keep the normalised, log-transformed matrix of all genes in .raw
adata.raw = adata

# Continue with the highly variable genes only. .copy() materialises the subset:
# the scaling step that follows modifies the matrix in place and should not be
# handed a view.
adata = adata[:, adata.var.highly_variable].copy()

print(f"保留 {adata.n_vars} 个高变异基因进行后续分析")


In [None]:
# --- Batch-effect correction ---

# Start with PCA dimensionality reduction
sc.pp.scale(adata, max_value=10)  # scale each gene to zero mean and unit variance
sc.tl.pca(adata, svd_solver='arpack')

print("PCA降维完成")

# Plot the PCA results to check for batch effects
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50, save='_pca_variance.png')

# Distribution of the samples and groups in PCA space
sc.pl.pca(adata, color='Sample', save='_pca_by_sample.png')
sc.pl.pca(adata, color='Type', save='_pca_by_type.png')

print("PCA可视化完成，请检查是否存在明显的批次效应")


In [None]:
# Try Harmony for batch correction
try:
    import harmonypy as hm

    # Correct the PCA embedding, treating 'Sample' as the batch variable
    harmony_out = hm.run_harmony(adata.obsm['X_pca'], adata.obs, vars_use=['Sample'])

    # NOTE(review): harmonypy releases may differ in whether Z_corr is laid out as
    # (components x cells) or (cells x components) — confirm for the installed version.
    # .obsm needs one row per cell, so the array is oriented by its shape
    # rather than transposed unconditionally.
    corrected_pcs = np.asarray(harmony_out.Z_corr)
    if corrected_pcs.shape[0] != adata.n_obs:
        corrected_pcs = corrected_pcs.T
    adata.obsm['X_pca_harmony'] = corrected_pcs

    print("Harmony批次校正完成")

    # Plot the corrected embedding
    sc.pl.embedding(adata, basis='X_pca_harmony', color='Sample', save='_harmony_by_sample.png')
    sc.pl.embedding(adata, basis='X_pca_harmony', color='Type', save='_harmony_by_type.png')

    use_harmony = True

except ImportError:
    # harmonypy is optional: fall back to the uncorrected PCA
    print("Harmony未安装，跳过批次校正")
    print("如需使用Harmony，请安装: pip install harmonypy")
    use_harmony = False


In [None]:
# --- Dimensionality reduction and clustering ---

# Pick the representation the neighbour graph is built on
rep_key = 'X_pca_harmony' if use_harmony else 'X_pca'
print("使用Harmony校正后的PCA进行后续分析" if use_harmony else "使用原始PCA进行后续分析")

# Build the neighbour graph
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40, use_rep=rep_key)

# Cluster the cells with the Leiden algorithm; `resolution` controls how fine
# the clustering is
sc.tl.leiden(adata, resolution=0.5)

leiden_labels = adata.obs['leiden'].unique()
print(f"Leiden聚类完成，共识别出 {len(leiden_labels)} 个聚类")

# Non-linear UMAP embedding, used for plotting
sc.tl.umap(adata)

print("UMAP降维完成")

# Overview of the clusters next to sample and group
sc.pl.umap(
    adata,
    color=['leiden', 'Sample', 'Type'],
    save='_umap_overview.png',
    ncols=3,
)

print("聚类和降维分析完成！")


In [None]:
# --- Find marker genes ---

# Rank the marker genes of every Leiden cluster
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

print("标记基因计算完成")

# Plot the top marker genes
# NOTE(review): the save suffix says "heatmap", but the heatmap is the next call —
# consider renaming the output file.
sc.pl.rank_genes_groups(adata, n_genes=5, sharey=False, save='_marker_genes_heatmap.png')

# Expression heatmap of the marker genes
sc.pl.rank_genes_groups_heatmap(adata, n_genes=3, save='_marker_genes_expression_heatmap.png')

print("标记基因可视化完成")


In [None]:
# --- Annotate cells from known markers ---

# Canonical marker genes per cell type
marker_genes = {
    'Cancer_Cells': ['EPCAM', 'PAX8', 'KRT7', 'MUC16', 'WT1'],
    'T_Cells': ['CD3D', 'CD3E', 'CD8A', 'CD4', 'IL7R'],
    'B_Cells': ['MS4A1', 'CD19', 'CD79A', 'CD79B'],
    'Myeloid_Cells': ['CD68', 'CD14', 'CD163', 'C1QA', 'LYZ'],
    'Fibroblasts': ['DCN', 'COL1A1', 'COL3A1', 'LUM', 'VIM'],
    'Endothelial_Cells': ['PECAM1', 'VWF', 'CDH5', 'ENG'],
    'NK_Cells': ['GNLY', 'NKG7', 'PRF1', 'GZMB'],
    'Dendritic_Cells': ['FCER1A', 'CD1C', 'CLEC9A']
}

# Report which markers are present in adata.raw and, in the same pass, keep only
# the cell types that still have at least one usable marker.
print("检查标记基因可用性:")
filtered_marker_genes = {}
for cell_type, genes in marker_genes.items():
    available_genes = [g for g in genes if g in adata.raw.var_names]
    print(f"{cell_type}: {len(available_genes)}/{len(genes)} 个基因可用")
    if not available_genes:
        continue
    print(f"  可用基因: {', '.join(available_genes)}")
    filtered_marker_genes[cell_type] = available_genes

print(f"\n最终用于注释的细胞类型数: {len(filtered_marker_genes)}")


In [None]:
# Dot plot of the marker genes across the clusters
if filtered_marker_genes:
    sc.pl.dotplot(adata, filtered_marker_genes, groupby='leiden', 
                  save='_marker_gene_dotplot.png', use_raw=True)

    # Show a few key markers on the UMAP
    key_markers = ['EPCAM', 'CD3D', 'CD68', 'DCN', 'PECAM1']
    # Keep only the key markers that exist in adata.raw
    available_key_markers = [g for g in key_markers if g in adata.raw.var_names]

    if available_key_markers:
        sc.pl.umap(adata, color=available_key_markers, use_raw=True, 
                   save='_key_markers_umap.png', ncols=3)

    print("标记基因表达可视化完成")
else:
    print("未找到足够的标记基因进行可视化")

# Cell counts per cluster, split by group (missing combinations filled with 0)
cluster_stats = adata.obs.groupby(['leiden', 'Type']).size().unstack(fill_value=0)
print("\n各聚类在不同组中的细胞数分布:")
print(cluster_stats)


In [None]:
# 根据标记基因表达和聚类分布进行手动注释
# 这需要根据实际的点图和UMAP结果进行调整

# 示例注释（需要根据实际结果调整）
def annotate_clusters(adata, marker_genes=None, cluster_key='leiden'):
    """
    Assign a cell type to every cluster from its marker-gene expression.

    For each cluster, the expression of each cell type's markers is read from
    ``adata.raw`` and averaged over all of the cluster's cells and all of that
    type's markers; the cluster gets the cell type with the highest average.
    This is a first-pass heuristic and should be checked against the dot plot
    and UMAP.

    Parameters
    ----------
    adata
        Annotated data object with cluster labels in ``adata.obs[cluster_key]``
        and expression values in ``adata.raw``.
    marker_genes
        Mapping of cell type -> list of marker genes present in ``adata.raw``.
        Defaults to the notebook-level ``filtered_marker_genes``, which is what
        the function used before this parameter existed.
    cluster_key
        Name of the ``adata.obs`` column holding the cluster labels.

    Returns
    -------
    dict
        Cluster label -> best-scoring cell type, or ``'Unknown'`` when no cell
        type has any marker genes.
    """
    if marker_genes is None:
        # Backward-compatible default: the dictionary built earlier in the notebook
        marker_genes = filtered_marker_genes

    cluster_annotations = {}

    for cluster in adata.obs[cluster_key].unique():
        cluster_cells = adata[adata.obs[cluster_key] == cluster]

        # Mean marker expression per candidate cell type
        scores = {}
        for cell_type, genes in marker_genes.items():
            if not genes:  # skip cell types without usable markers
                continue
            expr_data = cluster_cells.raw[:, genes].X
            if hasattr(expr_data, 'toarray'):  # sparse matrix -> dense array
                expr_data = expr_data.toarray()
            scores[cell_type] = np.mean(expr_data)

        # Highest-scoring cell type wins
        cluster_annotations[cluster] = max(scores, key=scores.get) if scores else 'Unknown'

    return cluster_annotations

# Run the automatic annotation
cluster_annotations = annotate_clusters(adata)

# List every cluster with its assigned cell type and its size
print("聚类注释结果:")
for cluster, annotation in cluster_annotations.items():
    in_cluster = adata.obs['leiden'] == cluster
    cell_count = int(in_cluster.sum())
    print(f"聚类 {cluster}: {annotation} ({cell_count} 个细胞)")

# Store the annotation on the AnnData object
adata.obs['cell_type_auto'] = adata.obs['leiden'].map(cluster_annotations)

# Show the annotation on the UMAP
sc.pl.umap(
    adata,
    color='cell_type_auto',
    legend_loc='on data',
    save='_cell_type_annotation.png',
)

print("\n自动细胞类型注释完成！")


In [None]:
# --- Cell-composition differences ---

# Cell-type proportions per sample and group.
# observed=True keeps only the key combinations that actually occur. If 'Type'
# and 'Sample' have become categorical columns, the default would add one
# all-zero row for every impossible Type x Sample pair, and the division below
# would turn those rows into NaN.
composition = (
    adata.obs
    .groupby(['Type', 'Sample', 'cell_type_auto'], observed=True)
    .size()
    .unstack(fill_value=0)
)
composition_proportions = composition.div(composition.sum(axis=1), axis=0)

print("细胞类型组成比例:")
print(composition_proportions)

# Cell-type proportions pooled per group
group_composition = (
    adata.obs
    .groupby(['Type', 'cell_type_auto'], observed=True)
    .size()
    .unstack(fill_value=0)
)
group_proportions = group_composition.div(group_composition.sum(axis=1), axis=0)

print("\n各组细胞类型平均比例:")
print(group_proportions)

# Plot the composition differences
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Per sample
composition_proportions.plot(kind='bar', stacked=True, ax=axes[0])
axes[0].set_title('各样本细胞类型组成')
axes[0].set_ylabel('比例')
axes[0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Per group
group_proportions.plot(kind='bar', ax=axes[1])
axes[1].set_title('耐药组 vs 敏感组细胞类型比例')
axes[1].set_ylabel('比例')
axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Tick labels are derived from the plotted index rather than hard-coded by
# position, so they stay correct whatever groups are present; unknown group
# names are shown as they are.
type_display_names = {'refractory': '耐药组', 'sensitive': '敏感组'}
axes[1].set_xticklabels(
    [type_display_names.get(str(group), str(group)) for group in group_proportions.index],
    rotation=0,
)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'cell_composition_analysis.png'), 
            dpi=300, bbox_inches='tight')
plt.show()

print("细胞组成差异分析完成")


In [None]:
# --- Differential expression (DEG) analysis ---

# Focus on differential expression within the cancer cells
cancer_cells = adata[adata.obs['cell_type_auto'] == 'Cancer_Cells'].copy()

if cancer_cells.n_obs > 0:
    print(f"抽取出 {cancer_cells.n_obs} 个癌细胞用于差异分析")

    # Cells per group
    type_counts = cancer_cells.obs['Type'].value_counts()
    print(f"癌细胞分组情况:")
    print(type_counts)

    # Require at least two groups with at least 10 cells each
    if len(type_counts) >= 2 and type_counts.min() >= 10:
        # Refractory vs. sensitive, Wilcoxon rank-sum test
        sc.tl.rank_genes_groups(cancer_cells, 'Type', 
                               groups=['refractory'], 
                               reference='sensitive', 
                               method='wilcoxon')

        print("癌细胞差异表达分析完成")

        # Full result table (all tested genes)
        all_deg = sc.get.rank_genes_groups_df(cancer_cells, group='refractory')

        # Volcano plot. scanpy has no `rank_genes_groups_volcano` plotting
        # function, so the plot is drawn from the result table and saved next to
        # the other matplotlib figures in output_dir.
        adjusted_p = all_deg['pvals_adj'].to_numpy(dtype=float)
        # Clip so that p-values reported as exactly 0 do not become infinite
        neg_log10_p = -np.log10(np.clip(adjusted_p, 1e-300, None))
        log_fold_change = all_deg['logfoldchanges'].to_numpy(dtype=float)
        is_significant = adjusted_p < 0.05

        volcano_fig, volcano_ax = plt.subplots(figsize=(8, 6))
        volcano_ax.scatter(log_fold_change[~is_significant], neg_log10_p[~is_significant],
                           s=4, color='lightgray', label='pvals_adj >= 0.05')
        volcano_ax.scatter(log_fold_change[is_significant], neg_log10_p[is_significant],
                           s=4, color='crimson', label='pvals_adj < 0.05')
        volcano_ax.axhline(-np.log10(0.05), color='black', linestyle='--', linewidth=0.8)
        volcano_ax.set_xlabel('logfoldchanges (refractory vs sensitive)')
        volcano_ax.set_ylabel('-log10(pvals_adj)')
        volcano_ax.set_title('Cancer cells: refractory vs sensitive')
        volcano_ax.legend()
        volcano_fig.savefig(os.path.join(output_dir, 'cancer_deg_volcano.png'),
                            dpi=300, bbox_inches='tight')
        plt.show()

        # Keep only the significant genes
        deg_results = all_deg[all_deg['pvals_adj'] < 0.05]

        print(f"\n显著差异表达基因数量: {len(deg_results)}")
        print("Top 10 上调基因:")
        print(deg_results.head(10)[['names', 'logfoldchanges', 'pvals_adj']])

        # Save the significant genes
        deg_results.to_csv(os.path.join(output_dir, 'cancer_cells_deg_results.csv'), index=False)

        # Violin plots of the top genes that are available in .raw
        top_genes = deg_results.head(10)['names'].tolist()
        available_top_genes = [g for g in top_genes if g in cancer_cells.raw.var_names]

        if available_top_genes:
            sc.pl.violin(cancer_cells, keys=available_top_genes[:6], 
                        groupby='Type', use_raw=True, 
                        save='_top_deg_violin.png')
    else:
        print("癌细胞数量不足，无法进行差异分析")
        deg_results = pd.DataFrame()
else:
    print("未找到癌细胞，跳过差异分析")
    deg_results = pd.DataFrame()


In [None]:
# --- Pathway enrichment analysis (GSEA) ---

if not deg_results.empty:
    try:
        import gseapy as gp

        # Pre-ranked GSEA is meant to run on the ranking of all tested genes.
        # `deg_results` holds only the significant ones, so the complete table is
        # read back from the rank_genes_groups result stored on `cancer_cells`.
        full_ranking = sc.get.rank_genes_groups_df(cancer_cells, group='refractory')
        gene_list = (
            full_ranking[['names', 'logfoldchanges']]
            .replace([np.inf, -np.inf], np.nan)   # non-finite fold changes cannot be ranked
            .dropna()
            .drop_duplicates(subset='names')      # the ranking needs unique gene names
            .set_index('names')['logfoldchanges']
            .sort_values(ascending=False)
        )

        print(f"用于GSEA的基因数量: {len(gene_list)}")

        # Run GSEA
        gsea_results = gp.prerank(
            rnk=gene_list,
            gene_sets=['GO_Biological_Process_2021', 'KEGG_2021_Human'],
            threads=4,
            min_size=5,
            max_size=500,
            permutation_num=100,  # use 1000 for a real analysis
            outdir=os.path.join(output_dir, 'gsea_results'),
            seed=42,
            verbose=True
        )

        print("GSEA分析完成")

        # Top enriched pathways. The FDR column is coerced to numbers first so
        # the threshold comparison does not depend on how the column is stored.
        gsea_df = gsea_results.res2d
        fdr_values = pd.to_numeric(gsea_df['FDR q-val'], errors='coerce')
        significant_pathways = gsea_df[fdr_values < 0.05]

        print(f"\n显著富集通路数量: {len(significant_pathways)}")

        if len(significant_pathways) > 0:
            print("Top 10 富集通路:")
            print(significant_pathways.head(10)[['Term', 'ES', 'NES', 'FDR q-val']])

            # Look for type-I interferon and hypoxia related pathways
            ifn_pathways = significant_pathways[
                significant_pathways['Term'].str.contains('interferon|IFN', case=False, na=False)
            ]
            hypoxia_pathways = significant_pathways[
                significant_pathways['Term'].str.contains('hypoxia|oxygen', case=False, na=False)
            ]

            print(f"\n干扰素相关通路数量: {len(ifn_pathways)}")
            if len(ifn_pathways) > 0:
                print("干扰素相关通路:")
                print(ifn_pathways[['Term', 'ES', 'NES', 'FDR q-val']])

            print(f"\n缺氧相关通路数量: {len(hypoxia_pathways)}")
            if len(hypoxia_pathways) > 0:
                print("缺氧相关通路:")
                print(hypoxia_pathways[['Term', 'ES', 'NES', 'FDR q-val']])

        # Save the complete result table
        gsea_df.to_csv(os.path.join(output_dir, 'gsea_all_results.csv'), index=False)

    except ImportError:
        print("gseapy未安装，跳过GSEA分析")
        print("如需进行通路富集分析，请安装: pip install gseapy")
    except Exception as e:
        # Best effort: a failed GSEA run is reported, not raised
        print(f"GSEA分析出错: {e}")
else:
    print("无差异表达基因，跳过GSEA分析")


In [None]:
# --- SMARCA2 expression ---

# Check whether SMARCA2 is present in the data
if 'SMARCA2' in adata.raw.var_names:
    print("✓ SMARCA2基因在数据中存在")

    # SMARCA2 expression on the UMAP
    sc.pl.umap(adata, color='SMARCA2', use_raw=True, 
               title='SMARCA2 Expression', save='_smarca2_umap.png')

    # SMARCA2 expression per cell type
    sc.pl.violin(adata, keys='SMARCA2', groupby='cell_type_auto', 
                 rotation=90, use_raw=True, save='_smarca2_violin_celltype.png')

    # SMARCA2 expression per group
    sc.pl.violin(adata, keys='SMARCA2', groupby='Type', 
                 use_raw=True, save='_smarca2_violin_type.png')

    # Per-cell SMARCA2 expression as a 1-D vector. The flattening is applied to
    # dense and sparse matrices alike; previously only the sparse branch
    # flattened, leaving a 2-D array for dense data.
    smarca2_expr = adata.raw[:, 'SMARCA2'].X
    if hasattr(smarca2_expr, 'toarray'):
        smarca2_expr = smarca2_expr.toarray()
    smarca2_expr = np.asarray(smarca2_expr).ravel()

    # Store the expression in obs
    adata.obs['SMARCA2_expr'] = smarca2_expr

    # Statistics per group
    smarca2_stats = adata.obs.groupby('Type')['SMARCA2_expr'].agg(['mean', 'median', 'std'])
    print("\nSMARCA2表达统计:")
    print(smarca2_stats)

    # Statistics per cell type
    smarca2_celltype_stats = adata.obs.groupby('cell_type_auto')['SMARCA2_expr'].agg(['mean', 'median', 'std'])
    print("\nSMARCA2在不同细胞类型中的表达统计:")
    print(smarca2_celltype_stats)

else:
    print("✗ SMARCA2基因不在数据中")
    print("可能的原因：1) 基因名称不同 2) 基因在质控中被过滤")

    # Look for SMARCA2 under its alternative names
    possible_names = ['SMARCA2', 'BRM', 'BAF190B', 'SNF2L2']
    found_genes = [name for name in possible_names if name in adata.raw.var_names]

    if found_genes:
        print(f"找到可能相关的基因: {found_genes}")
        # Use the first match for the analysis
        target_gene = found_genes[0]
        print(f"使用 {target_gene} 进行分析")

        # Only the UMAP is repeated for the alternative name
        sc.pl.umap(adata, color=target_gene, use_raw=True, 
                   title=f'{target_gene} Expression', save=f'_{target_gene.lower()}_umap.png')
    else:
        print("未找到SMARCA2相关基因")


In [None]:
# --- SMARCA2 expression differences within the cancer cells ---

# Imported at the top of the cell so `stats` exists whichever branch runs below
from scipy import stats

# Pick the target gene: the first of these names that is present in adata.raw
possible_names = ['SMARCA2', 'BRM', 'BAF190B', 'SNF2L2']
target_gene = next((name for name in possible_names if name in adata.raw.var_names), None)

if target_gene and cancer_cells.n_obs > 0:
    print(f"分析 {target_gene} 在癌细胞中的表达差异")

    # Per-cell expression as a 1-D vector, flattened for dense and sparse alike
    cancer_smarca2_expr = cancer_cells.raw[:, target_gene].X
    if hasattr(cancer_smarca2_expr, 'toarray'):
        cancer_smarca2_expr = cancer_smarca2_expr.toarray()
    cancer_smarca2_expr = np.asarray(cancer_smarca2_expr).ravel()

    cancer_cells.obs[f'{target_gene}_expr'] = cancer_smarca2_expr

    # Descriptive statistics per group
    cancer_smarca2_stats = cancer_cells.obs.groupby('Type')[f'{target_gene}_expr'].agg(['mean', 'median', 'std', 'count'])
    print(f"\n{target_gene}在癌细胞中的表达统计:")
    print(cancer_smarca2_stats)

    # Violin plot
    sc.pl.violin(cancer_cells, keys=target_gene, groupby='Type', 
                 use_raw=True, save=f'_{target_gene.lower()}_cancer_violin.png')

    # Box plot. scanpy has no `sc.pl.boxplot`, so seaborn draws it from the obs
    # column created above; the figure is saved to output_dir.
    box_fig, box_ax = plt.subplots(figsize=(6, 5))
    sns.boxplot(data=cancer_cells.obs, x='Type', y=f'{target_gene}_expr', ax=box_ax)
    box_ax.set_title(f'{target_gene} expression in cancer cells')
    box_fig.savefig(os.path.join(output_dir, f'{target_gene.lower()}_cancer_boxplot.png'),
                    dpi=300, bbox_inches='tight')
    plt.show()

    # Significance test between the two groups
    refractory_expr = cancer_cells.obs[cancer_cells.obs['Type'] == 'refractory'][f'{target_gene}_expr']
    sensitive_expr = cancer_cells.obs[cancer_cells.obs['Type'] == 'sensitive'][f'{target_gene}_expr']

    if len(refractory_expr) > 0 and len(sensitive_expr) > 0:
        # Two-sided Mann-Whitney U test
        statistic, p_value = stats.mannwhitneyu(refractory_expr, sensitive_expr, alternative='two-sided')

        print(f"\n{target_gene}表达差异统计检验:")
        print(f"Mann-Whitney U统计量: {statistic}")
        print(f"P值: {p_value:.6f}")

        if p_value < 0.05:
            print("结论: 两组间存在显著差异")
        else:
            print("结论: 两组间无显著差异")

    print(f"\n{target_gene}在癌细胞中的分析完成")
else:
    print("无法进行SMARCA2在癌细胞中的分析")


In [None]:
# --- SMARCA2 co-expression analysis ---

if target_gene and cancer_cells.n_obs > 0:
    print(f"分析与 {target_gene} 共表达的基因")

    # Dense expression matrix of the cancer cells, taken from .raw
    cancer_expr_matrix = cancer_cells.raw.X
    if hasattr(cancer_expr_matrix, 'toarray'):
        cancer_expr_matrix = cancer_expr_matrix.toarray()
    cancer_expr_matrix = np.asarray(cancer_expr_matrix)

    # Expression of the target gene
    raw_gene_names = np.asarray(cancer_cells.raw.var_names)
    target_gene_idx = list(cancer_cells.raw.var_names).index(target_gene)
    target_expr = cancer_expr_matrix[:, target_gene_idx].astype(float)

    # Pearson correlation of the target with every gene in one vectorised step:
    #   r = sum(x * (t - mean(t))) / (n * std(x) * std(t))
    # Genes with constant expression have std 0 and get NaN.
    n_cells = cancer_expr_matrix.shape[0]
    target_centered = target_expr - target_expr.mean()
    covariance_sums = cancer_expr_matrix.T @ target_centered
    gene_std = cancer_expr_matrix.std(axis=0)
    with np.errstate(divide='ignore', invalid='ignore'):
        all_correlations = covariance_sums / (n_cells * gene_std * target_expr.std())
    all_correlations = np.clip(all_correlations, -1.0, 1.0)  # guard against rounding past +/-1

    # Leave the target gene itself out of the result
    is_other_gene = raw_gene_names != target_gene
    gene_names = raw_gene_names[is_other_gene].tolist()
    correlations = all_correlations[is_other_gene].tolist()

    # Result table
    corr_df = pd.DataFrame({
        'gene': gene_names,
        'correlation': correlations
    })

    # Sort by absolute correlation
    corr_df['abs_correlation'] = corr_df['correlation'].abs()
    corr_df = corr_df.sort_values('abs_correlation', ascending=False)

    print(f"\n与{target_gene}表达最相关的前20个基因:")
    print(corr_df.head(20)[['gene', 'correlation']])

    # Save the co-expression table
    corr_df.to_csv(os.path.join(output_dir, f'{target_gene}_coexpression_analysis.csv'), index=False)

    # Strongest positive and strongest negative partners, selected on the signed
    # correlation. The table is sorted by |r|, so its head mixes both signs and
    # its tail holds the weakest correlations rather than the negative ones.
    top_pos_genes = corr_df[corr_df['correlation'] > 0].nlargest(5, 'correlation')['gene'].tolist()
    top_neg_genes = corr_df[corr_df['correlation'] < 0].nsmallest(5, 'correlation')['gene'].tolist()

    # Keep only the genes that are among the highly variable genes
    available_pos_genes = [g for g in top_pos_genes if g in cancer_cells.var_names]
    available_neg_genes = [g for g in top_neg_genes if g in cancer_cells.var_names]

    if available_pos_genes:
        print(f"\n可视化与{target_gene}正相关的基因: {available_pos_genes}")
        sc.pl.umap(cancer_cells, color=[target_gene] + available_pos_genes[:3], 
                   use_raw=True, save=f'_{target_gene.lower()}_positive_coexpression.png')

    if available_neg_genes:
        print(f"\n可视化与{target_gene}负相关的基因: {available_neg_genes}")
        sc.pl.umap(cancer_cells, color=[target_gene] + available_neg_genes[:3], 
                   use_raw=True, save=f'_{target_gene.lower()}_negative_coexpression.png')

    # Split the cancer cells at the median expression of the target gene.
    # The expression column is (re)written from this cell's own vector, so the
    # split does not rely on an obs column created by another cell.
    cancer_cells.obs[f'{target_gene}_expr'] = target_expr
    smarca2_median = np.median(target_expr)
    cancer_cells.obs[f'{target_gene}_high'] = cancer_cells.obs[f'{target_gene}_expr'] > smarca2_median
    cancer_cells.obs[f'{target_gene}_group'] = cancer_cells.obs[f'{target_gene}_high'].map({True: 'High', False: 'Low'})

    print(f"\n基于{target_gene}表达水平的细胞分组:")
    print(cancer_cells.obs[f'{target_gene}_group'].value_counts())

    # Show the high / low groups on the UMAP
    sc.pl.umap(cancer_cells, color=f'{target_gene}_group', 
               save=f'_{target_gene.lower()}_high_low_groups.png')

    print(f"\n{target_gene}共表达分析完成")
else:
    print("无法进行共表达分析")


In [None]:
# --- Define and export cell states ---
# Writes the Geneformer input files into <output_dir>/geneformer_input:
#   1. the differential-expression gene list,
#   2. the mean expression profile of sensitive cancer cells (baseline state),
#   3. the mean expression profile of refractory cancer cells (target state),
#   4. the target-gene co-expression table, when it has been computed.

print("为Geneformer模拟实验准备输入文件...")

# Create the Geneformer output directory (no error if it already exists)
geneformer_dir = os.path.join(output_dir, 'geneformer_input')
os.makedirs(geneformer_dir, exist_ok=True)

if cancer_cells.n_obs > 0:
    # 1. Chemotherapy-sensitive cancer cells
    sensitive_cancer_cells = cancer_cells[cancer_cells.obs['Type'] == 'sensitive'].copy()
    
    # 2. Chemotherapy-refractory cancer cells
    refractory_cancer_cells = cancer_cells[cancer_cells.obs['Type'] == 'refractory'].copy()
    
    print(f"敏感癌细胞数量: {sensitive_cancer_cells.n_obs}")
    print(f"耐药癌细胞数量: {refractory_cancer_cells.n_obs}")
    
    # File 1: differential-expression gene list (used to evaluate the simulation).
    # This section previously had inconsistent indentation, which made the whole
    # cell fail with an IndentationError before anything ran.
    if not deg_results.empty:
        deg_list_path = os.path.join(geneformer_dir, 'deg_gene_list.csv')
        deg_results_for_geneformer = deg_results[['names', 'logfoldchanges', 'pvals_adj']].copy()
        deg_results_for_geneformer.columns = ['gene', 'log2fc', 'padj']
        deg_results_for_geneformer.to_csv(deg_list_path, index=False)
        print(f"差异基因列表已保存到: {deg_list_path}")
    
    # File 2: mean expression profile of sensitive cancer cells (baseline state)
    if sensitive_cancer_cells.n_obs > 0:
        baseline_expression = np.mean(sensitive_cancer_cells.raw.X, axis=0)
        # A matrix-typed result exposes .A1 to flatten it to a 1-D array
        if hasattr(baseline_expression, 'A1'):
            baseline_expression = baseline_expression.A1
        
        baseline_df = pd.DataFrame({
            'gene': sensitive_cancer_cells.raw.var_names,
            'mean_expression': baseline_expression
        })
        baseline_df = baseline_df.sort_values('mean_expression', ascending=False)
        
        baseline_path = os.path.join(geneformer_dir, 'baseline_sensitive_cancer_cells.csv')
        baseline_df.to_csv(baseline_path, index=False)
        print(f"基线表达谱已保存到: {baseline_path}")
    
    # File 3: mean expression profile of refractory cancer cells (target state)
    if refractory_cancer_cells.n_obs > 0:
        target_expression = np.mean(refractory_cancer_cells.raw.X, axis=0)
        # A matrix-typed result exposes .A1 to flatten it to a 1-D array
        if hasattr(target_expression, 'A1'):
            target_expression = target_expression.A1
        
        target_df = pd.DataFrame({
            'gene': refractory_cancer_cells.raw.var_names,
            'mean_expression': target_expression
        })
        target_df = target_df.sort_values('mean_expression', ascending=False)
        
        target_path = os.path.join(geneformer_dir, 'target_refractory_cancer_cells.csv')
        target_df.to_csv(target_path, index=False)
        print(f"目标表达谱已保存到: {target_path}")
    
    # File 4: target-gene co-expression table, only if corr_df exists in this namespace
    if target_gene and 'corr_df' in locals():
        coexpr_path = os.path.join(geneformer_dir, f'{target_gene}_coexpression_network.csv')
        corr_df.to_csv(coexpr_path, index=False)
        print(f"{target_gene}共表达网络已保存到: {coexpr_path}")
    
    print(f"\n所有Geneformer输入文件已保存到: {geneformer_dir}")
else:
    print("无癌细胞数据，无法生成Geneformer输入文件")


In [None]:
# --- Build the analysis summary report ---
# Gathers the headline numbers first, then renders them into a text report that
# is written to <output_dir>/analysis_summary_report.txt and echoed to stdout.

print("生成分析总结报告...")

# Sample counts, overall and per response group
sample_column = adata.obs['Sample']
n_samples_total = len(sample_column.unique())
n_samples_refractory = len(sample_column[adata.obs['Type'] == 'refractory'].unique())
n_samples_sensitive = len(sample_column[adata.obs['Type'] == 'sensitive'].unique())

# Cell-type table rendered as plain text
cell_type_table = adata.obs['cell_type_auto'].value_counts().to_string()

# Cancer-cell counts; all zero when there are no cancer cells
if cancer_cells.n_obs > 0:
    n_cancer_total = cancer_cells.n_obs
    n_cancer_refractory = len(cancer_cells.obs[cancer_cells.obs['Type'] == 'refractory'])
    n_cancer_sensitive = len(cancer_cells.obs[cancer_cells.obs['Type'] == 'sensitive'])
else:
    n_cancer_total = 0
    n_cancer_refractory = 0
    n_cancer_sensitive = 0

# Number of rows in the differential-expression table (0 when it is empty)
n_deg = 0 if deg_results.empty else len(deg_results)

# Label shown for the target gene, with a fallback when it was not found
target_gene_label = target_gene or 'SMARCA2未找到'

report_sections = [f"""
# 卵巢癌化疗耐药性单细胞RNA-seq分析总结报告

## 数据概况
- 总细胞数: {adata.n_obs}
- 总基因数: {adata.n_vars}
- 样本数: {n_samples_total}
- 耐药组样本: {n_samples_refractory}
- 敏感组样本: {n_samples_sensitive}

## 细胞类型分布
{cell_type_table}

## 癌细胞分析结果
- 癌细胞总数: {n_cancer_total}
- 耐药组癌细胞: {n_cancer_refractory}
- 敏感组癌细胞: {n_cancer_sensitive}

## 差异表达基因分析
- 显著差异基因数: {n_deg}

## SMARCA2分析结果
- 目标基因: {target_gene_label}
"""]

# Per-group mean expression of the target gene, only when it can be computed
if target_gene and cancer_cells.n_obs > 0:
    expr_column = f'{target_gene}_expr'
    refractory_mean = cancer_cells.obs.loc[cancer_cells.obs['Type'] == 'refractory', expr_column].mean()
    sensitive_mean = cancer_cells.obs.loc[cancer_cells.obs['Type'] == 'sensitive', expr_column].mean()
    report_sections.append(f"""
- {target_gene}在耐药组平均表达: {refractory_mean:.4f}
- {target_gene}在敏感组平均表达: {sensitive_mean:.4f}
""")

# Output locations, suggested next steps and a completion timestamp
finished_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
report_sections.append(f"""

## 输出文件
- 主要结果保存在: {output_dir}
- Geneformer输入文件保存在: {geneformer_dir}
- 处理后的AnnData对象将保存为: processed_adata.h5ad

## 下一步建议
1. 根据可视化结果调整细胞类型注释
2. 深入分析感兴趣的差异表达基因
3. 使用生成的输入文件进行Geneformer模拟
4. 验证关键发现的生物学意义

分析完成时间: {finished_at}
""")

summary_report = "".join(report_sections)

# Write the report to disk as UTF-8
report_path = os.path.join(output_dir, 'analysis_summary_report.txt')
with open(report_path, 'w', encoding='utf-8') as report_file:
    report_file.write(summary_report)

print(f"分析总结报告已保存到: {report_path}")
print(summary_report)


In [None]:
# --- Save the final AnnData objects ---
# Writes the full processed dataset (and the cancer-cell subset, when non-empty)
# to .h5ad files in output_dir, then prints a closing summary of the outputs.

print("保存最终处理的数据...")

# Full processed dataset, saved for quick reloading later
final_adata_path = os.path.join(output_dir, 'processed_adata.h5ad')
adata.write(final_adata_path)

print(f"最终处理好的AnnData对象已保存到: {final_adata_path}")

# Cancer-cell subset, only when there is something to save
if cancer_cells.n_obs > 0:
    cancer_adata_path = os.path.join(output_dir, 'cancer_cells_adata.h5ad')
    cancer_cells.write(cancer_adata_path)
    print(f"癌细胞子集数据已保存到: {cancer_adata_path}")

# Closing banner: output locations followed by suggested follow-ups
separator = "=" * 60
closing_lines = [
    "\n🎉 所有分析步骤完成！",
    separator,
    "主要输出文件:",
    f"1. 完整分析数据: {final_adata_path}",
    f"2. 分析报告: {report_path}",
    f"3. Geneformer输入文件: {geneformer_dir}",
    f"4. 各类图表: {output_dir}",
    separator,
    "\n现在您可以:",
    "1. 查看生成的图表和分析结果",
    "2. 根据需要调整参数重新运行特定步骤",
    "3. 使用Geneformer输入文件进行BRM敲除模拟",
    "4. 进一步验证关键发现",
]
print("\n".join(closing_lines))
