In [2]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

# combined_data_file = "a549_perturbation_analysis_loka/a549_combined_data_loka.h5ad"
combined_data_file = "../a549_combined_data.h5ad"
combined = sc.read_h5ad(combined_data_file)
sc.pp.filter_cells(combined, min_genes=1000)
sc.pp.filter_genes(combined, min_cells=30000)
print(f"Combined data shape: {combined.shape}")

combined = combined[combined.obs['perturbation'] != 'unsure']
combined, len(combined)

Combined data shape: (219853, 10607)


(View of AnnData object with n_obs × n_vars = 218594 × 10607
     obs: 'gene_count', 'umi_count', 'SRX_accession', 'sample_id', 'batch', 'n_genes', 'tissue', 'disease', 'perturbation', 'cell_line'
     var: 'gene_symbols', 'feature_types', 'n_cells',
 218594)

In [3]:
# 显示数据的前10行
combined.obs.head(10)

Unnamed: 0,gene_count,umi_count,SRX_accession,sample_id,batch,n_genes,tissue,disease,perturbation,cell_line
AAACCTGAGCACAGGT-0,4366,18578.0,SRX17915870,SRX17915870,0,4366,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAACCTGAGGTGCAAC-0,5795,28615.0,SRX17915870,SRX17915870,0,5795,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAACCTGAGTTGTAGA-0,4690,18794.0,SRX17915870,SRX17915870,0,4690,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAACCTGTCGTGGTCG-0,3667,11265.0,SRX17915870,SRX17915870,0,3667,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAACGGGCAACTTGAC-0,5587,23267.0,SRX17915870,SRX17915870,0,5587,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAACGGGGTACATCCA-0,1051,1484.0,SRX17915870,SRX17915870,0,1051,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAACGGGGTCTAACGT-0,3869,13169.0,SRX17915870,SRX17915870,0,3869,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAACGGGGTTGAGTTC-0,3998,12751.0,SRX17915870,SRX17915870,0,3998,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAAGATGAGTATTGGA-0,5415,24334.0,SRX17915870,SRX17915870,0,5415,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"
AAAGATGCATTAGGCT-0,4678,18168.0,SRX17915870,SRX17915870,0,4678,other,influenza A virus (A/WSN/1933(H1N1)),wild-type (WT) virus,"A549 cells, MDCK-SIAT1 cells"


In [4]:
combined.var.head(10)

Unnamed: 0,gene_symbols,feature_types,n_cells
ENSG00000237491,LINC01409,Gene Expression,51433
ENSG00000228794,LINC01128,Gene Expression,35955
ENSG00000187634,SAMD11,Gene Expression,55968
ENSG00000188976,NOC2L,Gene Expression,95817
ENSG00000188290,HES4,Gene Expression,60959
ENSG00000187608,ISG15,Gene Expression,83315
ENSG00000188157,AGRN,Gene Expression,69326
ENSG00000131591,C1orf159,Gene Expression,57982
ENSG00000078808,SDF4,Gene Expression,122528
ENSG00000176022,B3GALT6,Gene Expression,58401


In [5]:
drugs = combined.obs['perturbation'].unique().tolist()
drugs, len(drugs)

(['wild-type (WT) virus',
  'irradiated A549 cells (6 Gy γ-ray treatment)',
  'Infected (Cal07, 16 hours, Replicate 2)',
  'CAR T cell therapy with SUV39H1 knockout',
  'infected with H3N2 (A/Perth/16/2009)',
  '8 hours post infection',
  'irradiation',
  'Ritonavir, gemcitabine, cisplatin',
  'glyconanomaterials for combating bacterial infections',
  'uninfected (mock treatment)',
  'NS1 4xstop (mutant)',
  'infected with H3N2, Cetuximab (anti-EGFR therapy)',
  'in vitro culture of iPSC-RPE',
  'mixed sample treatments include DMSO, ARS-1620, and Vemurafenib',
  '6 Gy γ-ray treatment',
  'CAR T cell therapy, SUV39H1 knockout, CD19 tumor cell treatment',
  'antineoplastic agents, mixed micelles for drug delivery',
  'irradiation (IR 6h)',
  'Infected with H3N2 (A/Perth/16/2009), 16 hours post infection',
  'uninfected (Mock)',
  'infected with H3N2 (A/Perth/16/2009), 16 hours post infection',
  'ACME HS dissociation',
  'CAR T cell therapy, SUV39H1 knockout',
  'lentiviral pool for exp

In [6]:
drugs = [
  'glyconanomaterials for combating bacterial infections',
  'CAR T cell therapy, SUV39H1 knockout',
  'CAR T cell therapy with SUV39H1 knockout',
  'CAR T cell therapy, SUV39H1 knockout, CD19 tumor cell treatment', 
  'in vitro culture of iPSC-RPE',
  'irradiated A549 cells (6 Gy γ-ray treatment)']

In [7]:
results_dir = "a549_perturbation_analysis_loka"
fig_dir = os.path.join(results_dir, "figures_new_data/t_test")
os.makedirs(fig_dir, exist_ok=True) 

In [9]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time 
import pandas as pd 

# 对每种药物进行差异基因分析并绘制火山图
for drug in drugs:
    if drug == 'control':
        continue
    
    print(f"Processing {drug}...")
    
    # 运行Wilcoxon rank-sum test

    start_time = time.time()
    sc.tl.rank_genes_groups(combined, 'perturbation', groups=[drug], reference='control', 
                           method='t-test', key_added=f"{drug}_vs_control")
    end_time = time.time()
    print(f"Time taken for t-test test: {end_time - start_time} seconds")
    
    # 提取结果并保存为DataFrame
    start_time = time.time()
    result = sc.get.rank_genes_groups_df(combined, group=drug, key=f"{drug}_vs_control")
    end_time = time.time()
    print(f"Time taken for get results: {end_time - start_time} seconds")

    # 处理pvals和pvals_adj，将小于1e-20的值设置为0
    result['pvals'] = result['pvals'].where(result['pvals'] >= 1e-40, 0)
    result['pvals_adj'] = result['pvals_adj'].where(result['pvals_adj'] >= 1e-40, 0)
    
    # # 保存所有p值和log2倍数变化
    drug = drug.strip().replace('/', '_')
    result.to_csv(f"{fig_dir}/{drug}.csv")
    # result = pd.read_csv(f"{fig_dir}/{drug}.csv")
    
    # 创建火山图数据
    volcano_data = pd.DataFrame({
        'gene': result['names'],
        'log2fc': result['logfoldchanges'],
        'pval': -np.log10(result['pvals']),
        'padj': -np.log10(result['pvals_adj'])
    })
    
    # 添加显著性标记
    threshold = 1e-10
    volcano_data['significant'] = 'Not Significant'
    volcano_data.loc[(volcano_data['padj'] > -np.log10(threshold)) & (abs(volcano_data['log2fc']) > 1), 'significant'] = 'Significant'
    volcano_data.loc[(volcano_data['padj'] > -np.log10(threshold)) & (volcano_data['log2fc'] > 1), 'significant'] = 'Up-regulated'
    volcano_data.loc[(volcano_data['padj'] > -np.log10(threshold)) & (volcano_data['log2fc'] < -1), 'significant'] = 'Down-regulated'
    
    # 绘制火山图
    plt.figure(figsize=(5, 4))
    sns.scatterplot(
        data=volcano_data,
        x='log2fc',
        y='padj',
        hue='significant',
        palette={'Not Significant': 'grey', 'Up-regulated': 'red', 'Down-regulated': 'blue', 'Significant': 'purple'},
        alpha=0.7,
        s=20
    )
    
    # 添加显著性阈值线
    plt.axhline(-np.log10(threshold), color='black', linestyle='--', alpha=0.5)
    plt.axvline(-1, color='black', linestyle='--', alpha=0.5)
    plt.axvline(1, color='black', linestyle='--', alpha=0.5)
    
    # 标记一些顶部基因
    top_genes = volcano_data.sort_values('padj', ascending=False).head(10)
    for _, gene in top_genes.iterrows():
        plt.text(gene['log2fc'], gene['padj'], gene['gene'], fontsize=8, ha='center')
    
    # 设置图表标题和标签
    plt.title(f"Volcano Plot: {drug} vs Control", fontsize=10)
    plt.xlim(-4, 4) 
    plt.xlabel("Log2 Fold Change", fontsize=10)
    plt.ylabel("-Log10 Adjusted P-value", fontsize=10)
    plt.gca().get_legend().remove()
    # plt.legend(title="Gene Status", fontsize=8)
    
    # 保存图表
    plt.tight_layout()
    plt.savefig(f"{fig_dir}/volcano_{drug}.png", dpi=500)
    plt.show()
    plt.close()
    print(f"Completed analysis for {drug}")

Processing glyconanomaterials for combating bacterial infections...


ValueError: reference = control needs to be one of groupby = ['6 Gy γ-ray treatment', '8 hours post infection', 'ACME HS dissociation', 'Bexmarilimab', 'CAR T cell therapy with SUV39H1 knockout', 'CAR T cell therapy, SUV39H1 knockout', 'CAR T cell therapy, SUV39H1 knockout, CD19 tumor cell treatment', 'Infected (Cal07, 16 hours, Replicate 2)', 'Infected with H1N1 (A/California/07/2009), 8 hours post infection', 'Infected with H3N2 (A/Perth/16/2009), 16 hours post infection', 'NS1 4xstop (mutant)', 'Ritonavir, gemcitabine, cisplatin', 'antineoplastic agents, mixed micelles for drug delivery', 'glyconanomaterials for combating bacterial infections', 'in vitro culture of iPSC-RPE', 'infected with H1N1 (A/California/07/2009)', 'infected with H3N2 (A/Perth/16/2009)', 'infected with H3N2 (A/Perth/16/2009), 16 hours post infection', 'infected with H3N2, Cetuximab (anti-EGFR therapy)', 'irradiated A549 cells (6 Gy γ-ray treatment)', 'irradiation', 'irradiation (IR 6h)', 'lentiviral pool for expression of 120 gRNAs, tamoxifen, puromycin', 'mixed sample treatments include DMSO, ARS-1620, and Vemurafenib', 'uninfected (Mock)', 'uninfected (mock treatment)', 'wild-type (WT) virus'].

In [10]:
combined

View of AnnData object with n_obs × n_vars = 218594 × 10607
    obs: 'gene_count', 'umi_count', 'SRX_accession', 'sample_id', 'batch', 'n_genes', 'tissue', 'disease', 'perturbation', 'cell_line'
    var: 'gene_symbols', 'feature_types', 'n_cells'