In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from adjustText import adjust_text

# Plotting defaults used by the figures in this notebook.
sc.settings.set_figure_params(dpi=100, frameon=False)
plt.rc('figure', figsize=(10, 8))
sns.set_style("whitegrid")

# Load the previously saved combined dataset, then drop cells with fewer than
# 200 detected genes and genes detected in fewer than 100 cells.
combined_data_file = "a549_combined_data.h5ad"
combined = sc.read_h5ad(combined_data_file)
sc.pp.filter_cells(combined, min_genes=200)
sc.pp.filter_genes(combined, min_cells=100)
print(f"Combined data shape: {combined.shape}")

# Discard cells whose drug label is 'unsure'.
keep_known_drug = combined.obs['drug'] != 'unsure'
combined = combined[keep_known_drug]
print(f"Filtered data shape: {combined.shape}")

# Report which drug labels remain after filtering.
drug_labels = combined.obs['drug']
print(f"Available drugs: {drug_labels.unique()}")
print(f"Number of drugs: {drug_labels.nunique()}")

In [2]:
# Output locations: everything lives under results_dir, and the
# causal-discovery artifacts go into its "figures_cd" subfolder.
results_dir = "a549_perturbation_analysis_loka"
fig_dir = os.path.join(results_dir, "figures_cd")
if not os.path.isdir(fig_dir):
    # Creates results_dir as well when it does not exist yet.
    os.makedirs(fig_dir)

In [None]:
# Show the distinct drug labels followed by the distinct perturbation labels.
print(*(combined.obs[column].unique().tolist() for column in ('drug', 'perturbation')))

In [None]:
# Keep only non-control cells. This rebinds `combined`, so the control cells
# are no longer reachable from this name in later cells.
is_not_control = combined.obs['drug'] != 'control'
combined = combined[is_not_control]
print(f"Filtered data shape: {combined.shape}")

In [None]:
# Gene table with (at least) the columns 'Gene Name' and 'Ensembl ID'.
grok_gene = pd.read_csv("a549_perturbation_analysis_loka/Bexmarilimab_Related_Genes_grok.csv")
grok_g = grok_gene['Ensembl ID'].tolist()

# Ensembl ID -> gene name lookup; if an Ensembl ID appears on several rows,
# the last row's gene name is the one that is kept.
my_dict = dict(zip(grok_g, grok_gene['Gene Name'].tolist()))

grok_gene.head()

In [None]:
import os
import glob
import pandas as pd 
import scanpy as sc
import numpy as np
import scipy.sparse

def _expressing_cells_table(adata, genes):
    """Subset ``adata`` to ``genes`` and keep cells with any positive value.

    Parameters
    ----------
    adata : AnnData
        Object to subset. It is only indexed and copied, never modified.
    genes : list of str
        Variable names to keep; each must be present in ``adata.var_names``.

    Returns
    -------
    tuple
        ``(subset, frame)``: the filtered AnnData and a dense cells x genes
        ``pandas.DataFrame`` built from its ``X`` (index = ``obs_names``,
        columns = ``var_names``).
    """
    subset = adata[:, genes].copy()
    print(subset.shape)

    print("筛选基因表达大于0的细胞...")
    # Count positive entries per cell on the matrix as stored, so a sparse X
    # does not have to be densified just to build the mask.
    positives_per_cell = np.asarray((subset.X > 0).sum(axis=1)).ravel()
    cells_with_expression = positives_per_cell > 0
    subset = subset[cells_with_expression]

    print(f"筛选后的数据形状: {subset.shape}")
    print(f"保留了 {cells_with_expression.sum()} 个有基因表达的细胞")
    print(f"移除了 {len(cells_with_expression) - cells_with_expression.sum()} 个无基因表达的细胞")
    # A cell with at least one positive value cannot be an all-zero row, so no
    # separate all-zero pass is needed; the message is kept so logs stay
    # comparable with earlier runs.
    print(f"删除全0行后的数据形状: {subset.shape}")

    # Densify once, and only the already-filtered matrix.
    dense = subset.X.toarray() if scipy.sparse.issparse(subset.X) else subset.X
    frame = pd.DataFrame(dense, index=subset.obs_names, columns=list(subset.var_names))
    print(frame.shape)
    return subset, frame


# DEG tables to process. Only the bexmarilimab-vs-control table is used here;
# to process every table in the folder, replace this list with
# glob.glob(os.path.join("a549_perturbation_analysis_loka/figures", "*.csv")).
csv_files = ['a549_perturbation_analysis_loka/figures/bexmarilimab_vs_control_DEGs.csv']

# Not populated in this cell; kept in case other cells reference it.
perturbation_list = []

# Extraction mode:
#   1 -> genes from the grok table that are also significant DEGs,
#        restricted to bexmarilimab-treated cells
#   2 -> every significant DEG, across all cells in `combined`
flag = 1

# Genes that survived the earlier filtering of `combined`; indexing by a name
# that is not in var_names raises KeyError.
dataset_genes = set(combined.var_names)

for file in csv_files:
    data = pd.read_csv(file, index_col=0)
    # Significant DEGs: |log fold change| > 1 and adjusted p-value < 0.01.
    is_significant = (data['logfoldchanges'].abs() > 1) & (data['pvals_adj'] < 0.01)
    subdata = data[is_significant]
    gene_list = subdata['names'].tolist()
    print(f"找到 {len(gene_list)} 个差异表达基因")
    print("ENSG00000010327" in gene_list)

    if flag == 1:
        # Grok-table genes that are significant DEGs AND present in the dataset.
        deg_genes = set(gene_list)
        candidate_genes = [gene for gene in grok_g if gene in deg_genes]
        valid_genes = [gene for gene in candidate_genes if gene in dataset_genes]
        if len(valid_genes) < len(candidate_genes):
            print(f"Skipping {len(candidate_genes) - len(valid_genes)} gene(s) absent from combined.var_names")
        print(f"在数据集中找到 {len(valid_genes)} 个有效基因")

        treated_cells = combined[combined.obs['drug'] == 'bexmarilimab']
        gene_data, gene_df = _expressing_cells_table(treated_cells, valid_genes)

        # A later cell reads this file as the causal-discovery input, so create
        # it when it is missing; an existing file is never overwritten.
        # NOTE(review): "5286_14" looks like a cells x genes shape baked into
        # the name - confirm it still matches gene_df.shape for current data.
        cd_input_csv = "a549_perturbation_analysis_loka/figures_cd/bexmarilimab_perturb_5286_14.csv"
        if not os.path.exists(cd_input_csv):
            gene_df.to_csv(cd_input_csv)

    elif flag == 2:
        # Every significant DEG that is present in the dataset, over all cells.
        present_genes = [gene for gene in gene_list if gene in dataset_genes]
        if len(present_genes) < len(gene_list):
            print(f"Skipping {len(gene_list) - len(present_genes)} gene(s) absent from combined.var_names")

        gene_data, gene_df = _expressing_cells_table(combined, present_genes)
        gene_df.to_csv("a549_perturbation_analysis_loka/figures_cd/bexmarilimab_perturb_22w_9k.csv")

In [None]:
from causallearn.search.ConstraintBased.PC import pc

# Load the saved cells x genes table (first CSV column is the row index),
# coerce every column to float and drop rows containing missing values.
data = (
    pd.read_csv(
        "a549_perturbation_analysis_loka/figures_cd/bexmarilimab_perturb_5286_14.csv",
        index_col=0,
    )
    .astype(float)
    .dropna()
)
data.head()

In [None]:
# Translate each column's Ensembl ID into its gene name, preserving column
# order; a column missing from my_dict raises KeyError.
labels = [my_dict[ensembl_id] for ensembl_id in data.columns]
(labels, len(labels))

In [None]:
from causallearn.utils.PCUtils.BackgroundKnowledge import BackgroundKnowledge

# First PC pass without constraints; it is run here to obtain the graph's node
# objects, which the background-knowledge API takes as arguments.
cg = pc(data.values, alpha=0.05, indep_test="fisherz")
nodes = cg.G.get_nodes()

# Register a required relation from the first node to every other node.
# NOTE(review): nodes[0] presumably corresponds to the first column of `data`
# - confirm that this is the intended source variable.
bk = BackgroundKnowledge()
for target_node in nodes[1:]:
    bk.add_required_by_node(nodes[0], target_node)

In [None]:
# Second PC pass: same data, significance level and independence test as the
# first run, now constrained by the background knowledge in `bk`.
cg = pc(
    data.to_numpy(),
    alpha=0.05,
    indep_test="fisherz",
    background_knowledge=bk,
)

In [None]:
# Render the graph held in `cg`, passing the gene names in `labels` as node labels.
cg.draw_pydot_graph(labels=labels)