# Differential gene expression analysis per cell-type between conditions

In [6]:
import pandas as pd
import numpy as np
import os
import subprocess as sp
import decoupler as dc
import scanpy as sc


In [7]:
# Directory holding the R 4.2.3 executables; it is prepended to PATH in the
# next cell.
path_to_R = "/usr/local/bioinf/R/R-4.2.3/bin/"

In [8]:
# Put the R installation in front of the existing PATH entries so that it is
# searched first by processes started from this notebook.
os.environ["PATH"] = os.pathsep.join((path_to_R, os.environ["PATH"]))
# os.environ["PATH"]


In [9]:
# Load data: read the integrated, annotated atlas from its .h5ad file
input_adata = "/data/projects/2023/atlas_protocol/input_data_zenodo/atlas-integrated-annotated.h5ad"
adata = sc.read_h5ad(input_adata)

In [10]:
# Processing: optional sanity checks (uncomment to run)
# Gene symbols are required
#adata.var.head()
# All cells are annotated
#adata.obs["cell_type"].isnull().value_counts()
#adata.obs["cell_type_coarse"].unique()

In [19]:
# Restrict the atlas to primary-tumor cells whose condition is LUAD or LUSC,
# then collect the coarse cell-type labels that remain after filtering.
is_primary_tumor = adata.obs["origin"].isin(["tumor_primary"])
is_luad_or_lusc = adata.obs["condition"].isin(["LUAD", "LUSC"])
adata = adata[is_primary_tumor & is_luad_or_lusc]
cell_type = [*adata.obs["cell_type_coarse"].unique()]

In [20]:
# One subset of `adata` per coarse cell type, keyed by "<cell_type>_adata"
# with spaces in the cell-type label replaced by underscores.
adata_dict = {
    f"{label.replace(' ', '_')}_adata": adata[adata.obs["cell_type_coarse"].isin([label])]
    for label in cell_type
}

In [21]:
# Compute the neighborhood graph on the scANVI latent representation (currently disabled)
#sc.pp.neighbors(adata,use_rep="X_scANVI")

In [22]:

# Path to the DESeq2 R script that run_deseq() executes (relative path)
deseq = "../../bin/deseq2.R"
# Base directory for DESeq2 results; a per-contrast sub-directory is appended later
deseq_results = "/data/projects/2023/atlas_protocol/results/differential_expression/deseq_resdir"
# Directory that receives the pseudobulk sample sheets and count tables
result_dir = deseq_results
#result_dir = "/data/projects/2023/atlas_protocol/results/differential_expression/cell_type"
# Value passed to the DESeq2 script via --cpus
cpus = 6

In [23]:
def run_deseq(count_table, sample_sheet, deseq_prefix, contrast, deseq_resdir):
    """Run the DESeq2 R script for one contrast and write its output to log files.

    The script path and the CPU count are taken from the module-level
    variables ``deseq`` and ``cpus``.

    Parameters
    ----------
    count_table
        Path of the count table, passed as first positional argument.
    sample_sheet
        Path of the sample sheet, passed as second positional argument.
    deseq_prefix
        Value for ``--prefix``; also the base name of the ``.log`` (stdout)
        and ``.err`` (stderr) files written to ``deseq_resdir``.
    contrast
        Two-element sequence; element 0 is passed as ``--c1`` and element 1
        as ``--c2``.
    deseq_resdir
        Value for ``--resDir``; the directory is created if it is missing.
    """
    os.makedirs(deseq_resdir, exist_ok=True)

    deseq_cmd = [
        deseq,
        count_table,
        sample_sheet,
        "--cond_col", "condition",
        "--c1", contrast[0],
        "--c2", contrast[1],
        "--resDir", deseq_resdir,
        "--prefix", deseq_prefix,
        "--cpus", str(cpus),
        "--save_workspace",
    ]

    log_path = os.path.join(deseq_resdir, deseq_prefix + ".log")
    err_path = os.path.join(deseq_resdir, deseq_prefix + ".err")

    # Context managers guarantee both log files are closed even if launching
    # the script raises (e.g. the script is missing or not executable).
    with open(log_path, "w") as stdout, open(err_path, "w") as stderr:
        result = sp.run(deseq_cmd, stdout=stdout, stderr=stderr)

    # Do not abort the surrounding loop on failure, but make it visible.
    if result.returncode != 0:
        print(
            f"DESeq2 run '{deseq_prefix}' exited with code {result.returncode}; "
            f"see {err_path}"
        )

In [24]:
def save_pseudobulk(pb, samplesheet_filename, counts_filename):
    """Write the sample sheet and the count table of a pseudobulk object as CSV.

    Parameters
    ----------
    pb
        Pseudobulk object providing ``.obs`` (sample annotation data frame)
        and ``.to_df()`` (samples x genes data frame).
    samplesheet_filename
        Destination of the sample sheet; ``pb.obs`` with its index moved
        into a regular column.
    counts_filename
        Destination of the count table; genes in rows (index named
        ``gene_id``) and samples in columns.

    Missing parent directories of both destinations are created.
    """
    # Create the target directories up front so a fresh results tree does not
    # make to_csv() fail.
    for destination in (samplesheet_filename, counts_filename):
        if isinstance(destination, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(destination))
            if parent:
                os.makedirs(parent, exist_ok=True)

    samplesheet = pb.obs.reset_index()

    # Columns keep the row labels of pb.to_df() unchanged, so they stay
    # identical to the sample identifiers stored in the sample sheet.
    bulk_df = pb.to_df().T
    bulk_df.index.name = "gene_id"

    samplesheet.to_csv(samplesheet_filename)
    bulk_df.to_csv(counts_filename)

In [25]:
def fix_sample_ids(pb):
    """Map each sample identifier to itself without the trailing ``_<condition>``.

    Parameters
    ----------
    pb
        Object whose ``.obs`` data frame is indexed by sample identifier and
        has a ``condition`` column.

    Returns
    -------
    dict
        ``{original_id: stripped_id}``. Identifiers that do not end with
        ``_<condition>`` are mapped to themselves.
    """
    repl = {}
    # astype(str) keeps this working for categorical condition columns.
    conditions = pb.obs["condition"].astype(str)
    for sample_id, condition in zip(pb.obs.index, conditions):
        suffix = "_" + condition
        # Strip only the trailing suffix; an identical fragment in the middle
        # of the identifier is part of the sample name and must be kept.
        if sample_id.endswith(suffix):
            repl[sample_id] = sample_id[: -len(suffix)]
        else:
            repl[sample_id] = sample_id

    return repl

In [29]:
# Run DESeq2 (LUSC vs. LUAD) on pseudobulk profiles, one cell type at a time.

# The contrast and its output directory do not depend on the cell type.
contrast = ["LUSC", "LUAD"]
contrast_str = f"{contrast[0]}_vs_{contrast[1]}"
deseq_resdir = f"{deseq_results}/{contrast_str}"

# Sample sheets and count tables are written to result_dir before DESeq2 is
# started, so the directory has to exist up front.
os.makedirs(result_dir, exist_ok=True)

for ct, tmp_ad in adata_dict.items():
    # Sum raw counts per sample/condition combination.
    pb = dc.get_pseudobulk(
        tmp_ad,
        sample_col='sample',
        groups_col='condition',
        layer='raw_counts',
        mode='sum',
        min_prop=0.05,
        min_cells=10,
        min_counts=1000,
        min_smpls=2
    )

    # A contrast needs both conditions to be present in the pseudobulk.
    if pb.obs["condition"].nunique() <= 1:
        print(f"Cell type {ct} does not have enough replicates per group")
        continue

    # Make the cell-type label safe for use in file names.
    ct_fname = ct.replace(" ", "_").replace("/", "_")
    deseq_prefix = f"{contrast_str}_{ct_fname}"

    sample_sheet = f"{result_dir}/{deseq_prefix}.samplesheet.csv"
    count_table = f"{result_dir}/{deseq_prefix}.counts.csv"

    save_pseudobulk(pb, sample_sheet, count_table)
    run_deseq(count_table, sample_sheet, deseq_prefix, contrast, deseq_resdir)
    

KeyboardInterrupt: 