# Differential gene expression analysis per cell-type between conditions

In [None]:
#Load libraries
import pandas as pd
import numpy as np
import scanpy as sc
import decoupler as dc
import biomart 
import os


In [None]:
# Load data: read the integrated, annotated atlas from its .h5ad file
input_adata = "/data/projects/2023/atlas_protocol/input_data_zenodo/atlas-integrated-annotated.h5ad"
adata = sc.read_h5ad(filename=input_adata)

In [None]:
# Processing
# Gene symbols are required: preview the first rows of the gene annotation
# table to check what the variables are named.
adata.var.head(n=5)

In [None]:
# Sanity check: tally cells with / without a cell-type label.
# All cells are expected to be annotated (i.e. only `False` is counted).
adata.obs["cell_type"].isna().value_counts()

In [None]:
# Keep only primary-tumor cells from LUAD or LUSC samples.
# Both filters are combined into a single boolean mask, and the result is
# materialised with `.copy()`: the following steps (neighbors, UMAP) write
# into `adata`, which should not happen on a view of the full atlas.
is_primary_tumor = adata.obs["origin"] == "tumor_primary"
is_luad_or_lusc = adata.obs["condition"].isin(["LUAD", "LUSC"])
adata = adata[is_primary_tumor & is_luad_or_lusc].copy()

In [None]:
# Compute distances and find cell neighbors using the `X_scANVI`
# representation (selected via `use_rep`), not PCA coordinates
sc.pp.neighbors(adata, use_rep="X_scANVI")

In [None]:
# Generate UMAP features; `init_pos="X_umap"` names the key whose stored
# coordinates are used as the initial positions
sc.tl.umap(adata, init_pos="X_umap")
# Visualize the embedding coloured by cell type, without the axes frame
sc.pl.umap(adata, color=["cell_type"], frameon=False)

In [None]:
# Get pseudo-bulk profile: aggregate the `raw_counts` layer by summing,
# with `patient` as the sample column and `cell_type` as the group column.
pseudobulk_args = {
    "sample_col": "patient",
    "groups_col": "cell_type",
    "layer": "raw_counts",
    "mode": "sum",
    # Filtering thresholds passed to decoupler (minimum cells / counts)
    "min_cells": 10,
    "min_counts": 1000,
}
pdata = dc.get_pseudobulk(adata, **pseudobulk_args)
pdata

In [None]:
# Cell type to analyse, e.g. "Tumor cells"
cell_type_name = "Tumor cells"
# Restrict the pseudo-bulk object to the profiles of that cell type
is_selected_cell_type = pdata.obs["cell_type"] == cell_type_name
pdata_cell = pdata[is_selected_cell_type]

In [None]:
# Create the counts table (genes as rows, one column per patient) for the
# selected cell type and write it as a tab-separated file.
patient = pdata_cell.obs["patient"]  # patient id, used to label the samples
gene_symbol = pdata_cell.var.index  # gene id as index
counts_df = pd.DataFrame(data=pdata_cell.X, columns=gene_symbol, index=patient)
counts_df.index.name = None  # clear the index name so it is not written as a label
counts_df = counts_df.T  # transpose: genes x patients

resDir = '/data/projects/2023/atlas_protocol/results/differential_expression/'
# Create the output directory up front: `to_csv` fails if it does not exist
os.makedirs(resDir, exist_ok=True)

# Spaces are stripped from the cell-type name; the stripped name is reused
# for the other output file names of this cell type
cell_type_name = cell_type_name.replace(" ", "")
filename_co = f"{cell_type_name}_counts.tsv"
file_path = os.path.join(resDir, filename_co)
counts_df.to_csv(file_path, sep="\t", index=True)


In [None]:
# Sample sheet: one row per pseudo-bulk sample of the selected cell type.
# More columns can be added to be further used as covariates.
covariates = ['sex', 'ever_smoker', 'condition', 'age', 'tumor_stage', 'study', 'platform']
# Take an explicit copy so that adding a column below never writes into a
# slice of `pdata_cell.obs`
samplesheet_df = pdata_cell.obs.loc[:, covariates].copy()

# Use the patient id as sample id so that it matches the column names of the
# counts table, which is labelled with `pdata_cell.obs["patient"]`.
# NOTE(review): the obs index of a decoupler pseudo-bulk is expected to be
# "<sample>_<group>" and would therefore not match those columns — confirm
# against the sample column the downstream DESeq2 script joins on.
samplesheet_df["sample"] = pdata_cell.obs["patient"]

# Rename columns: the condition is exported under the name "group"
samplesheet_df = samplesheet_df.rename(columns={"condition": "group"})

# Make sure the output directory exists before writing
os.makedirs(resDir, exist_ok=True)
filename_co = f"{cell_type_name}_samplesheet.csv"
file_path = os.path.join(resDir, filename_co)
samplesheet_df.to_csv(file_path, sep=",", index=False)


In [None]:
#!./deseq2.R --input whatever.csv