In [56]:
REPO = '../..'
Manuscript_RESULT = f'{REPO}/data/result/manuscript_table/'
import pandas as pd
import decoupler as dc
import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load the cell type/clinical annotation table (`obs` of the adata)

In [57]:
# Annotation table, read with its first column as the row index.
# Later cells rely on its `Celltype` and `Cellstate` columns.
obs_path = f"{Manuscript_RESULT}/GEX_OBS.csv"
obs = pd.read_csv(
    obs_path,
    index_col=0,
)

# Create a GMT file from the DEGs of each Tumor Cellstate

In [6]:
# Write one GMT line per tumor cell state:
#   <cellstate> \t All_cells \t gene_1 \t gene_2 ...
# The genes are the (at most) 100 significant up-regulated DEGs with the
# highest `stat`, read from the cell state's sheet in DEGs_Cellstate.xlsx.
with open(f'{Manuscript_RESULT}/Tumor_Cellstate.gmt','w') as f:
    for cellstate in obs.loc[obs.Celltype=='Tumor','Cellstate'].unique():
        print(cellstate)
        # The workbook sheets are looked up with '/' replaced by '_'.
        sheet_name = cellstate.replace('/','_')
        degs = pd.read_excel(f"{Manuscript_RESULT}/DEGs_Cellstate.xlsx",sheet_name=sheet_name,index_col=0)
        # Filter BEFORE sorting. Indexing the sorted index with a mask built
        # from the unsorted frame is positional, so the mask would be applied
        # to the wrong rows whenever the sheet is not already sorted by `stat`.
        significant_up = degs.loc[(degs.padj<0.05) & (degs.log2FoldChange>1)]
        top_genes = significant_up.sort_values('stat',ascending=False).index[:100].tolist()
        f.write('\t'.join([cellstate,'All_cells']+top_genes)+'\n')

Tumor.EMT-III
Tumor.EMT-II
Tumor.ER-II
Tumor.ER-I
Tumor.Cell_Cycle
Tumor.Interferon/MHCII(I)


# Prioritize markers for staining

In [58]:
# Read the GMT back as a table (one row per gene set) and transpose it so that
# every cell state becomes a column.
gmt_table = pd.read_csv(
    f"{Manuscript_RESULT}/Tumor_Cellstate.gmt",
    sep='\t',
    header=None,
    index_col=0,
).T

# Row label 1 is the GMT description field (`All_cells`), not a gene: it records
# that the DEGs were generated by comparing cellstates to all cells, including
# cells from other celltypes. Drop it and keep only the two states of interest.
candidate_sets = gmt_table.drop(index=1).loc[:, ['Tumor.EMT-III','Tumor.ER-I']]

# Long format: one (Group, Gene) row per candidate marker.
markers = candidate_sets.melt(var_name='Group',value_name='Gene')

# Genes that occur more than once cannot discriminate the two states; discard them.
gene_counts = markers['Gene'].value_counts()
non_uniq_markers = gene_counts[gene_counts>1].index.tolist()
markers = markers[~markers['Gene'].isin(non_uniq_markers)].set_index('Gene')
markers

Unnamed: 0_level_0,Group
Gene,Unnamed: 1_level_1
SLC5A6,Tumor.EMT-III
LINC00342,Tumor.EMT-III
TFDP1,Tumor.EMT-III
CDKAL1,Tumor.EMT-III
CEP295,Tumor.EMT-III
...,...
ZFAS1,Tumor.ER-I
WDR35,Tumor.ER-I
WDR12,Tumor.ER-I
CRACR2B,Tumor.ER-I


## Get the stats of the marker genes

In [64]:
# Container filled by the ranking cells below (one gene list per staining panel).
stain_genes = {}

# For every tumor cell state, look up the `stat` of each candidate marker in that
# state's sheet of DEGs_Cellstate.xlsx. The result has one row per marker gene,
# the `Group` column once, and one stat column per cell state.
tumor_cellstates = obs.loc[obs.Celltype=='Tumor','Cellstate'].unique()
per_state_frames = []
for position, cellstate in enumerate(tumor_cellstates):
    sheet_name = cellstate.replace('/','_')
    degs = pd.read_excel(f"{Manuscript_RESULT}/DEGs_Cellstate.xlsx",sheet_name=sheet_name,index_col=0)
    # Assignment aligns on the gene index, so markers absent from the sheet get NaN.
    state_frame = markers.assign(**{cellstate: degs.stat})
    if position > 0:
        # Only the first frame keeps `Group`, so the column is not repeated after concat.
        state_frame = state_frame.drop(columns=['Group'])
    per_state_frames.append(state_frame)
markers_stats = pd.concat(per_state_frames,axis=1)


In [65]:
# Rank the Tumor.EMT-III candidate markers by how much higher their `stat` is in
# Tumor.EMT-III than in every other tumor cell state.
emt = markers_stats.loc[markers_stats.Group=='Tumor.EMT-III',:].copy().drop(['Group'],axis=1)
# Per-gene margin = stat(Tumor.EMT-III) - stat(other state);
# rows are genes, columns are the other cell states.
mtrx = emt.drop(['Tumor.EMT-III'],axis=1).rsub(emt['Tumor.EMT-III'],axis=0)
# A missing stat gives a NaN margin; replace it with the smallest margin found in
# THIS matrix. The fill value is computed after `mtrx` is assigned, so the cell
# runs on a fresh kernel and never picks up a `mtrx` left over from another cell.
mtrx = mtrx.fillna(mtrx.min().min())
# Average margin across the other cell states, used as the ranking score.
mtrx['ave'] = mtrx.mean(axis=1)
mtrx = mtrx.sort_values('ave',ascending=False)
# Keep the 20 best-separated genes as the EMT staining panel.
stain_genes['EMT'] = mtrx.head(20).index
mtrx.head(20)

Unnamed: 0_level_0,Tumor.EMT-II,Tumor.ER-II,Tumor.ER-I,Tumor.Cell_Cycle,Tumor.Interferon/MHCII(I),ave
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PIWIL4,4.87772,8.264331,11.720908,10.359972,3.517325,7.748051
RARRES1,4.878602,8.94113,8.337165,8.495693,3.875129,6.905544
YEATS2,5.032181,7.463642,8.202885,6.141944,4.597363,6.287603
SGCZ,3.414773,7.999928,7.497089,7.489701,4.162728,6.112844
IL34,3.30117,7.67366,8.711297,7.518781,2.605888,5.962159
LINC00342,1.640528,6.254157,8.493016,2.328437,6.889505,5.121129
CDKAL1,6.581742,5.999477,5.482895,2.411675,4.661076,5.027373
CD82,3.102917,4.435741,8.184654,5.271405,1.828054,4.564554
SOX9-AS1,2.501111,4.176043,5.529014,4.294967,4.424217,4.185071
MED17,3.202518,4.042301,4.418479,4.998706,2.792001,3.890801


In [66]:
# Rank the Tumor.ER-I candidate markers by how much higher their `stat` is in
# Tumor.ER-I than in every other tumor cell state.
er1 = markers_stats.loc[markers_stats.Group=='Tumor.ER-I',:].copy().drop(['Group'],axis=1)
# Per-gene margin = stat(Tumor.ER-I) - stat(other state);
# rows are genes, columns are the other cell states.
mtrx = er1.drop(['Tumor.ER-I'],axis=1).rsub(er1['Tumor.ER-I'],axis=0)
# A missing stat gives a NaN margin; replace it with the smallest margin found in
# THIS matrix. The fill value is computed after `mtrx` is assigned, so it no
# longer comes from whatever `mtrx` an earlier cell happened to leave behind.
mtrx = mtrx.fillna(mtrx.min().min())
# Average margin across the other cell states, used as the ranking score.
mtrx['ave'] = mtrx.mean(axis=1)
mtrx = mtrx.sort_values('ave',ascending=False)
# Keep the 20 best-separated genes as the ER staining panel.
stain_genes['ER'] = mtrx.head(20).index
mtrx.head(20)

Unnamed: 0_level_0,Tumor.EMT-III,Tumor.EMT-II,Tumor.ER-II,Tumor.Cell_Cycle,Tumor.Interferon/MHCII(I),ave
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENTPD5,8.314571,3.113009,-0.420884,4.266849,5.401331,4.134975
MIR2052HG,6.852251,4.029267,2.074146,2.458615,5.00578,4.084012
STX18-AS1,5.020838,3.517517,2.745564,2.981204,5.629796,3.978984
TLE3,8.073721,2.211885,2.699135,2.027988,2.613007,3.525147
AC090709.1,5.21696,3.215216,3.341391,2.022966,3.657164,3.490739
LINC00862,6.624972,2.507093,-0.203422,3.282306,5.13986,3.470162
LRRC37A3,5.91421,2.844462,0.186358,2.940906,5.346445,3.446476
AF127577.2,7.364883,2.71825,1.212693,1.876945,3.263594,3.287273
REPS2,4.121734,2.738343,3.385627,1.884437,3.946832,3.215395
ZNF420,6.276427,2.457046,0.235467,2.141359,4.918051,3.20567


In [70]:
# One column per staining panel (the keys of `stain_genes`), one gene per row;
# written tab-separated without the row index.
staining_table = pd.DataFrame(stain_genes)
staining_table.to_csv(
    f'{Manuscript_RESULT}/EMT_ER_Staining_geneset.tsv',
    sep='\t',
    index=False,
)

# Activity inference with Multivariate Linear Model (MLM)

In [12]:
# PROGENy network used as the prior for the MLM runs below.
# `top=500` is forwarded to decoupler (presumably the number of target genes
# kept per pathway; confirm against the decoupler documentation).
progeny = dc.get_progeny(
    organism='human',
    top=500,
)

In [13]:
# Store the exact network that was used, next to the other manuscript tables.
progeny.to_csv(
    f"{Manuscript_RESULT}/progeny_reference.csv",
    index=False,
)

In [14]:
# Infer PROGENy pathway activities with decoupler's MLM for every DEG contrast:
# each cell state of CD8T / Myeloid / Tumor, plus the aggregated tumor states.
# Each contrast contributes one row (labelled with the state name) to the
# activity table and to the p-value table, which are saved as two sheets of
# Progeny.xlsx.

# Collect (workbook file, sheet name, row label) for every contrast to score.
contrasts = []
for celltype in ['CD8T','Myeloid','Tumor']:
    cellstates = obs.loc[obs.Celltype==celltype,'Cellstate'].unique()
    for cellstate in cellstates:
        # The workbook sheets are looked up with '/' replaced by '_'.
        contrasts.append((f"DEGs_{celltype}.xlsx", cellstate.replace('/','_'), cellstate))

## Tumor aggr state
for cellstate in ['EMT','ER','Interferon','Cell_Cycle']:
    # Each aggregated state reads its OWN sheet. Reusing the `sheet_name` left over
    # from the per-cell-state loop made all four states load the same sheet.
    # NOTE(review): assumes the sheets of DEGs_Tumor_Aggr.xlsx are named after the
    # aggregated states listed here; confirm against the workbook.
    contrasts.append(("DEGs_Tumor_Aggr.xlsx", cellstate.replace('/','_'), cellstate))

pathway_acts = []
pathway_pvals = []
for workbook, sheet_name, label in contrasts:
    # One-row matrix (row = contrast label, columns = genes) holding the `stat` values.
    stat_df = pd.read_excel(f"{Manuscript_RESULT}/{workbook}",sheet_name=sheet_name,index_col=0)[['stat']].T.rename(index={'stat':label})
    # Infer pathway activities with mlm
    pathway_act, pathway_pval = dc.run_mlm(mat=stat_df, net=progeny)
    pathway_acts.append(pathway_act)
    pathway_pvals.append(pathway_pval)

pathway_acts = pd.concat(pathway_acts,axis=0)
pathway_pvals = pd.concat(pathway_pvals,axis=0)
# store result
with pd.ExcelWriter(f"{Manuscript_RESULT}/Progeny.xlsx") as f:
    pathway_acts.to_excel(f,sheet_name='Activity',index=True)
    pathway_pvals.to_excel(f,sheet_name='Pvalue',index=True)