In [2]:
# Repository root, relative to this notebook. Written without a trailing slash
# because every derived path below inserts its own '/' separator; with '../'
# the resulting paths contained a doubled slash ('..//data/...').
REPO = '..'
# Data sub-folders referenced throughout the notebook.
RESULT_TABLE = f'{REPO}/data/table'
RESULT_OBJ = f'{REPO}/data/object'
FIGURE_FOLDER = f'{REPO}/data/figure'
SETTING_FOLDER = f'{REPO}/data/setting'
EXTERNAL_DATA = f'{REPO}/data/external'

import os
import sys
import glob
import scanpy as sc
import anndata
sys.path.append(REPO)
from settings import COLOR_PAlETTE
from utils.visual import *
import warnings
import commentjson
import pickle

### Additional Colormap
with open(f'{SETTING_FOLDER}/ColorMap.json','r') as f:
    colormap = commentjson.load(f)
COLOR_PAlETTE.update(colormap)
plt.style.use(f'{REPO}/paper.mplstyle')

warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

In [4]:
# NOTE(review): `_stderr` and `null` are not used by any visible cell; they are
# kept because other cells may rely on them. `null` stays open for the session.
_stderr = sys.stderr
null = open(os.devnull,'wb')
import dill

# Load the SCENIC+ object produced upstream. The file is opened in a `with`
# block so the handle is closed as soon as unpickling finishes.
# SECURITY: dill, like pickle, can execute arbitrary code while loading;
# only load files that come from a trusted source.
with open(f'{RESULT_OBJ}/scenic/tumor/scplus_obj.pkl', 'rb') as scplus_file:
    scplus_obj = dill.load(scplus_file)

FileNotFoundError: [Errno 2] No such file or directory: '..//data/object/scenic/tumor/scplus_obj.pkl'

## Simplifying and filtering SCENIC+ output

In [3]:
# Show which result slots are present in the SCENIC+ object's `uns` mapping.
scplus_obj.uns.keys()

dict_keys(['Cistromes', 'search_space', 'region_to_gene', 'TF2G_adj', 'eRegulons', 'eRegulon_metadata', 'eRegulon_signatures', 'eRegulon_AUC', 'Pseudobulk', 'TF_cistrome_correlation', 'eRegulon_AUC_thresholds'])

In [4]:
from scenicplus.preprocessing.filtering import apply_std_filtering_to_eRegulons
# Run SCENIC+'s standard eRegulon filtering on scplus_obj.
# Later cells read 'eRegulon_signatures_filtered' and 'eRegulon_metadata_filtered'
# from scplus_obj.uns; presumably this call is what writes them — TODO confirm.
apply_std_filtering_to_eRegulons(scplus_obj)

Only keeping positive R2G
Only keep extended if not direct
Getting signatures...
Simplifying eRegulons ...


## eRegulon enrichment scores

In [None]:
from scenicplus.eregulon_enrichment import score_eRegulons

# Load the region and gene rankings calculated earlier by the wrapper function.
# Each file is opened in a `with` block so the handle is released after loading.
# SECURITY: dill can execute arbitrary code while loading; trusted files only.
with open(f'{RESULT_OBJ}/scenic/tumor/region_ranking.pkl', 'rb') as ranking_file:
    region_ranking = dill.load(ranking_file)
with open(f'{RESULT_OBJ}/scenic/tumor/gene_ranking.pkl', 'rb') as ranking_file:
    gene_ranking = dill.load(ranking_file)

# Score the filtered eRegulon signatures twice, once per enrichment type.
# Both calls use the same `key_added`, as in the original cell.
score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures_filtered',
                key_added = 'eRegulon_AUC_filtered',
                enrichment_type = 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = 5)
score_eRegulons(scplus_obj,
                ranking = gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures_filtered',
                key_added = 'eRegulon_AUC_filtered',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = 5)

## eRegulon dimensionality reduction

In [None]:
from scenicplus.dimensionality_reduction import run_eRegulons_tsne, run_eRegulons_umap

# Compute both embeddings from the filtered eRegulon AUC values. Each one is
# stored under a fixed reduction name, so re-running overwrites the previous result.
for reducer, embedding_name in (
    (run_eRegulons_umap, 'eRegulons_UMAP'),
    (run_eRegulons_tsne, 'eRegulons_tSNE'),
):
    reducer(
        scplus_obj = scplus_obj,
        auc_key = 'eRegulon_AUC_filtered',
        reduction_name = embedding_name,
    )

In [None]:
# Display the per-cell 'Cellstate' annotation used to group cells in the cells below.
scplus_obj.metadata_cell['Cellstate']

In [None]:

# Build a minimal AnnData that carries only the cell-state labels (obs) and the
# eRegulon t-SNE coordinates (obsm['X_tsne']) so scanpy can draw the embedding.
cellstate_obs = scplus_obj.metadata_cell[['Cellstate']]
tsne_coords = scplus_obj.dr_cell['eRegulons_tSNE'].values
adata = anndata.AnnData(obs=cellstate_obs, obsm={'X_tsne': tsne_coords})

# Categorical labels plus a colour list in category order, taken from the project palette.
adata.obs['Cellstate'] = pd.Categorical(adata.obs['Cellstate'])
cellstate_palette = COLOR_PAlETTE['Cellstate']
adata.uns['Cellstate_colors'] = [
    cellstate_palette[state] for state in adata.obs['Cellstate'].cat.categories
]

tsne_title = f"Transcriptional factors and expression of their target genes (eRegulons)\n(N={adata.shape[0]:,})"
sc.pl.tsne(adata, color=['Cellstate'], title=[tsne_title])

In [None]:
from scenicplus.dimensionality_reduction import plot_metadata_given_ax
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#specify color_dictionary


fig, axs = plt.subplots(ncols=2, figsize = (16, 8))
plot_metadata_given_ax(
    scplus_obj=scplus_obj,
    ax = axs[0],
    reduction_name = 'eRegulons_UMAP',
    variable = 'Cellstate', #note the GEX_ prefix, this metadata originated from the gene expression metadata (on which we did the cell type annotation before)
    # color_dictionary={'GEX_celltype': color_dict}
)
plot_metadata_given_ax(
    scplus_obj=scplus_obj,
    ax = axs[1],
    reduction_name = 'eRegulons_tSNE',
    variable = 'Cellstate', #note the GEX_ prefix, this metadata originated from the gene expression metadata (on which we did the cell type annotation before)
    # color_dictionary={'GEX_celltype': color_dict}
)
fig.tight_layout()
sns.despine(ax = axs[0]) #remove top and right edge of axis border
sns.despine(ax = axs[1]) #remove top and right edge of axis border
plt.show()

## Correlation between TF expression and target region enrichment scores (AUC values)

In [None]:
from scenicplus.cistromes import TF_cistrome_correlation, generate_pseudobulks

# Signature type -> key under which its TF/cistrome correlation table is stored.
correlation_keys = {
    'Gene_based': 'filtered_gene_based',
    'Region_based': 'filtered_region_based',
}

# Step 1: pseudobulks per 'Cellstate' for both signature types.
for signature_type in correlation_keys:
    generate_pseudobulks(
            scplus_obj = scplus_obj,
            variable = 'Cellstate',
            auc_key = 'eRegulon_AUC_filtered',
            signature_key = signature_type)

# Step 2: correlations computed on those pseudobulks, again for both signature types.
for signature_type, correlation_key in correlation_keys.items():
    TF_cistrome_correlation(
                scplus_obj,
                use_pseudobulk = True,
                variable = 'Cellstate',
                auc_key = 'eRegulon_AUC_filtered',
                signature_key = signature_type,
                out_key = correlation_key)

## Select eRegulons for which the correlation coefficient is above 0.70 or below -0.75

In [None]:
import numpy as np
import seaborn as sns

# Region-based TF/cistrome correlation table computed in the previous cell.
region_corr = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']
# Cistrome names end in '(<N>r)'; pull out N, the number of target regions.
n_targets = [int(x.split('(')[1].replace('r)', '')) for x in region_corr['Cistrome']]
rho = region_corr['Rho'].to_list()
adj_pval = region_corr['Adjusted_p-value'].to_list()

# Selection cut-offs, reused by the next cell: keep rho < rho[0] or rho > rho[1].
thresholds = {
        'rho': [-0.75, 0.70],
        'n_targets': 0
}

fig, ax = plt.subplots(figsize = (10, 5))
# The scatter handle is deliberately NOT called `sc`: that name is the scanpy
# alias imported at the top of the notebook, and rebinding it breaks any
# `sc.pl...` cell that is (re-)run after this one.
scatter_points = ax.scatter(rho, n_targets, c = -np.log10(adj_pval), s = 5)
ax.set_xlabel('Correlation coefficient')
ax.set_ylabel('nr. target regions')
# Dashed vertical lines and text labels mark the two rho cut-offs.
ax.vlines(x = thresholds['rho'], ymin = 0, ymax = max(n_targets), color = 'black', ls = 'dashed', lw = 1)
ax.text(x = thresholds['rho'][0], y = max(n_targets), s = str(thresholds['rho'][0]))
ax.text(x = thresholds['rho'][1], y = max(n_targets), s = str(thresholds['rho'][1]))
sns.despine(ax = ax)
fig.colorbar(scatter_points, label = '-log10(adjusted_pvalue)', ax = ax)
plt.show()

## Overlap of predicted target regions

In [None]:
# Keep cistromes whose rho lies outside the [low, high] band defined in `thresholds`.
region_corr_table = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']
rho_low, rho_high = thresholds['rho'][0], thresholds['rho'][1]
outside_band = (region_corr_table['Rho'] > rho_high) | (region_corr_table['Rho'] < rho_low)
selected_cistromes = region_corr_table.loc[outside_band]['Cistrome'].to_list()

# Drop the trailing '_(<N>r)' / '_(<N>g)' size suffix so that gene- and
# region-based signatures of the same eRegulon can be matched by name.
selected_eRegulons = [name.split('_(')[0] for name in selected_cistromes]
filtered_signatures = scplus_obj.uns['eRegulon_signatures_filtered']
selected_eRegulons_gene_sig = [
        name for name in filtered_signatures['Gene_based'].keys()
        if name.split('_(')[0] in selected_eRegulons]
selected_eRegulons_region_sig = [
        name for name in filtered_signatures['Region_based'].keys()
        if name.split('_(')[0] in selected_eRegulons]

#save the results in the scenicplus object
scplus_obj.uns['selected_eRegulon'] = {
        'Gene_based': selected_eRegulons_gene_sig,
        'Region_based': selected_eRegulons_region_sig,
}
print(f'selected: {len(selected_eRegulons_gene_sig)} eRegulons')

## Overlap of target regions of the top 5 TFs per cell type based on the Regulon Specificity Score (RSS).

In [None]:
from scenicplus.RSS import *

# Exclude negative eRegulons before computing the RSS. Signature names in this
# notebook look like 'TCF7L2_+_(325r)', i.e. the sign is a token delimited by
# underscores, so negative ones are matched on '_-_' (presumed form 'TF_-_(Nr)').
# Testing for a bare '-' would also discard positive eRegulons of TFs whose
# symbol contains a hyphen (e.g. 'NKX2-1_+_(10r)').
positive_region_eRegulons = [
        name for name in scplus_obj.uns['selected_eRegulon']['Region_based']
        if '_-_' not in name]

regulon_specificity_scores(
        scplus_obj,
        variable = 'Cellstate',
        auc_key = 'eRegulon_AUC_filtered',
        signature_keys = ['Region_based'],
        selected_regulons = positive_region_eRegulons,
        out_key_suffix = '_filtered')

In [None]:
# Plot the RSS stored under 'Cellstate_filtered' (variable 'Cellstate' plus the
# '_filtered' suffix used when the scores were computed).
plot_rss(
    scplus_obj,
    'Cellstate_filtered',
    num_columns=2,
    top_n=5,
    figsize = (20, 20),
)

In [None]:
# Show the gene-based signature names that passed the rho selection.
scplus_obj.uns['selected_eRegulon']['Gene_based']

In [None]:
from scenicplus.plotting.dotplot import generate_dotplot_df

# Dot size: target region enrichment (region-based AUC of the filtered eRegulons).
region_enrichment = scplus_obj.uns['eRegulon_AUC_filtered']['Region_based']
# Dot colour: TF expression taken from the 'EXP' layer of the SCENIC+ object.
tf_expression = scplus_obj.to_df('EXP')

dotplot_df = generate_dotplot_df(
    scplus_obj = scplus_obj,
    size_matrix = region_enrichment,
    color_matrix = tf_expression,
    scale_size_matrix = True,
    scale_color_matrix = True,
    group_variable = 'Cellstate',
    subset_eRegulons = scplus_obj.uns['selected_eRegulon']['Gene_based'],
)

In [None]:
# Human-readable column names; they double as the legend titles in the dot plot.
size_val = 'Target region enrichment\n(Scaled)'
color_val = 'TF expression\n(Scaled)'
# Reassign instead of renaming in place: the step stays explicit and can be
# re-run safely (rename ignores labels that are no longer present).
dotplot_df = dotplot_df.rename(columns={
    'size_val': size_val,
    'color_val': color_val,
})

In [None]:

# Every PyComplexHeatmap name used below is imported explicitly so the cell
# does not depend on what an earlier star import happened to expose.
from PyComplexHeatmap import DotClustermapPlotter, HeatmapAnnotation, anno_label, anno_simple

plt.figure(figsize=(3, 3),dpi=150)

# One annotation row per distinct value of dotplot_df['index'] (the group label).
df_col = dotplot_df[['index']].drop_duplicates().set_index('index')
df_col['Cellstate'] = df_col.index

# Restrict the palette to the states that are actually plotted. Membership is
# tested against a set of values: `key in Series` checks the index labels, which
# only matched the values here because the index was copied into the column.
plotted_states = set(df_col['Cellstate'])
cellstate_colors = {k: v for k, v in COLOR_PAlETTE['Cellstate'].items() if k in plotted_states}

col_ha = HeatmapAnnotation(
                           label=anno_label(df_col['Cellstate'],
                                            colors=dict(cellstate_colors),
                                            merge=True,rotation=45),
                           Cellstate=anno_simple(df_col['Cellstate'],
                                                 colors=dict(cellstate_colors),
                                                 legend=False),
                           verbose=0,label_side='left')

# Dot colour and clustering value: `color_val`; dot size: `size_val`.
cm = DotClustermapPlotter(data=dotplot_df, x='index',y='eRegulon_name',value=color_val,c=color_val,s=size_val,
                          row_cluster=True,col_cluster=True,spines=True,
                          cmap='RdYlGn_r',ratio=200,dot_legend_kws={'frameon':False},
                          top_annotation=col_ha,show_rownames=True,row_names_side='left')

## Coverage plot

In [None]:
tumor_MP_gmt_path = f'{RESULT_TABLE}/MPs/Tumor/MP_Programs.gmt'
tumor_MP_anno_path = f'{RESULT_TABLE}/MPs/Tumor/MetaProgram_Annotation.csv'

# GMT file: first column is used as the row index and column 1 is discarded;
# after transposing, each column holds the entries of one program.
gmt_table = pd.read_table(tumor_MP_gmt_path, header=None, index_col=0)
ref_ITH = gmt_table.drop(columns=[1]).T

# Rename the program columns through the MetaProgram -> MPs annotation table.
ref_anno = pd.read_csv(tumor_MP_anno_path)
mp_pairs = ref_anno[['MetaProgram','MPs']].drop_duplicates()
MP_anno_map = mp_pairs.set_index('MetaProgram')['MPs'].to_dict()
ref_ITH.columns = ref_ITH.columns.map(MP_anno_map)

In [None]:
# Re-check which entries scplus_obj.uns holds after the preceding steps.
scplus_obj.uns.keys()

In [None]:
from scenicplus.plotting.coverageplot import coverage_plot
import pyranges as pr

In [None]:
# Gene annotation (GENCODE v32 GTF) and the consensus peak set, both loaded
# with pyranges for use in the coverage plots.
gtf_path = f"{EXTERNAL_DATA}/gencode.v32.annotation.gtf.gz"
consensus_bed_path = f'{RESULT_TABLE}/Consensus_Peaks/consensus_regions.bed'
pr_gtf = pr.read_gtf(gtf_path)
pr_consensus_bed = pr.read_bed(consensus_bed_path)

In [None]:
 scplus_obj.uns.keys()  # inspect uns entries again; NOTE(review): the stray leading space on this line is an IndentationError outside IPython

In [None]:
from scenicplus.utils import get_interaction_pr

# Build the interaction pyranges passed as `pr_interact` to the coverage plots
# (human, hg38), returned rather than stored because inplace=False.
# NOTE(review): eRegulons_key is 'eRegulons', not a '_filtered' variant — confirm
# this is intended.
pr_interact = get_interaction_pr(
    scplus_obj,
    'hsapiens',
    'hg38',
    inplace = False,
    subset_for_eRegulons_regions = True,
    eRegulons_key = 'eRegulons',
)

In [None]:
# File-system friendly cell-state labels: each of ( ) * + , - . / and space
# becomes '_'. This is the same character set the original class
# '[.+()-/\ ]' matched, where ')-/' acted as a range.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the labels
# unchanged. The raw string also avoids the invalid '\ ' escape sequence.
scplus_obj.metadata_cell['Cellstate_format'] = (
    scplus_obj.metadata_cell['Cellstate'].str.replace(r'[()*+,\-./ ]', '_', regex=True)
)

# Original label -> formatted label, used to re-key the colour palette.
cellstate_maps = (
    scplus_obj.metadata_cell[['Cellstate','Cellstate_format']]
    .drop_duplicates()
    .set_index('Cellstate')['Cellstate_format']
    .to_dict()
)

# Formatted label -> bigwig path, keeping only files whose base name (without
# '.bw') is one of the formatted cell-state labels.
formatted_states = set(scplus_obj.metadata_cell['Cellstate_format'].unique())
bw_dict = {}
for bw_path in glob.glob(f'{RESULT_TABLE}/Consensus_Peaks/consensus_peak_calling/pseudobulk_bw_files/*.bw'):
    bw_name = os.path.basename(bw_path).replace('.bw','')
    if bw_name in formatted_states:
        bw_dict[bw_name] = bw_path

In [None]:
# Filtered eRegulon metadata restricted to genes listed in the 'EMT-III' column
# of ref_ITH, then to the TCF7L2_+_(325r) region signature.
emt3_genes = ref_ITH['EMT-III'].tolist()
tf_df = scplus_obj.uns['eRegulon_metadata_filtered']
tf_df = tf_df[tf_df['Gene'].isin(emt3_genes)]
show_tf = tf_df[tf_df['Region_signature_name'] == 'TCF7L2_+_(325r)']
# Display with the highest R2G_importance first.
show_tf.sort_values('R2G_importance', ascending=False)

### EMT-III genes related regulons

In [None]:
# Palette re-keyed by the formatted cell-state labels (the bw_dict keys).
color_dict = {
    cellstate_maps[state]: colour
    for state, colour in COLOR_PAlETTE['Cellstate'].items()
    if state in cellstate_maps
}
# Region chr1:214374838-214604382; violin panels requested for TCF7L2 and PTPN14.
fig = coverage_plot(
    scplus_obj,
    bw_dict=bw_dict,
    region='chr1:214374838-214604382',
    pr_gtf=pr_gtf,
    pr_interact = pr_interact,
    pr_consensus_bed=pr_consensus_bed,
    genes_violin_plot=['TCF7L2','PTPN14'],
    meta_data_key='Cellstate_format',
    plot_order=['Tumor_EMT_III','Tumor_Cell_Cycle','Tumor_ER_I','Tumor_EMT_II','Tumor_ER_II'],
    color_dict=color_dict,
    figsize = (12,12),
    region_bed_height=0.1,
    gene_label_offset=30,
    fontsize_dict={
        'bigwig_label': 15,
        'title': 15,
        'bigwig_tick_label': 5,
        'gene_label': 9,
        'violinplots_xlabel': 9,
        'violinplots_ylabel': 9,
    },
    width_ratios_dict={'bigwig': 4, 'violinplots': 1},
    height_ratios_dict = {'bigwig_violin': 10, 'genes': 0.5, 'arcs': 20},
)

In [None]:
# Palette re-keyed by the formatted cell-state labels (the bw_dict keys).
color_dict = {
    cellstate_maps[state]: colour
    for state, colour in COLOR_PAlETTE['Cellstate'].items()
    if state in cellstate_maps
}
# Region chr6:56670820-56673320; violin panels requested for TCF7L2 and DST.
fig = coverage_plot(
    scplus_obj,
    bw_dict=bw_dict,
    region='chr6:56670820-56673320',
    pr_gtf=pr_gtf,
    pr_interact = pr_interact,
    pr_consensus_bed=pr_consensus_bed,
    genes_violin_plot=['TCF7L2','DST'],
    meta_data_key='Cellstate_format',
    plot_order=['Tumor_EMT_III','Tumor_Cell_Cycle','Tumor_ER_I','Tumor_EMT_II','Tumor_ER_II'],
    color_dict=color_dict,
    figsize = (12,12),
    region_bed_height=0.1,
    gene_label_offset=30,
    fontsize_dict={
        'bigwig_label': 15,
        'title': 15,
        'bigwig_tick_label': 5,
        'gene_label': 9,
        'violinplots_xlabel': 9,
        'violinplots_ylabel': 9,
    },
    width_ratios_dict={'bigwig': 4, 'violinplots': 1},
    height_ratios_dict = {'bigwig_violin': 10, 'genes': 0.5, 'arcs': 20},
)

### Unique regulons in EMT-III

In [None]:
# Palette re-keyed by the formatted cell-state labels (the bw_dict keys).
color_dict = {
    cellstate_maps[state]: colour
    for state, colour in COLOR_PAlETTE['Cellstate'].items()
    if state in cellstate_maps
}
# Region chr17:42312698-42333198; violin panels requested for TCF7L2 and STAT3.
fig = coverage_plot(
    scplus_obj,
    bw_dict=bw_dict,
    region='chr17:42312698-42333198',
    pr_gtf=pr_gtf,
    pr_interact = pr_interact,
    pr_consensus_bed=pr_consensus_bed,
    genes_violin_plot=['TCF7L2','STAT3'],
    meta_data_key='Cellstate_format',
    plot_order=['Tumor_EMT_III','Tumor_Cell_Cycle','Tumor_ER_I','Tumor_EMT_II','Tumor_ER_II'],
    color_dict=color_dict,
    figsize = (12,12),
    region_bed_height=0.1,
    gene_label_offset=30,
    width_ratios_dict={'bigwig': 4, 'violinplots': 1},
    height_ratios_dict = {'bigwig_violin': 10, 'genes': 0.5, 'arcs': 5},
)

In [None]:
# Palette re-keyed by the formatted cell-state labels (the bw_dict keys).
color_dict = {
    cellstate_maps[state]: colour
    for state, colour in COLOR_PAlETTE['Cellstate'].items()
    if state in cellstate_maps
}
# Region chrX:6218698-6239198; violin panels requested for TCF7L2 and NLGN4X.
fig = coverage_plot(
    scplus_obj,
    bw_dict=bw_dict,
    region='chrX:6218698-6239198',
    genes_violin_plot=['TCF7L2','NLGN4X'],
    meta_data_key='Cellstate_format',
    pr_gtf=pr_gtf,
    pr_interact = pr_interact,
    pr_consensus_bed=pr_consensus_bed,
    plot_order=['Tumor_EMT_III','Tumor_Cell_Cycle','Tumor_ER_I','Tumor_EMT_II','Tumor_ER_II'],
    color_dict=color_dict,
    figsize = (12,12),
    region_bed_height=0.1,
    gene_label_offset=30,
    width_ratios_dict={'bigwig': 4, 'violinplots': 1},
    height_ratios_dict = {'bigwig_violin': 10, 'genes': 0.5, 'arcs': 5},
)

#### NOTCH1

In [None]:
# Rows of the currently selected regulon (`show_tf`) whose target gene is NOTCH1,
# ordered by region.
notch1_mask = show_tf['Gene'].isin(['NOTCH1'])
show_tf.loc[notch1_mask].sort_values('Region')

In [None]:
# Palette re-keyed by the formatted cell-state labels (the bw_dict keys).
color_dict = {
    cellstate_maps[state]: colour
    for state, colour in COLOR_PAlETTE['Cellstate'].items()
    if state in cellstate_maps
}
# Region chr9:136484572-136565370; violin panels requested for TCF7L2 and NOTCH1.
fig = coverage_plot(
    scplus_obj,
    bw_dict=bw_dict,
    region='chr9:136484572-136565370',
    pr_gtf=pr_gtf,
    pr_interact = pr_interact,
    pr_consensus_bed=pr_consensus_bed,
    genes_violin_plot=['TCF7L2','NOTCH1'],
    meta_data_key='Cellstate_format',
    plot_order=['Tumor_EMT_III','Tumor_Cell_Cycle','Tumor_ER_I','Tumor_EMT_II','Tumor_ER_II'],
    color_dict=color_dict,
    figsize = (12,12),
    region_bed_height=0.1,
    gene_label_offset=30,
    width_ratios_dict={'bigwig': 4, 'violinplots': 1},
    height_ratios_dict = {'bigwig_violin': 10, 'genes': 0.5, 'arcs': 10},
)

### ER-I

#### ESR1_+_(307r)

In [None]:
# Filtered eRegulon metadata restricted to genes listed in the 'ER-I' column
# of ref_ITH, then to the ESR1_+_(307r) region signature.
er1_genes = ref_ITH['ER-I'].tolist()
tf_df = scplus_obj.uns['eRegulon_metadata_filtered']
tf_df = tf_df[tf_df['Gene'].isin(er1_genes)]
show_tf = tf_df[tf_df['Region_signature_name'] == 'ESR1_+_(307r)']
# Display with the highest R2G_importance first.
show_tf.sort_values('R2G_importance', ascending=False)

In [None]:
# Palette re-keyed by the formatted cell-state labels (the bw_dict keys).
color_dict = {
    cellstate_maps[state]: colour
    for state, colour in COLOR_PAlETTE['Cellstate'].items()
    if state in cellstate_maps
}
# Region chr7:41960949-42264100; violin panels requested for ESR1 and GLI3.
# Tumor_ER_I is listed first in plot_order for this section.
fig = coverage_plot(
    scplus_obj,
    bw_dict=bw_dict,
    region='chr7:41960949-42264100',
    pr_gtf=pr_gtf,
    pr_consensus_bed=pr_consensus_bed,
    pr_interact = pr_interact,
    genes_violin_plot=['ESR1','GLI3'],
    meta_data_key='Cellstate_format',
    plot_order=['Tumor_ER_I','Tumor_EMT_III','Tumor_Cell_Cycle','Tumor_EMT_II','Tumor_ER_II'],
    color_dict=color_dict,
    figsize = (12,12),
    region_bed_height=0.1,
    gene_label_offset=30,
    fontsize_dict={
        'bigwig_label': 15,
        'title': 15,
        'bigwig_tick_label': 5,
        'gene_label': 9,
        'violinplots_xlabel': 9,
        'violinplots_ylabel': 9,
    },
    width_ratios_dict={'bigwig': 4, 'violinplots': 1},
    height_ratios_dict = {'bigwig_violin': 10, 'genes': 0.5, 'arcs': 10},
)

In [None]:
# Palette re-keyed by the formatted cell-state labels (the bw_dict keys).
color_dict = {
    cellstate_maps[state]: colour
    for state, colour in COLOR_PAlETTE['Cellstate'].items()
    if state in cellstate_maps
}
# Region chr14:37582844-37613344; violin panels requested for ESR1 and TTC6.
fig = coverage_plot(
    scplus_obj,
    bw_dict=bw_dict,
    region='chr14:37582844-37613344',
    pr_gtf=pr_gtf,
    pr_consensus_bed=pr_consensus_bed,
    pr_interact = pr_interact,
    genes_violin_plot=['ESR1','TTC6'],
    meta_data_key='Cellstate_format',
    plot_order=['Tumor_ER_I','Tumor_EMT_III','Tumor_Cell_Cycle','Tumor_EMT_II','Tumor_ER_II'],
    color_dict=color_dict,
    figsize = (12,12),
    region_bed_height=0.1,
    gene_label_offset=30,
    width_ratios_dict={'bigwig': 4, 'violinplots': 1},
    height_ratios_dict = {'bigwig_violin': 10, 'genes': 0.5, 'arcs': 10},
)

#### GATA3_+_(95r)

In [None]:
# Filtered eRegulon metadata restricted to genes listed in the 'ER-I' column
# of ref_ITH, then to the GATA3_+_(95r) region signature.
er1_genes = ref_ITH['ER-I'].tolist()
tf_df = scplus_obj.uns['eRegulon_metadata_filtered']
tf_df = tf_df[tf_df['Gene'].isin(er1_genes)]
show_tf = tf_df[tf_df['Region_signature_name'] == 'GATA3_+_(95r)']
# Display with the highest R2G_importance first.
show_tf.sort_values('R2G_importance', ascending=False)

In [None]:
# Palette re-keyed by the formatted cell-state labels (the bw_dict keys).
color_dict = {
    cellstate_maps[state]: colour
    for state, colour in COLOR_PAlETTE['Cellstate'].items()
    if state in cellstate_maps
}
# Region chr14:37582844-37613344 (same window as the ESR1/TTC6 plot);
# violin panels requested for GATA3, ESR1 and TTC6.
fig = coverage_plot(
    scplus_obj,
    bw_dict=bw_dict,
    region='chr14:37582844-37613344',
    pr_gtf=pr_gtf,
    pr_consensus_bed=pr_consensus_bed,
    pr_interact = pr_interact,
    genes_violin_plot=['GATA3','ESR1','TTC6'],
    meta_data_key='Cellstate_format',
    plot_order=['Tumor_ER_I','Tumor_EMT_III','Tumor_Cell_Cycle','Tumor_EMT_II','Tumor_ER_II'],
    color_dict=color_dict,
    figsize = (12,12),
    region_bed_height=0.1,
    gene_label_offset=30,
    fontsize_dict={
        'bigwig_label': 15,
        'title': 15,
        'bigwig_tick_label': 5,
        'gene_label': 9,
        'violinplots_xlabel': 9,
        'violinplots_ylabel': 9,
    },
    width_ratios_dict={'bigwig': 4, 'violinplots': 1},
    height_ratios_dict = {'bigwig_violin': 10, 'genes': 0.5, 'arcs': 10},
)