# GRN - over-representation pcw16

In [None]:
!cd /home/jovyan/jm_jlab/

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

import celloracle as co
co.__version__

# visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

#plt.rcParams['figure.figsize'] = (15,7)
#plt.rcParams["savefig.dpi"] = 600

# Output directory for the GRN results; created if it does not exist yet.
save_folder = "/home/jovyan/jm_jlab/data_indNeuro/1.GRN/"
os.makedirs(save_folder, exist_ok=True)

from pathlib import Path

# Configure the scanpy cache location through the public settings object.
# Assigning to the ScanpyConfig class attribute (as was done before) overwrites
# the attribute on the class itself instead of going through the settings API.
sc.settings.cachedir = Path('/home/jovyan/jm_jlab/celloracle_data/cache')

In [None]:
from scipy.stats import hypergeom

## **PEAKS**

In [None]:
import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm

In [None]:
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object

In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams["savefig.dpi"] = 600

In [None]:
# Base GRN table (HOCOMOCO v11 motifs on the consensus ATAC regions, per the file name).
base_grn_path = "/home/jovyan/jm_jlab/data_indNeuro/consensus_atlas_ATACregions_hg38/base_GRN_dataframe_HOCOMOCOv11.parquet"
df = pd.read_parquet(base_grn_path)

## **GRN LINKS**

In [None]:
# CellOracle Links object for the pcw16 milestones (reference run).
links_path = "/home/jovyan/jm_jlab/data_indNeuro/1.GRN/milestones_pcw16_baggindridge.celloracle.links"
links = co.load_hdf5(file_path=links_path)

In [None]:
# CellOracle Links object for the same milestones from the alternative Bayesian-ridge run.
links_bayesian_ridge_path = "/home/jovyan/jm_jlab/data_indNeuro/1.GRN/alt_milestones_pcw16_bayesianridge.celloracle.links"
links_bayesian_ridge = co.load_hdf5(file_path=links_bayesian_ridge_path)

## **ENRICHMENT**

### Comparison to CO Bayesian ridge-based results

### Comparison to Polioudakis 2019 regulons

In [None]:
# Regulon table for the Polioudakis 2019 comparison.
regulons = pd.read_excel("/home/jovyan/jm_jlab/ST7_regulons_pol19.xlsx")

# Gene universe for the enrichment test: every distinct value found in the
# table once the 'TF' column is removed, flattened into a single Series.
universe_regulons = (
    regulons
    .drop('TF', axis=1)
    .stack()
    .reset_index(drop=True)
    .drop_duplicates()
)

All targets

In [None]:
# One "source-target" string per filtered link, for each milestone.
vrg_edges = links.filtered_links['vRG']
ipc_edges = links.filtered_links['IPC']
org_edges = links.filtered_links['oRG']

A_1 = (vrg_edges['source'] + "-" + vrg_edges['target']).to_list()
A_2 = (ipc_edges['source'] + "-" + ipc_edges['target']).to_list()
A_3 = (org_edges['source'] + "-" + org_edges['target']).to_list()

In [None]:
# Single-column tables of the "source-target" strings; the column name doubles
# as the Excel sheet name when these are written out.
vRG_TFtargets = pd.DataFrame({'TF_targets_vRG': A_1})
IPC_TFtargets = pd.DataFrame({'TF_targets_IPC': A_2})
oRG_TFtargets = pd.DataFrame({'TF_targets_oRG': A_3})

**Saving to Excel**

In [None]:
# Build the supplementary workbook. Sheets are written in this order:
#   CO_<milestone>     per-TF overlap between the reference links and the Bayesian-ridge links
#   Pol19_<milestone>  per-TF overlap between the reference links and the Polioudakis 2019 regulons
#   TF_targets_*       the raw "source-target" lists
supp_table_path = "/home/jovyan/jm_jlab/CBL_data/indirectNeurogenesis/GRN_CellOracle/GRN_CellOracle_Trevino21/pcw16/SupplementaryTable_pcw16.xlsx"

# Population size for the CellOracle-vs-CellOracle hypergeometric test.
# NOTE(review): 4000 was hard-coded in the original analysis; presumably the
# number of genes used for GRN inference - confirm.
co_universe_size = 4000

milestones = ['vRG', 'IPC', 'oRG']

# The context manager closes (and therefore saves) the workbook even if one of
# the steps below raises.
with pd.ExcelWriter(supp_table_path, engine='xlsxwriter') as writer:

    # --- Reference links vs. Bayesian-ridge links --------------------------
    for j in milestones:
        ref_links = pd.DataFrame(links.filtered_links[j])
        alt_links = pd.DataFrame(links_bayesian_ridge.filtered_links[j])

        # Computed once and sorted: a set has no guaranteed iteration order,
        # so it must not be used both as the loop driver and as the row index
        # (recent pandas versions also refuse a set as an index).
        shared_tfs = sorted(set(ref_links['source']).intersection(alt_links['source']))

        rows = []
        for tf in shared_tfs:
            # Unique targets are used for every count so that all columns and
            # the test are based on the same quantities.
            ref_targets = set(ref_links.loc[ref_links['source'] == tf, 'target'])
            alt_targets = set(alt_links.loc[alt_links['source'] == tf, 'target'])
            n_shared = len(ref_targets & alt_targets)

            rows.append({
                'N_Targets': len(ref_targets),
                'Intersection': n_shared,
                'N_Targets_in_bayesianridge': len(alt_targets),
                '%_reference': (n_shared * 100) / len(ref_targets),
                # P(X >= n_shared) under the hypergeometric null.
                'pval': hypergeom.sf(n_shared - 1,
                                     co_universe_size,
                                     len(ref_targets), len(alt_targets)),
            })

        pd.DataFrame(
            rows,
            index=shared_tfs,
            columns=['N_Targets', 'Intersection', 'N_Targets_in_bayesianridge',
                     '%_reference', 'pval'],
        ).to_excel(writer, sheet_name="CO_" + j)

    # --- Reference links vs. Polioudakis 2019 regulons ---------------------
    for j in milestones:
        ref_links = pd.DataFrame(links.filtered_links[j])
        regulon_tfs = sorted(set(ref_links['source']).intersection(regulons.columns))

        rows = []
        for tf in regulon_tfs:
            ref_targets = set(ref_links.loc[ref_links['source'] == tf, 'target'])
            # Exact column lookup. `columns.str.contains(tf)` is a regex
            # substring match and can also select other TFs' columns
            # (e.g. SOX21 when looking for SOX2), after which dropna() would
            # discard rows that are only missing in the unrelated column.
            regulon_genes = set(regulons[tf].dropna())
            n_shared = len(ref_targets & regulon_genes)

            rows.append({
                'N_Targets': len(ref_targets),
                'Intersection_w_Pol19_regulon': n_shared,
                'N_Targets_in_Pol19_regulon': len(regulon_genes),
                '%_reference': (n_shared * 100) / len(ref_targets),
                'pval': hypergeom.sf(n_shared - 1,
                                     len(universe_regulons),
                                     len(ref_targets), len(regulon_genes)),
            })

        pd.DataFrame(
            rows,
            index=regulon_tfs,
            columns=['N_Targets', 'Intersection_w_Pol19_regulon',
                     'N_Targets_in_Pol19_regulon', '%_reference', 'pval'],
        ).to_excel(writer, sheet_name="Pol19_" + j)

    # --- Raw TF-target lists; the single column name is the sheet name -----
    for tf_targets in [vRG_TFtargets, oRG_TFtargets, IPC_TFtargets]:
        tf_targets.to_excel(writer, sheet_name=tf_targets.columns[0], index=False)

**PLOT**

In [None]:
# Re-open the supplementary workbook written above so its sheets can be
# listed and loaded for plotting.
filename = "/home/jovyan/jm_jlab/CBL_data/indirectNeurogenesis/GRN_CellOracle/GRN_CellOracle_Trevino21/pcw16/SupplementaryTable_pcw16.xlsx"
ST = pd.ExcelFile(filename)

In [None]:
# Sheet names in workbook order; the cells below select sheets by position.
ST.sheet_names

In [None]:
# Load the first six sheets of the workbook by position, one DataFrame each.
df0, df1, df2, df3, df4, df5 = [
    pd.read_excel(filename, sheet_name=ST.sheet_names[position])
    for position in range(6)
]

In [None]:
# Plot label for each of the six sheets (df0..df5), in the same order.
names = [['RegressionModels_vRG'],['RegressionModels_IPC'],['RegressionModels_oRG'],
['Datasets_vRG'],['Datasets_IPC'],['Datasets_oRG']]

# Long-format table: one row per TF, holding its comparison label and its
# '%_reference' value from the corresponding sheet.
dfs = []
for label, sheet in zip(names, [df0, df1, df2, df3, df4, df5]):
    dfs.append(pd.DataFrame({
        'Comparison': label * sheet.shape[0],
        '%_Overlap': sheet['%_reference'],
    }))

# Concatenate once, after all pieces are collected, instead of rebuilding the
# combined frame on every iteration.
all_dfs = pd.concat(dfs)

In [None]:
# Mean overlap percentage for each comparison group.
mean_overlap = all_dfs.groupby('Comparison', as_index=False)['%_Overlap'].mean()
mean_overlap

In [None]:
# White-grid theme with a 12x8 inch default figure size.
sns.set(style='whitegrid', rc={'figure.figsize': (12, 8)})

# Individual TFs as points ...
sns.stripplot(x="Comparison",
              y="%_Overlap", hue='Comparison', data=all_dfs)

# ... with a narrow, semi-transparent box plot drawn on the same axes.
g = sns.boxplot(x="Comparison",
                y="%_Overlap", hue='Comparison',
                data=all_dfs,
                dodge=False,
                width=.1,
                boxprops=dict(alpha=.3),
                showfliers=False)  # outliers are already visible as strip-plot points

g.set(xlabel="Pairwise comparisons")
g.set(ylabel="% Overlap")

plt.xticks(fontsize=12, rotation=90)

# The legend only repeats the x-axis labels. Guard the removal because the
# axes may carry no legend at all (this appears to depend on the seaborn
# version), in which case `legend_` is None.
if g.legend_ is not None:
    g.legend_.remove()

In [None]:
# Report TFs (columns of the base GRN table) with at least 30 links in the oRG network.
# Sources are read directly from the filtered links and matched exactly:
# splitting the "source-target" strings on "-" truncates hyphenated TF names
# (e.g. NKX2-1), and str.contains() is a regex substring match, so one TF's
# count could include another TF's links (e.g. SOX2 vs SOX21).
oRG_source_counts = links.filtered_links['oRG']['source'].value_counts()

for i in df.columns[df.columns.isin(oRG_source_counts.index)]:
    if oRG_source_counts[i] >= 30:
        print("Number of targets found for "+i+" in oRG: "+str(oRG_source_counts[i]))

In [None]:
# Report TFs (columns of the base GRN table) with at least 30 links in the vRG network.
# Sources are read directly from the filtered links and matched exactly:
# splitting the "source-target" strings on "-" truncates hyphenated TF names
# (e.g. NKX2-1), and str.contains() is a regex substring match, so one TF's
# count could include another TF's links (e.g. SOX2 vs SOX21).
vRG_source_counts = links.filtered_links['vRG']['source'].value_counts()

for i in df.columns[df.columns.isin(vRG_source_counts.index)]:
    if vRG_source_counts[i] >= 30:
        print("Number of targets found for "+i+" in vRG: "+str(vRG_source_counts[i]))

In [None]:
# Report TFs (columns of the base GRN table) with at least 30 links in the IPC network.
# Sources are read directly from the filtered links and matched exactly:
# splitting the "source-target" strings on "-" truncates hyphenated TF names
# (e.g. NKX2-1), and str.contains() is a regex substring match, so one TF's
# count could include another TF's links (e.g. SOX2 vs SOX21).
IPC_source_counts = links.filtered_links['IPC']['source'].value_counts()

for i in df.columns[df.columns.isin(IPC_source_counts.index)]:
    if IPC_source_counts[i] >= 30:
        print("Number of targets found for "+i+" in IPC: "+str(IPC_source_counts[i]))

## **IDENTIFYING UNIQUE TF-target genes in pairwise comparisons**

**vRG vs IPC**

**vRG vs oRG**

**oRG vs IPC**