# Evaluation of KLF6 targets 

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

In [None]:
# Visualization settings: use the 'retina' figure format for the inline
# backend and render matplotlib figures inside the notebook.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [None]:
# Import CellOracle and display its version so the run records which
# release the links objects below were loaded with.
import celloracle as co
co.__version__

In [None]:
from gprofiler import GProfiler

In [None]:
# g:Profiler client shared by every enrichment query in this notebook.
# return_dataframe=True: later cells filter and sort the results as DataFrames.
gp = GProfiler(return_dataframe=True)

## Trevino et al 2021

In [None]:
# CellOracle links object for the Trevino et al. 2021 section.
# NOTE(review): the file name spells "baggindridge" (the Polioudakis file uses
# "baggingridge") - confirm this matches the file on disk.
links_path = "/home/jovyan/jm_jlab/data_indNeuro/1.GRN/milestones_pcw16_baggindridge.celloracle.links"
links = co.load_hdf5(file_path=links_path)

In [None]:
# Figure size for the per-cluster score plot below.
sc.set_figure_params(figsize=(8,4))

# Plot KLF6's scores per cluster (`goi` presumably stands for "gene of interest").
links.plot_score_per_cluster(goi="KLF6")

In [None]:
# Targets of KLF6 (rows whose 'source' is KLF6) in the filtered links of the
# 'vRG' and 'oRG' entries.
vRG_links = links.filtered_links['vRG']
oRG_links = links.filtered_links['oRG']

klf6_vRG = vRG_links.loc[vRG_links['source'] == 'KLF6', 'target']
klf6_oRG = oRG_links.loc[oRG_links['source'] == 'KLF6', 'target']

In [None]:
# Fraction of the oRG KLF6 targets that are also KLF6 targets in vRG.
n_oRG_targets_in_vRG = klf6_oRG.isin(klf6_vRG).sum()
n_oRG_targets_in_vRG / len(klf6_oRG)

In [None]:
# g:Profiler query (organism 'hsapiens') for the KLF6 targets found in oRG.
oRG_query_genes = klf6_oRG.tolist()
enrich_oRG = gp.profile(
    query=oRG_query_genes,
    organism='hsapiens',
    no_evidences=False,
)

In [None]:
# g:Profiler query (organism 'hsapiens') for the KLF6 targets found in vRG.
vRG_query_genes = klf6_vRG.tolist()
enrich_vRG = gp.profile(
    query=vRG_query_genes,
    organism='hsapiens',
    no_evidences=False,
)

## Polioudakis et al 2019

In [None]:
# CellOracle links object for the Polioudakis et al. 2019 section.
links_pol_path = "/home/jovyan/jm_jlab/data_indNeuro/1.GRN/pol19_baggingridge.links.celloracle.links"
links_pol = co.load_hdf5(file_path=links_pol_path)

In [None]:
# Figure size for the per-cluster score plot below.
sc.set_figure_params(figsize=(8,4))

# Plot KLF6's scores per cluster (`goi` presumably stands for "gene of interest").
links_pol.plot_score_per_cluster(goi="KLF6")

In [None]:
# Targets of KLF6 (rows whose 'source' is KLF6) in the filtered links of the
# 'vRG' and 'oRG' entries of the Polioudakis links object.
vRG_links_pol = links_pol.filtered_links['vRG']
oRG_links_pol = links_pol.filtered_links['oRG']

klf6_vRG_pol = vRG_links_pol.loc[vRG_links_pol['source'] == 'KLF6', 'target']
klf6_oRG_pol = oRG_links_pol.loc[oRG_links_pol['source'] == 'KLF6', 'target']

In [None]:
# Fraction of the oRG KLF6 targets that are also KLF6 targets in vRG.
n_oRG_pol_targets_in_vRG = klf6_oRG_pol.isin(klf6_vRG_pol).sum()
n_oRG_pol_targets_in_vRG / len(klf6_oRG_pol)

In [None]:
# g:Profiler query (organism 'hsapiens') for the Polioudakis oRG KLF6 targets.
oRG_pol_query_genes = klf6_oRG_pol.tolist()
enrich_oRG_KLF6_pol19 = gp.profile(
    query=oRG_pol_query_genes,
    organism='hsapiens',
    no_evidences=False,
)

In [None]:
# g:Profiler query (organism 'hsapiens') for the Polioudakis vRG KLF6 targets.
vRG_pol_query_genes = klf6_vRG_pol.tolist()
enrich_vRG_KLF6_pol19 = gp.profile(
    query=vRG_pol_query_genes,
    organism='hsapiens',
    no_evidences=False,
)

### Shared targets between datasets

In [None]:
def _percent_shared(targets, reference):
    """Return the percentage of entries in `targets` that also occur in `reference`."""
    return targets.isin(reference).sum()/len(targets)*100


# One list per dataset. Within each list the comparisons run in this order:
# oRG vs Trevino21 oRG, oRG vs Polioudakis19 oRG,
# vRG vs Trevino21 vRG, vRG vs Polioudakis19 vRG.
d = {
    'Trevino21': [
        _percent_shared(klf6_oRG, klf6_oRG),
        _percent_shared(klf6_oRG, klf6_oRG_pol),
        _percent_shared(klf6_vRG, klf6_vRG),
        _percent_shared(klf6_vRG, klf6_vRG_pol),
    ],
    'Polioudakis19': [
        _percent_shared(klf6_oRG_pol, klf6_oRG),
        _percent_shared(klf6_oRG_pol, klf6_oRG_pol),
        _percent_shared(klf6_vRG_pol, klf6_vRG),
        _percent_shared(klf6_vRG_pol, klf6_vRG_pol),
    ],
}

In [None]:
# Row labels name the reference target set each percentage was computed
# against; columns (the keys of `d`) name the dataset whose targets were tested.
overlap_row_labels = [
    'Trevino21_oRG',
    'Polioudakis19_oRG',
    'Trevino21_vRG',
    'Polioudakis19_vRG',
]
overlap = pd.DataFrame(data=d, index=overlap_row_labels)

### **Polioudakis 2019 KLF6 regulon**

In [None]:
# Download supplementary file mmc8.xlsx of the Neuron article
# 10.1016/j.neuron.2019.06.011 into the working directory (requires network
# access and `wget`); it is read in the next cell and removed further below.
!wget https://www.cell.com/cms/10.1016/j.neuron.2019.06.011/attachment/d6337751-fed2-4e9c-ac8d-1a0d3ad24373/mmc8.xlsx -O mmc8.xlsx

In [None]:
# Read the "TF gene modules" sheet of the downloaded workbook; the 'KLF6'
# column of this table is used below.
regulon_workbook = "mmc8.xlsx"
regulons = pd.read_excel(regulon_workbook, sheet_name="TF gene modules")

In [None]:
# Keep only the KLF6 column (as a one-column DataFrame) and drop its empty rows.
klf6_regulon = regulons.loc[:, ['KLF6']].dropna()

In [None]:
# Plain Python list of the entries in the KLF6 column.
klf6_regulon_values = klf6_regulon['KLF6'].values
klf6_regulon_list = klf6_regulon_values.tolist()

In [None]:
# Include KLF6 itself in the gene list sent to g:Profiler.
# The membership check keeps this cell idempotent: re-running it (or a list
# that already contains KLF6) no longer adds a duplicate entry.
if "KLF6" not in klf6_regulon_list:
    klf6_regulon_list.append("KLF6")

In [None]:
# g:Profiler query (organism 'hsapiens') for the KLF6 regulon gene list.
enrich_regulon_pol19 = gp.profile(
    query=klf6_regulon_list,
    organism='hsapiens',
    no_evidences=False,
)

In [None]:
# Delete the downloaded workbook from the working directory; the table it
# contained was already read into `regulons`.
!rm mmc8.xlsx

### Saving results to xlsx file

In [None]:
# Sheet titles keyed by position (0-5).
sheet_names_klf6 = dict(enumerate([
    "oRG_Trevino21_KLF6",
    "vRG_Trevino21_KLF6",
    "oRG_Polioudakis19_KLF6",
    "vRG_Polioudakis19_KLF6",
    'KLF6targets_overlap',
    'Polioudakis19_KLF6regulon',
]))

In [None]:
# Display the position -> sheet-title mapping for a visual check.
sheet_names_klf6

In [None]:
# Output folder for the result workbook.
# NOTE(review): no visible cell uses `cbl_github` or `sheet_names_klf6`, and
# the same path is defined again as `output_directory` further down - the cell
# that wrote the xlsx file appears to be missing; confirm.
cbl_github = '/home/jovyan/jm_jlab/CBL_data/indirectNeurogenesis/GRN_CellOracle/'

### GO plot

In [None]:
# Mirrored bar plot of the ten most significant non-'TF' enrichment terms for
# the KLF6 targets in oRG and vRG.
# oRG bars use -log10(p) (pointing right) and vRG bars use log10(p) (pointing
# left). `.assign` builds new frames instead of writing into slices of the
# enrichment tables, which avoids SettingWithCopyWarning.
top10_vRG = (
    enrich_vRG.loc[enrich_vRG['source'] != 'TF']
    .sort_values('p_value')
    .head(10)
    .assign(p_value=lambda terms: np.log10(terms['p_value']), color='vRG')
    .loc[:, ['name', 'p_value', 'color']]
)
top10_oRG = (
    enrich_oRG.loc[enrich_oRG['source'] != 'TF']
    .sort_values('p_value')
    .head(10)
    .assign(p_value=lambda terms: -np.log10(terms['p_value']), color='oRG')
    .loc[:, ['name', 'p_value', 'color']]
)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
mydf = pd.concat([top10_oRG, top10_vRG], ignore_index=True)

# Shorten the long cholesterol term for plotting. The row is selected by its
# name rather than by assuming it sits at position 0, and `.loc` replaces the
# chained assignment `mydf['name'][0] = ...`.
is_long_cholesterol_term = mydf['name'].str.startswith('Cholesterol metabolism with Bloch', na=False)
mydf.loc[is_long_cholesterol_term, 'name'] = 'Cholesterol metabolism'

# Explicit figure instead of mutating the global rcParams figure size.
fig, ax = plt.subplots(figsize=(10, 6))

g = sns.barplot(
    data=mydf,
    x='p_value',
    y='name',
    hue='color',
    palette={"oRG": "#279E68", "vRG": '#D62728'},
    ax=ax,
)
g.set(xlabel="-log10 p-value")
g.set(ylabel=None)  # remove the axis label
# Label every tick with its absolute value so both sides read as -log10(p).
# This replaces a hard-coded label list that was not tied to the real ticks.
g.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f"{abs(value):g}"))
sns.despine()

plt.legend(title="", loc=4)
plt.yticks(fontsize=15, rotation=0)
plt.show()

In [None]:
# Same mirrored bar plot of the ten most significant non-'TF' enrichment terms
# for the KLF6 targets in oRG and vRG, drawn on a 6x6 figure.
# oRG bars use -log10(p) (pointing right) and vRG bars use log10(p) (pointing
# left). `.assign` builds new frames instead of writing into slices of the
# enrichment tables, which avoids SettingWithCopyWarning.
top10_vRG = (
    enrich_vRG.loc[enrich_vRG['source'] != 'TF']
    .sort_values('p_value')
    .head(10)
    .assign(p_value=lambda terms: np.log10(terms['p_value']), color='vRG')
    .loc[:, ['name', 'p_value', 'color']]
)
top10_oRG = (
    enrich_oRG.loc[enrich_oRG['source'] != 'TF']
    .sort_values('p_value')
    .head(10)
    .assign(p_value=lambda terms: -np.log10(terms['p_value']), color='oRG')
    .loc[:, ['name', 'p_value', 'color']]
)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
mydf = pd.concat([top10_oRG, top10_vRG], ignore_index=True)

# Shorten the long cholesterol term for plotting. The row is selected by its
# name rather than by assuming it sits at position 0, and `.loc` replaces the
# chained assignment `mydf['name'][0] = ...`.
is_long_cholesterol_term = mydf['name'].str.startswith('Cholesterol metabolism with Bloch', na=False)
mydf.loc[is_long_cholesterol_term, 'name'] = 'Cholesterol metabolism'

# Explicit figure instead of mutating the global rcParams figure size.
fig, ax = plt.subplots(figsize=(6, 6))

g = sns.barplot(
    data=mydf,
    x='p_value',
    y='name',
    hue='color',
    palette={"oRG": "#279E68", "vRG": '#D62728'},
    ax=ax,
)
g.set(xlabel="-log10 p-value")
g.set(ylabel=None)  # remove the axis label
# Label every tick with its absolute value so both sides read as -log10(p).
# This replaces a hard-coded label list that was not tied to the real ticks.
g.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f"{abs(value):g}"))
sns.despine()

plt.legend(title="", loc=4)
plt.yticks(fontsize=15, rotation=0)
plt.show()

In [None]:
# Mirrored bar plot of the ten most significant non-'TF' enrichment terms for
# the Polioudakis KLF6 targets in oRG and vRG.
# oRG bars use -log10(p) (pointing right) and vRG bars use log10(p) (pointing
# left). `.assign` builds new frames instead of writing into slices of the
# enrichment tables, which avoids SettingWithCopyWarning.
top10_vRG_pol = (
    enrich_vRG_KLF6_pol19.loc[enrich_vRG_KLF6_pol19['source'] != 'TF']
    .sort_values('p_value')
    .head(10)
    .assign(p_value=lambda terms: np.log10(terms['p_value']), color='vRG')
    .loc[:, ['name', 'p_value', 'color']]
)
top10_oRG_pol = (
    enrich_oRG_KLF6_pol19.loc[enrich_oRG_KLF6_pol19['source'] != 'TF']
    .sort_values('p_value')
    .head(10)
    .assign(p_value=lambda terms: -np.log10(terms['p_value']), color='oRG')
    .loc[:, ['name', 'p_value', 'color']]
)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
mydf = pd.concat([top10_oRG_pol, top10_vRG_pol], ignore_index=True)

# Explicit figure instead of mutating the global rcParams figure size.
fig, ax = plt.subplots(figsize=(10, 6))

g = sns.barplot(
    data=mydf,
    x='p_value',
    y='name',
    hue='color',
    palette={"oRG": "#279E68", "vRG": '#D62728'},
    ax=ax,
)
g.set(xlabel="-log10 p-value")
g.set(ylabel=None)  # remove the axis label
# Label every tick with its absolute value so both sides read as -log10(p).
# This replaces a hard-coded label list that was copied from the Trevino plot
# and was not tied to the tick positions of this data.
g.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f"{abs(value):g}"))
sns.despine()

plt.legend(title="", loc=4)
plt.yticks(fontsize=15, rotation=0)
plt.show()

# Evaluation of KLF6 targets in NMF cholesterol module

In [None]:
# Tab-separated piNMF table; its Module_1 ... Module_4 columns are intersected
# with the oRG KLF6 targets below.
piNMF_oRG_path = "/home/jovyan/jm_jlab/data_indNeuro/3.NMF_data/4k_oRG_pcw16/4k_oRG_pcw16modules_to_oRG_topgenes_pcw16_4K_piNMF.tsv"
piNMF_oRG = pd.read_csv(piNMF_oRG_path, sep='\t')

In [None]:
# Folder that receives the Excel workbook written in the next cell.
output_directory = '/home/jovyan/jm_jlab/CBL_data/indirectNeurogenesis/GRN_CellOracle/'

In [None]:
# For each piNMF module, run g:Profiler on the oRG KLF6 targets that appear in
# that module's column and store the result in its own sheet of one workbook.
workbook_path = os.path.join(output_directory, "KLF6targets_in_piNMF-Modules_GOenrich.xlsx")

# The context manager saves and closes the workbook even when a g:Profiler
# request or a sheet write raises part-way through the loop.
with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
    for module in ['Module_1', 'Module_2', 'Module_3', 'Module_4']:
        module_targets = klf6_oRG[klf6_oRG.isin(piNMF_oRG[module])].tolist()
        module_enrichment = gp.profile(
            organism='hsapiens',
            query=module_targets,
            no_evidences=False,
        )
        print(module)  # progress marker: this module's query has returned

        module_enrichment.to_excel(writer, sheet_name=f"KLF6 targets in piNMF {module}")