In [2]:
import json
import pathlib

import pandas as pd
from papermill import execute_notebook

In [3]:
# Ensemble label; it is embedded in the output directory name below.
name = 'InhMGECGE'

# Major types whose subtypes are selected further down in this notebook.
major_types = [
    'MGE-Pvalb',
    'CGE-Lamp5',
    'CGE-Vip',
    'MGE-Sst',
    'Unc5c',
]

# Executed notebooks are written into this directory, and it is also passed
# to papermill as their working directory.
output_dir = (
    '/home/hanliu/project/mouse_rostral_brain/study/ClustersEnsemble/'
    f'{name}SubtypeEnsemble'
)
# Only the leaf directory is created here (no parents=True), so the parent
# directory has to exist already.
pathlib.Path(output_dir).mkdir(exist_ok=True)

## Cell Meta

In [4]:
# Load the total clustering results table.
# NOTE(review): pd.read_msgpack is deprecated (see the warning printed in this
# cell's output) and is no longer available from pandas 1.0 on, so this cell
# needs an older pandas until the file is re-saved in another format.
total_clustering_path = (
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/'
    'Summary/TotalClusteringResults.msg'
)
cell_tidy_data = pd.read_msgpack(total_clustering_path)

# Display the major types present so the `major_types` list can be checked
# against them.
cell_tidy_data['MajorType'].unique()

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


array(['MGE-Sst', 'CA3', 'CA1', 'CA3-St18', 'Unc5c', 'Gfra1', 'ODC', 'PC',
       'ANP', 'IT-L5', 'NP-L6', 'CGE-Lamp5', 'CT-L6', 'IG-CA2', 'DG-po',
       'DG', 'CGE-Vip', 'OPC', 'ASC', 'MGC', 'PAL-Inh', 'PT-L5',
       'MGE-Pvalb', 'VLMC', 'EC', 'VLMC-Pia', 'OLF', 'MSN-D2', 'L6b',
       'IT-L6', 'IT-L23', 'IT-L4', 'OLF-Exc', 'CLA', 'Foxp2', 'MSN-D1',
       'LSX-Inh', 'D1L-Fstl4', 'EP', 'D1L-PAL', 'Chd7'], dtype=object)

## Select subtypes of the chosen major types (outliers excluded)

In [5]:
# Boolean mask of the rows whose MajorType is one of the selected major types.
in_major_types = cell_tidy_data['MajorType'].isin(major_types)

# Unique SubType values of those rows, sorted, without any subtype whose name
# contains 'Outlier'.
use_clusters = [
    subtype
    for subtype in sorted(cell_tidy_data[in_major_types]['SubType'].unique())
    if 'Outlier' not in subtype
]
use_clusters

['CGE-Lamp5 Dock5',
 'CGE-Lamp5 Grid1',
 'CGE-Lamp5 Grk5',
 'CGE-Lamp5 Nrxn3',
 'CGE-Lamp5 Sorcs1',
 'CGE-Vip Ccser1',
 'CGE-Vip Clstn2',
 'CGE-Vip Fstl4',
 'CGE-Vip Galnt17',
 'CGE-Vip Grm8',
 'CGE-Vip Ntng1',
 'CGE-Vip Ptprm',
 'CGE-Vip Robo1',
 'MGE-Pvalb Cacna1i',
 'MGE-Pvalb Cnih3',
 'MGE-Pvalb Entpd3',
 'MGE-Pvalb Gfra2',
 'MGE-Pvalb Ptprk',
 'MGE-Pvalb Sema5a',
 'MGE-Pvalb Thsd7a',
 'MGE-Sst Bmper',
 'MGE-Sst Chodl',
 'MGE-Sst Dock4',
 'MGE-Sst Etv1',
 'MGE-Sst Frmd6',
 'MGE-Sst Kcnip4',
 'MGE-Sst Ptpre',
 'MGE-Sst Rerg',
 'MGE-Sst Rxra',
 'MGE-Sst Ubtd1',
 'MGE-Sst Unc5b',
 'Unc5c Unc5c']

In [6]:
# Number of unordered cluster pairs, n * (n - 1) / 2, scaled by 0.02.
# NOTE(review): presumably used to pick `n_repeat_pairs` in Step1 - confirm.
n_cluster = len(use_clusters)
n_pairs = n_cluster * (n_cluster - 1) / 2
n_pairs * 0.02

9.92

## Step1 Select DMG

In [6]:
# Step1: run 1.related_dmgs.ipynb through papermill. The executed copy is
# written into output_dir, which is also used as the working directory.
input_path = '1.related_dmgs.ipynb'
output_path = f'{output_dir}/1.related_dmgs.ipynb'

# Values injected into the notebook's parameters; cluster names are passed
# as they appear in `use_clusters` (with spaces).
parameters = {
    'auroc_cutoff': 0.9,
    'fc_cutoff': 2,
    'delta_cutoff': 0.01,
    'rate_to_min': 0.01,
    'rate_from_max': 0.01,
    'n_repeat_pairs': 7,
    'use_clusters': use_clusters,
}

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
# Trailing statement so the object returned by execute_notebook is not echoed
# as the cell output.
pass

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




## Step2 Select DMR

In [7]:
# Step2: run 2.related_dmrs.ipynb through papermill. The executed copy is
# written into output_dir, which is also used as the working directory.
input_path = '2.related_dmrs.ipynb'
output_path = f'{output_dir}/2.related_dmrs.ipynb'

# This notebook receives the cluster names with spaces replaced by
# underscores; `use_clusters` itself is left untouched.
underscored_clusters = [cluster_name.replace(' ', '_') for cluster_name in use_clusters]
parameters = {
    'use_clusters': underscored_clusters,
    'hypo_dmr_hits_path': '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad',
}

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
# Trailing statement so the object returned by execute_notebook is not echoed
# as the cell output.
pass

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




## Step3 Select corr

In [8]:
# Step3: run 3.related_corr.ipynb through papermill. The executed copy is
# written into output_dir, which is also used as the working directory.
input_path = '3.related_corr.ipynb'
output_path = f'{output_dir}/3.related_corr.ipynb'

# NOTE(review): distance_cutoff is presumably in base pairs - confirm against
# the Step3 notebook.
parameters = {
    'corr_cutoff': 0.3,
    'distance_cutoff': 500000,
}

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
# Trailing statement so the object returned by execute_notebook is not echoed
# as the cell output.
pass

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




## Step4 intersect everything

In [9]:
# Step4: run 4.intersect_everything.ipynb through papermill. The executed copy
# is written into output_dir, which is also used as the working directory.
input_path = '4.intersect_everything.ipynb'
output_path = f'{output_dir}/4.intersect_everything.ipynb'

parameters = {'delta_dmr_rate_cutoff': 0.3}

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
# Trailing statement so the object returned by execute_notebook is not echoed
# as the cell output.
pass

HBox(children=(IntProgress(value=0, max=23), HTML(value='')))




## Step5 Gene Clustering

In [10]:
# Step5: hand 5.GeneClustering.ipynb to papermill with the parameters below.
# The output copy is written into output_dir.
input_path = '5.GeneClustering.ipynb'
output_path = f'{output_dir}/5.GeneClustering.ipynb'

# Cluster names are passed as they appear in `use_clusters` (with spaces).
parameters = {
    'n_pcs': 20,
    'k': 25,
    'resolution': 0.8,
    'use_clusters': use_clusters,
}

# NOTE(review): this is the only step called with prepare_only=True, and the
# cell shows no progress output. Presumably the prepared notebook is meant to
# be opened and run by hand before Step6 - confirm.
execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=True,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
# Trailing statement so the object returned by execute_notebook is not echoed
# as the cell output.
pass

## Step6 Group DMR based on gene clusters

In [11]:
# Step6: run 6.DMRGroupBasedOnGeneClustering.ipynb through papermill with no
# injected parameters. The executed copy is written into output_dir, which is
# also used as the working directory.
notebook_name = '6.DMRGroupBasedOnGeneClustering.ipynb'
input_path = notebook_name
output_path = f'{output_dir}/{notebook_name}'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters={},
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
# Trailing statement so the object returned by execute_notebook is not echoed
# as the cell output.
pass

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




## Motif enrichment

In [8]:
# Sub-directory of output_dir in which the motif-enrichment loops look for
# per-cluster result files; created if it does not exist yet.
motif_enrichment_dir = pathlib.Path(output_dir).joinpath('MotifEnrichment')
motif_enrichment_dir.mkdir(exist_ok=True)

In [9]:
# Load the JSON index found in output_dir. The motif-enrichment loop iterates
# over its items, using each key as a gene cluster and the length of each
# value as that cluster's DMR count.
# (`json` is imported with the other modules in the notebook's import cell.)
gene_dmr_index_path = pathlib.Path(output_dir) / 'GeneCluster.relatedDMR.index.json'
with open(gene_dmr_index_path) as f:
    gene_dmr_dict = json.load(f)

In [10]:
# Gene clusters whose DMR list is shorter than this are skipped by the
# motif-enrichment loop.
min_dmr_cutoff = 300

In [15]:
# Run 7.MotifEnrichment.ipynb once per gene cluster that has at least
# `min_dmr_cutoff` DMRs and no result file yet.
for cluster, use_dmrs in gene_dmr_dict.items():
    n_dmrs = len(use_dmrs)
    if n_dmrs < min_dmr_cutoff:
        continue
    print(cluster, n_dmrs)

    parameters = {
        'gene_cluster': cluster,
        'or_cutoff': 1.6,
        'neg_lgp_cutoff': 3,
        'mask_quantile_to_max': 0.8,
    }

    # The output path is the same for every cluster, so each run overwrites
    # the executed notebook left by the previous one.
    input_path = '7.MotifEnrichment.ipynb'
    output_path = f'{output_dir}/7.MotifEnrichment.ipynb'

    # NOTE(review): presumably written by the executed notebook - confirm.
    # Its presence marks the cluster as already done.
    result_file = motif_enrichment_dir.joinpath(
        f'{cluster}.Hypo.motif_enrichment.msg')
    if not result_file.exists():
        execute_notebook(
            input_path=input_path,
            output_path=output_path,
            parameters=parameters,
            engine_name=None,
            prepare_only=False,
            kernel_name=None,
            progress_bar=True,
            log_output=False,
            start_timeout=60,
            report_mode=False,
            cwd=str(output_dir),
        )

0 3336


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


1 3977


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


2 18450


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


3 7109


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


4 10744


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


5 2022


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


6 1040


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


7 15217


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


8 12109


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


9 5367


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


10 19494


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


11 2473


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


12 4372


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


13 4059


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




### Aggregate motif enrichment

In [17]:
# Run 8.aggregate_motif_enrichment.ipynb through papermill. The executed copy
# is written into output_dir, which is also used as the working directory.
input_path = '8.aggregate_motif_enrichment.ipynb'
output_path = f'{output_dir}/8.aggregate_motif_enrichment.ipynb'

# The directory is passed as a plain string rather than a Path object.
parameters = {
    'oddsratio_cutoff': 1.8,
    'lgp_cutoff': 3,
    'motif_enrichment_dir': str(motif_enrichment_dir),
    'tf_class_level': 'SubFamily',
}

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=str(output_dir),
)
# Trailing statement so the object returned by execute_notebook is not echoed
# as the cell output.
pass

HBox(children=(IntProgress(value=0, max=27), HTML(value='')))




## Motif Enrichment Cell Cluster

In [None]:
# Cluster names with spaces replaced by underscores, as passed to
# 9.MotifEnrichment_CellCluster.ipynb. They are kept under their own name so
# that `use_clusters` (passed with spaces to Step1 and Step5) is not
# overwritten and those cells can still be re-run after this one.
cell_clusters = [i.replace(' ', '_') for i in use_clusters]

# When False (the default), every cluster is re-run even if its result file
# already exists, which is what this cell did before. Set to True to skip
# clusters whose result file is already present.
skip_existing = False

# Run the cell-cluster motif-enrichment notebook once per cluster.
for cluster in cell_clusters:
    print(cluster)
    parameters = dict(use_clusters=cell_clusters,
                      cluster=cluster,
                      or_cutoff=1.6,
                      neg_lgp_cutoff=10,
                      mask_quantile_to_max=0.6)

    # The output path is the same for every cluster, so each run overwrites
    # the executed notebook left by the previous one.
    input_path = '9.MotifEnrichment_CellCluster.ipynb'
    output_path = f'{output_dir}/9.MotifEnrichment_CellCluster.ipynb'

    # NOTE(review): presumably written by the executed notebook - confirm.
    result_file = motif_enrichment_dir / f'Cell.{cluster}.Hypo.motif_enrichment.msg'
    if skip_existing and result_file.exists():
        continue

    execute_notebook(input_path=str(input_path),
                     output_path=str(output_path),
                     parameters=parameters,
                     engine_name=None,
                     prepare_only=False,
                     kernel_name=None,
                     progress_bar=True,
                     log_output=False,
                     start_timeout=60,
                     report_mode=False,
                     cwd=str(output_dir))

CGE-Lamp5_Dock5


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Lamp5_Grid1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Lamp5_Grk5


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Lamp5_Nrxn3


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Lamp5_Sorcs1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Vip_Ccser1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Vip_Clstn2


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Vip_Fstl4


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Vip_Galnt17


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Vip_Grm8


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Vip_Ntng1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Vip_Ptprm


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CGE-Vip_Robo1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Pvalb_Cacna1i


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Pvalb_Cnih3


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Pvalb_Entpd3


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Pvalb_Gfra2


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Pvalb_Ptprk


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Pvalb_Sema5a


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Pvalb_Thsd7a


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Bmper


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Chodl


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Dock4


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Etv1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Frmd6


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Kcnip4


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Ptpre


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Rerg


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Rxra


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Ubtd1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


MGE-Sst_Unc5b


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


Unc5c_Unc5c


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))

In [None]:
# Run 10.aggregate_motif_enrichment-cell_cluster.ipynb through papermill. The
# executed copy is written into output_dir, which is also used as the working
# directory.
input_path = '10.aggregate_motif_enrichment-cell_cluster.ipynb'
output_path = f'{output_dir}/10.aggregate_motif_enrichment-cell_cluster.ipynb'

# The directory is passed as a plain string rather than a Path object.
parameters = {
    'oddsratio_cutoff': 1.8,
    'lgp_cutoff': 3,
    'motif_enrichment_dir': str(motif_enrichment_dir),
    'tf_class_level': 'SubFamily',
}

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=str(output_dir),
)
# Trailing statement so the object returned by execute_notebook is not echoed
# as the cell output.
pass