In [1]:
import json
import pathlib

import pandas as pd
from papermill import execute_notebook

In [2]:
# Label of this analysis; it is embedded in the ensemble output directory name.
name = 'ExcITPT'

# MajorType labels whose subtypes are included in this ensemble.
major_types = ['IT-L5', 'PT-L5', 'IT-L6', 'IT-L23', 'IT-L4', 'CLA', 'EP']
# Executed notebooks are saved under this directory, which is also used as the
# working directory of every papermill run below.
output_dir = f'/home/hanliu/project/mouse_rostral_brain/study/ClustersEnsemble/{name}SubtypeEnsemble'
# parents=True also creates missing intermediate directories, so a first run
# does not fail with FileNotFoundError when the parent folder is absent.
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

## Cell Meta

In [3]:
# Load the per-cell clustering summary and list the MajorType labels it
# contains, so the entries of `major_types` can be checked against them.
# NOTE(review): the warning printed by this cell suggests read_msgpack is
# deprecated in the installed pandas - consider re-saving the table in
# another format.
clustering_summary_path = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
cell_tidy_data = pd.read_msgpack(clustering_summary_path)
cell_tidy_data['MajorType'].unique()

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


array(['MGE-Sst', 'CA3', 'CA1', 'CA3-St18', 'Unc5c', 'Gfra1', 'ODC', 'PC',
       'ANP', 'IT-L5', 'NP-L6', 'CGE-Lamp5', 'CT-L6', 'IG-CA2', 'DG-po',
       'DG', 'CGE-Vip', 'OPC', 'ASC', 'MGC', 'PAL-Inh', 'PT-L5',
       'MGE-Pvalb', 'VLMC', 'EC', 'VLMC-Pia', 'OLF', 'MSN-D2', 'L6b',
       'IT-L6', 'IT-L23', 'IT-L4', 'OLF-Exc', 'CLA', 'Foxp2', 'MSN-D1',
       'LSX-Inh', 'D1L-Fstl4', 'EP', 'D1L-PAL', 'Chd7'], dtype=object)

## Select subtypes of the chosen major types (excluding outliers)

In [4]:
# Collect the SubType labels of all cells whose MajorType is one of
# `major_types`, drop every label containing 'Outlier', and sort the rest.
in_major_types = cell_tidy_data['MajorType'].isin(major_types)
use_clusters = sorted(
    subtype
    for subtype in cell_tidy_data.loc[in_major_types, 'SubType'].unique()
    if 'Outlier' not in subtype
)
use_clusters

['CLA Bcl11a',
 'CLA Cdh8',
 'CLA Nrp2',
 'EP Adcy8',
 'EP Rgs8',
 'EP Tspan5',
 'IT-L23 Cux1',
 'IT-L23 Foxp1',
 'IT-L23 Ptprt',
 'IT-L23 Tenm2',
 'IT-L4 Astn2',
 'IT-L4 Shc3',
 'IT-L5 Cdh8',
 'IT-L5 Etv1',
 'IT-L5 Grik3',
 'IT-L6 Cadps2',
 'IT-L6 Fstl4',
 'IT-L6 Man1c1',
 'IT-L6 Oxr1',
 'PT-L5 Abca12',
 'PT-L5 Astn2',
 'PT-L5 Kcnh1',
 'PT-L5 Nectin1',
 'PT-L5 Plcb4',
 'PT-L5 Ptprt',
 'PT-L5 Tenm2',
 'PT-L5 Tmtc2',
 'PT-L5 Unc5b']

In [10]:
# 2% of the number of unordered subtype pairs, n * (n - 1) / 2.
# NOTE(review): this value looks like the basis for n_repeat_pairs in Step1 -
# confirm.
n_cluster = len(use_clusters)
n_pairs = n_cluster * (n_cluster - 1) / 2
n_pairs * 0.02

7.5600000000000005

## Step1 Select DMG

In [13]:
# Step 1: run the DMG-selection template through papermill with the cutoffs
# below; the executed copy is saved in output_dir.
# NOTE(review): n_repeat_pairs=7 looks like the 7.56 computed in the previous
# cell rounded down - confirm.
parameters = {
    'auroc_cutoff': 0.9,
    'fc_cutoff': 2,
    'delta_cutoff': 0.01,
    'rate_to_min': 0.01,
    'rate_from_max': 0.01,
    'n_repeat_pairs': 7,
    'use_clusters': use_clusters,
}

input_path = '1.related_dmgs.ipynb'
output_path = f'{output_dir}/1.related_dmgs.ipynb'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
pass  # end the cell on a statement so the call's return value is not echoed

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




## Step2 Select DMR

In [12]:
# Step 2: run the DMR-selection template. Cluster names are passed with
# spaces replaced by underscores, together with the path of the hypo-DMR
# hits file.
parameters = {
    'use_clusters': [label.replace(' ', '_') for label in use_clusters],
    'hypo_dmr_hits_path':
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad',
}

input_path = '2.related_dmrs.ipynb'
output_path = f'{output_dir}/2.related_dmrs.ipynb'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
pass  # end the cell on a statement so the call's return value is not echoed

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




## Step3 Select corr

In [16]:
# Step 3: run the correlation-selection template with the correlation and
# distance cutoffs below.
parameters = {'corr_cutoff': 0.3, 'distance_cutoff': 500000}

input_path = '3.related_corr.ipynb'
output_path = f'{output_dir}/3.related_corr.ipynb'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
pass  # end the cell on a statement so the call's return value is not echoed

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




## Step4 Intersect everything

In [17]:
# Step 4: run the intersection template with the delta-DMR-rate cutoff below.
parameters = {'delta_dmr_rate_cutoff': 0.3}

input_path = '4.intersect_everything.ipynb'
output_path = f'{output_dir}/4.intersect_everything.ipynb'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
pass  # end the cell on a statement so the call's return value is not echoed

HBox(children=(IntProgress(value=0, max=23), HTML(value='')))




## Step5 Gene Clustering

In [18]:
# Step 5: hand the gene-clustering template its parameters.
# NOTE(review): this is the only step called with prepare_only=True; every
# other step uses prepare_only=False. Presumably the parameterized notebook is
# meant to be run by hand afterwards - confirm, since later steps appear to
# depend on its results.
parameters = {
    'n_pcs': 20,
    'k': 25,
    'resolution': 0.8,
    'use_clusters': use_clusters,
}

input_path = '5.GeneClustering.ipynb'
output_path = f'{output_dir}/5.GeneClustering.ipynb'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=True,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
pass  # end the cell on a statement so the call's return value is not echoed

HBox(children=(IntProgress(value=0, max=52), HTML(value='')))




## Step6 Group DMR based on gene clusters

In [19]:
# Step 6: run the DMR-grouping template; it takes no injected parameters.
input_path = '6.DMRGroupBasedOnGeneClustering.ipynb'
output_path = f'{output_dir}/6.DMRGroupBasedOnGeneClustering.ipynb'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters={},
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
pass  # end the cell on a statement so the call's return value is not echoed

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




## Motif enrichment

In [20]:
# Folder inside output_dir where the per-cluster motif-enrichment result
# files are looked up by the loops below; created if it does not exist yet.
motif_enrichment_dir = pathlib.Path(output_dir, 'MotifEnrichment')
motif_enrichment_dir.mkdir(exist_ok=True)

In [21]:
# Load the gene-cluster -> related-DMR index from output_dir. Each value is a
# sized collection whose length is compared with min_dmr_cutoff further down.
# (`json` is imported in the import cell at the top of the notebook.)
# NOTE(review): presumably this file is written by one of the earlier steps -
# confirm which one.
with open(pathlib.Path(output_dir) / 'GeneCluster.relatedDMR.index.json') as f:
    gene_dmr_dict = json.load(f)

In [22]:
# Minimum number of related DMRs a gene cluster needs; clusters with fewer
# are skipped by the motif-enrichment loop.
min_dmr_cutoff = 300

In [23]:
# Run the motif-enrichment template once per gene cluster.
# The template and destination paths are identical for every cluster, so they
# are resolved once instead of on every iteration.
# NOTE: every run is saved to the same output notebook, so after the loop only
# the most recently executed cluster's notebook remains on disk.
input_path = '7.MotifEnrichment.ipynb'
output_path = f'{output_dir}/7.MotifEnrichment.ipynb'

for cluster, use_dmrs in gene_dmr_dict.items():
    # Gene clusters with too few related DMRs are not tested.
    if len(use_dmrs) < min_dmr_cutoff:
        continue
    print(cluster, len(use_dmrs))

    # An existing result file is treated as a cache marker: the cluster is
    # skipped before any run parameters are built.
    # NOTE(review): presumably the executed notebook writes this file - confirm.
    result_file = motif_enrichment_dir / f'{cluster}.Hypo.motif_enrichment.msg'
    if result_file.exists():
        continue

    parameters = dict(gene_cluster=cluster,
                      or_cutoff=1.6,
                      neg_lgp_cutoff=3,
                      mask_quantile_to_max=0.8)

    execute_notebook(input_path=str(input_path),
                     output_path=str(output_path),
                     parameters=parameters,
                     engine_name=None,
                     prepare_only=False,
                     kernel_name=None,
                     progress_bar=True,
                     log_output=False,
                     start_timeout=60,
                     report_mode=False,
                     cwd=str(output_dir))

0 2769


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


1 9650


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


2 15731


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


3 11583


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


4 4574


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


5 3407


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


6 8897


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


7 5150


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


8 8111


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


9 4886


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




### Aggregate motif enrichment

In [24]:
# Aggregate the per-gene-cluster motif-enrichment results found in
# motif_enrichment_dir, using the odds-ratio / lgp cutoffs and TF class level
# given below.
parameters = {
    'oddsratio_cutoff': 1.8,
    'lgp_cutoff': 3,
    'motif_enrichment_dir': str(motif_enrichment_dir),
    'tf_class_level': 'SubFamily',
}

input_path = '8.aggregate_motif_enrichment.ipynb'
output_path = f'{output_dir}/8.aggregate_motif_enrichment.ipynb'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
pass  # end the cell on a statement so the call's return value is not echoed

HBox(children=(IntProgress(value=0, max=27), HTML(value='')))




## Motif Enrichment Cell Cluster

In [20]:
# Run the cell-cluster motif-enrichment template once per cluster.
# The underscore-separated names are kept under their own variable instead of
# rebinding `use_clusters`, so cells that pass the original space-separated
# labels still see the same values if they are re-run after this one.
cell_cluster_names = [i.replace(' ', '_') for i in use_clusters]

# The template and destination paths are identical for every cluster.
# NOTE: every run is saved to the same output notebook, so after the loop only
# the most recently executed cluster's notebook remains on disk.
input_path = '9.MotifEnrichment_CellCluster.ipynb'
output_path = f'{output_dir}/9.MotifEnrichment_CellCluster.ipynb'

for cluster in cell_cluster_names:
    print(cluster)

    # An existing result file is treated as a cache marker: the cluster is
    # skipped before any run parameters are built.
    result_file = motif_enrichment_dir / f'Cell.{cluster}.Hypo.motif_enrichment.msg'
    if result_file.exists():
        continue

    parameters = dict(use_clusters=cell_cluster_names,
                      cluster=cluster,
                      or_cutoff=1.6,
                      neg_lgp_cutoff=10,
                      mask_quantile_to_max=0.8)

    execute_notebook(input_path=str(input_path),
                     output_path=str(output_path),
                     parameters=parameters,
                     engine_name=None,
                     prepare_only=False,
                     kernel_name=None,
                     progress_bar=True,
                     log_output=False,
                     start_timeout=60,
                     report_mode=False,
                     cwd=str(output_dir))

CA1_Ak5
CA1_Chrm3


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CA1_Kif26a


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CA1_Lingo2


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CA1_Ptprg


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CA3_Cadm2


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CA3_Efnb2


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CA3-St18_Epha5


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CA3-St18_Nuak1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


CA3-St18_Tead1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


DG-po_Bcl11a


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


DG-po_Calb2


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


DG-po_Kctd8


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


Gfra1_Gfra1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


IG-CA2_Chrm3


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


IG-CA2_Peak1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


IG-CA2_Xpr1


HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [21]:
# Aggregate the per-cell-cluster motif-enrichment results found in
# motif_enrichment_dir, using the odds-ratio / lgp cutoffs and TF class level
# given below.
parameters = {
    'oddsratio_cutoff': 1.8,
    'lgp_cutoff': 3,
    'motif_enrichment_dir': str(motif_enrichment_dir),
    'tf_class_level': 'SubFamily',
}

input_path = '10.aggregate_motif_enrichment-cell_cluster.ipynb'
output_path = f'{output_dir}/10.aggregate_motif_enrichment-cell_cluster.ipynb'

execute_notebook(
    input_path=input_path,
    output_path=output_path,
    parameters=parameters,
    engine_name=None,
    prepare_only=False,
    kernel_name=None,
    progress_bar=True,
    log_output=False,
    start_timeout=60,
    report_mode=False,
    cwd=output_dir,
)
pass  # end the cell on a statement so the call's return value is not echoed

HBox(children=(IntProgress(value=0, max=27), HTML(value='')))


