In [1]:
import pandas as pd
import pathlib
from papermill import execute_notebook

In [2]:
# Load the clustering summary table; later cells read its CellClass,
# MajorType and SubType columns.
# NOTE(review): pandas emits a warning for this call and recommends pyarrow
# (see the cell output) -- consider migrating the file to another format.
tidy_data_path = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
cell_tidy_data = pd.read_msgpack(tidy_data_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Distinct SubType labels among cells whose CellClass is not 'NonN',
# skipping every label that contains the text 'Outlier'.
is_neuron = cell_tidy_data['CellClass'] != 'NonN'
not_outlier = cell_tidy_data['SubType'].apply(lambda label: 'Outlier' not in label)
sub_types = cell_tidy_data.loc[is_neuron & not_outlier, 'SubType'].unique()

In [4]:
# Number of distinct SubType labels kept by the filter that built `sub_types`.
len(sub_types)

145

## MajorType: pairwise marker genes across all neuronal major types

In [13]:
# mc
# Values injected by papermill into `pairwise_marker_selection.ipynb` for the
# MajorType-level run. Key order is kept as written.
parameters = {
    'cluster_col': 'MajorType',
    # Clusters (values of `cluster_col`) included in the pairwise comparison.
    'use_clusters': [
        'CA1', 'CA3', 'CA3-St18', 'CGE-Lamp5', 'CGE-Vip', 'CLA', 'CT-L6',
        'Chd7', 'D1L-Fstl4', 'D1L-PAL', 'DG', 'DG-po', 'EP', 'Foxp2', 'Gfra1',
        'IG-CA2', 'IT-L23', 'IT-L4', 'IT-L5', 'IT-L6', 'L6b', 'LSX-Inh',
        'MGE-Pvalb', 'MGE-Sst', 'MSN-D1', 'MSN-D2', 'NP-L6', 'OLF', 'OLF-Exc',
        'PAL-Inh', 'PT-L5', 'Unc5c',
    ],
    # Glob pattern for the input .mcds files.
    'mcds_pattern': '/home/hanliu/project/mouse_rostral_brain/study/Level1-CellClass/ALL_manual/Adata/GeneWithSlop2kb.gene_da_rate.*.mcds',
    'min_cluster_cell_number': 10,
    'adj_p_cutoff': 5e-3,
    'top_n': 1000,
    'cpu': 30,
    'max_test_cell_population': 2000,
    'chunk_size': 50,
    'delta_rate_cutoff': 0.3,
    'auroc_cutoff': 0.8,
}

In [14]:
# Prepare the MajorType-level copy of the template notebook with the
# `parameters` defined in the previous cell.
input_path = 'pairwise_marker_selection.ipynb'
output_dir = 'MajorTypePairwiseDEG'
output_path = f'{output_dir}/MajorType.pairwise_marker_selection.ipynb'

# The output folder must exist before the notebook can be written into it;
# create it so this cell also works on a fresh checkout.
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Trailing `;` keeps the returned notebook object out of the cell output.
execute_notebook(input_path,
                 output_path,
                 parameters=parameters,
                 engine_name=None,
                 prepare_only=True,  # inject parameters and write the notebook, do not run it
                 kernel_name=None,
                 progress_bar=True,
                 log_output=False,
                 start_timeout=60,
                 report_mode=False,
                 cwd=output_dir);

## SubType: pairwise marker genes across all neuronal subtypes

In [5]:
# mc
# Values injected by papermill into `pairwise_marker_selection.ipynb` for the
# SubType-level run over every label in `sub_types`. Key order is kept as written.
parameters = {
    'cluster_col': 'SubType',
    'use_clusters': sub_types.tolist(),
    # Glob pattern for the input .mcds files.
    'mcds_pattern': '/home/hanliu/project/mouse_rostral_brain/study/Level1-CellClass/ALL_manual/Adata/GeneWithSlop2kb.gene_da_rate.*.mcds',
    'min_cluster_cell_number': 10,
    'adj_p_cutoff': 5e-3,
    'top_n': 30000,
    'cpu': 20,
    'max_test_cell_population': 2000,
    'chunk_size': 50,
    'delta_rate_cutoff': 0.3,
    'auroc_cutoff': 0.8,
}

In [12]:
# Prepare the SubType-level copy of the template notebook with the
# `parameters` defined in the previous cell.
input_path = 'pairwise_marker_selection.ipynb'
output_dir = 'SubTypePairwiseDEG'
output_path = f'{output_dir}/SubType.pairwise_marker_selection.ipynb'

# The output folder must exist before the notebook can be written into it;
# create it so this cell also works on a fresh checkout.
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Trailing `;` keeps the returned notebook object out of the cell output.
execute_notebook(input_path,
                 output_path,
                 parameters=parameters,
                 engine_name=None,
                 prepare_only=True,  # inject parameters and write the notebook, do not run it
                 kernel_name=None,
                 progress_bar=True,
                 log_output=False,
                 start_timeout=60,
                 report_mode=False,
                 # Working directory now matches the folder of `output_path`
                 # (it previously pointed at the MajorType folder).
                 cwd=output_dir);

## SubType: pairwise marker genes within each MajorType
- Neurons only (cells with `CellClass == 'NonN'` and subtypes labelled as outliers are excluded)

In [10]:
# Restrict the table to cells whose CellClass is not 'NonN' and whose SubType
# label does not contain 'Outlier'. `cell_tidy_data` is rebound to the
# filtered table, as before.
is_neuron = cell_tidy_data['CellClass'] != 'NonN'
is_clean = cell_tidy_data['SubType'].apply(lambda label: 'Outlier' not in label)
cell_tidy_data = cell_tidy_data[is_neuron & is_clean]

# Run the pairwise marker template once per MajorType, comparing the SubTypes
# found inside that MajorType against each other.
for major_type, sub_df in cell_tidy_data.groupby('MajorType'):
    sub_clusters = sub_df['SubType'].unique().tolist()
    print(major_type, sub_clusters)
    if len(sub_clusters) < 2:
        # A single SubType leaves nothing to compare pairwise.
        continue

    input_path = 'pairwise_marker_selection.ipynb'

    # Each MajorType gets its own folder, which is also the working directory
    # of the executed notebook.
    output_dir = f'SubTypePairwiseDEG/{major_type}'
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_path = f'SubTypePairwiseDEG/{major_type}/SubType.pairwise_marker_selection.ipynb'

    # NOTE(review): unlike the two all-neuron cells, no `mcds_pattern` is
    # passed here, so whatever value the template notebook defines is used --
    # confirm that this is intended.
    parameters = {
        'cluster_col': 'SubType',
        'use_clusters': sub_clusters,
        'min_cluster_cell_number': 10,
        'adj_p_cutoff': 5e-3,
        'top_n': 2000,
        'cpu': 20,
        'max_test_cell_population': 2000,
        'chunk_size': 10,
        'delta_rate_cutoff': 0.3,
        'auroc_cutoff': 0.8,
    }

    # Skip folders that already contain TotalGeneID.txt (presumably written by
    # a finished run of the template -- TODO confirm).
    already_done = pathlib.Path(output_dir, 'TotalGeneID.txt').exists()
    if not already_done:
        execute_notebook(input_path,
                         output_path,
                         parameters=parameters,
                         engine_name=None,
                         prepare_only=False,
                         kernel_name=None,
                         progress_bar=True,
                         log_output=False,
                         start_timeout=60,
                         report_mode=False,
                         cwd=output_dir)


CA1 ['CA1 Chrm3', 'CA1 Ptprg', 'CA1 Kif26a', 'CA1 Ak5', 'CA1 Lingo2']
CA3 ['CA3 Cadm2', 'CA3 Efnb2']
CA3-St18 ['CA3-St18 Tead1', 'CA3-St18 Nuak1', 'CA3-St18 Epha5']
CGE-Lamp5 ['CGE-Lamp5 Dock5', 'CGE-Lamp5 Grk5', 'CGE-Lamp5 Sorcs1', 'CGE-Lamp5 Grid1', 'CGE-Lamp5 Nrxn3']
CGE-Vip ['CGE-Vip Ntng1', 'CGE-Vip Ptprm', 'CGE-Vip Robo1', 'CGE-Vip Grm8', 'CGE-Vip Ccser1', 'CGE-Vip Fstl4', 'CGE-Vip Galnt17', 'CGE-Vip Clstn2']
CLA ['CLA Bcl11a', 'CLA Cdh8', 'CLA Nrp2']
CT-L6 ['CT-L6 Megf9', 'CT-L6 Il1rap', 'CT-L6 Hcrtr2', 'CT-L6 Map4']
Chd7 ['Chd7 Kcnc2', 'Chd7 Trpc7', 'Chd7 Megf11']
D1L-Fstl4 ['D1L-Fstl4 Sipa1l2', 'D1L-Fstl4 Trps1', 'D1L-Fstl4 Grm3', 'D1L-Fstl4 Cadm1', 'D1L-Fstl4 Crim1']
D1L-PAL ['D1L-PAL Flrt2', 'D1L-PAL Plcxd3']
DG ['DG dg-all']
DG-po ['DG-po Calb2', 'DG-po Bcl11a', 'DG-po Kctd8']
EP ['EP Tspan5', 'EP Rgs8', 'EP Adcy8']
Foxp2 ['Foxp2 Homer2', 'Foxp2 Trpc7', 'Foxp2 Dchs2', 'Foxp2 Inpp4b']
Gfra1 ['Gfra1 Gfra1']
IG-CA2 ['IG-CA2 Chrm3', 'IG-CA2 Peak1', 'IG-CA2 Xpr1']
IT-L23 ['IT-L2