In [1]:
import pandas as pd
import pathlib
from papermill import execute_notebook

## Parameters

In [2]:
# Study name -> major cell types that are analysed together in that study.
# Dictionary order is the order shown when the keys are displayed below.
mc_study_design = {
    'Cortex-OLF-Exc': [
        'NP-L6', 'CT-L6', 'PT-L5', 'L6b', 'OLF-Exc', 'CLA', 'EP',
    ],
    'IT': [
        'IT-L5', 'IT-L6', 'IT-L23', 'IT-L4',
    ],
    'HPF-Exc': [
        'CA3', 'CA1', 'CA3-St18', 'Gfra1', 'IG-CA2', 'DG-po', 'DG',
    ],
    'Inh': [
        'MGE-Sst', 'Unc5c', 'CGE-Lamp5', 'CGE-Vip', 'PAL-Inh',
        'MGE-Pvalb', 'MSN-D2', 'OLF', 'Foxp2', 'MSN-D1', 'LSX-Inh',
        'D1L-Fstl4', 'D1L-PAL', 'Chd7',
    ],
    'NonN': [
        'ODC', 'PC', 'ANP', 'OPC', 'ASC', 'MGC', 'VLMC', 'EC', 'VLMC-Pia',
    ],
}
mc_study_design.keys()

dict_keys(['Cortex-OLF-Exc', 'IT', 'HPF-Exc', 'Inh', 'NonN'])

In [3]:
# Study to prepare; it is used as a key into mc_study_design and is
# interpolated into the ATAC adata path and the output directory.
study = 'NonN'

In [4]:
# Cell metadata table (msgpack) that is loaded and filtered further down.
cell_tidy_data_path = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'

# mC: major types belonging to the selected study (KeyError if `study` is unknown)
major_types = mc_study_design[study]

# ATAC inputs (previously labelled "rna", which did not match these ATAC paths)
atac_cell_tidy_data_path = '/home/hanliu/project/mouse_rostral_brain/ATAC/Metadata/ATAC.cell_tidy_data.msg'
atac_adata_path = f'/home/hanliu/project/mouse_rostral_brain/ATAC/AdataForIntegration/{study}.gene.pass_qc.with_cluster.h5ad'

# Output: per-study directory that receives the filtered table and the prepared notebooks
output_dir = f'/home/hanliu/project/mouse_rostral_brain/study/IntegrationWithATAC/{study}'

In [5]:
# Turn the output location into a Path and create it (including any missing
# parents); an already existing directory is accepted.
output_dir = pathlib.Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

# Templates are looked up relative to the current working directory:
# an empty string yields the same Path as '.'.
template_dir = pathlib.Path('')

## Make Study

In [6]:
# Load the full cell metadata table from its msgpack file.
# NOTE(review): pandas warns here that pyarrow is recommended instead of
# msgpack; read_msgpack/to_msgpack were removed in pandas 1.0, so this
# notebook presumably needs pandas < 1.0 — confirm the pinned version.
cell_tidy_data = pd.read_msgpack(cell_tidy_data_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Show every MajorType label present in the table, so the names listed in
# mc_study_design can be checked against the data.
cell_tidy_data['MajorType'].unique()

array(['MGE-Sst', 'CA3', 'CA1', 'CA3-St18', 'Unc5c', 'Gfra1', 'ODC', 'PC',
       'ANP', 'IT-L5', 'NP-L6', 'CGE-Lamp5', 'CT-L6', 'IG-CA2', 'DG-po',
       'DG', 'CGE-Vip', 'OPC', 'ASC', 'MGC', 'PAL-Inh', 'PT-L5',
       'MGE-Pvalb', 'VLMC', 'EC', 'VLMC-Pia', 'OLF', 'MSN-D2', 'L6b',
       'IT-L6', 'IT-L23', 'IT-L4', 'OLF-Exc', 'CLA', 'Foxp2', 'MSN-D1',
       'LSX-Inh', 'D1L-Fstl4', 'EP', 'D1L-PAL', 'Chd7'], dtype=object)

In [8]:
# Boolean mask: True for cells whose MajorType belongs to the selected study.
judge = cell_tidy_data['MajorType'].isin(major_types)
cell_tidy_data = cell_tidy_data.loc[judge]

# Persist the subset next to the prepared notebooks, then display how many
# cells were kept.
cell_tidy_data.to_msgpack(output_dir / 'cell_tidy_data.msg')
len(cell_tidy_data)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  This is separate from the ipykernel package so we can avoid doing imports until


8167

In [9]:
# Number of cells per MajorType in the study subset.
cell_tidy_data['MajorType'].value_counts()

ODC         3082
ASC         2215
MGC          853
OPC          777
VLMC         419
ANP          331
VLMC-Pia     225
PC           170
EC            95
Name: MajorType, dtype: int64

In [10]:
def _prepare_notebook(notebook_name, params):
    """Write a parameterised copy of one template notebook into ``output_dir``.

    The template is read from ``template_dir / notebook_name`` and the copy is
    saved under the same file name inside ``output_dir``. papermill is called
    with ``prepare_only=True``, i.e. the parameters are injected but the
    notebook is not executed here.

    Parameters
    ----------
    notebook_name : str
        File name of the template notebook (also used for the output file).
    params : dict
        Values handed to papermill as the notebook parameters.
    """
    execute_notebook(str(template_dir / notebook_name),
                     str(output_dir / notebook_name),
                     parameters=params,
                     engine_name=None,
                     prepare_only=True,
                     kernel_name=None,
                     progress_bar=True,
                     log_output=False,
                     start_timeout=60,
                     report_mode=False,
                     cwd=str(output_dir))


def runner():
    """Prepare every template notebook of the integration study.

    Builds the parameter set of each template and writes the parameterised
    copies into ``output_dir`` in a fixed order. Reads the module-level
    ``template_dir``, ``output_dir``, ``atac_cell_tidy_data_path`` and
    ``atac_adata_path``.

    Returns
    -------
    None
    """
    mcds_dir = pathlib.Path('/home/hanliu/project/mouse_rostral_brain/dataset/')

    # (template file name, parameters) pairs, in the order they are prepared.
    notebook_params = [
        # prepare mC
        (
            'mc.prepare_cell_feature_matrix.ipynb',
            dict(
                # Relative path: the prepared notebook is given
                # cwd=output_dir, where the filtered table was written.
                cell_tidy_data_path='cell_tidy_data.msg',
                cluster_col='SubType',
                clustering_feature='gene',
                dask_distribute=True,
                in_memory=False,
                # glob() yields entries in arbitrary filesystem order; sort
                # so the injected list is reproducible between runs.
                mcds_path_list=sorted(
                    str(path) for path in mcds_dir.glob('*mcds')),
                exclude_chromosome=['chrM', 'chrY'],
                black_list_path=
                '/home/hanliu/project/mouse_rostral_brain/misc/mm10-blacklist.v2.bed.gz',
                min_feature_cov=30,
                max_feature_cov=8000,
                mc_type='CHN',
                filter_by_ncbi=True,
                ncbi_path='/home/hanliu/ref/ncbi/gene2ensembl.mouse.tsv.gz'),
        ),
        (
            'mc.pairwise_marker_selection.ipynb',
            dict(cluster_col='SubType',
                 min_cluster_cell_number=10,
                 exclude_str=['Outlier'],
                 adj_p_cutoff=1e-3,
                 log2fc_cutoff=1,
                 top_n=20,
                 cpu=10),
        ),
        (
            'atac.pairwise_marker_selection.ipynb',
            dict(cell_tidy_data_path=atac_cell_tidy_data_path,
                 adata_path=atac_adata_path,
                 cluster_col='SubType',
                 cpu=5,
                 top_n=50,
                 adj_p_cutoff=1e-3,
                 log2fc_cutoff=0.3,
                 min_cluster_cell_number=10),
        ),
        (
            'Integration_with_atac-pseudo.ipynb',
            dict(
                # scanorama
                scanorama_dim=30,
                sigma=100,
                alpha=0,
                knn=20,

                # clustering
                k=30,
                n_pcs=30,
                n_jobs=40),
        ),
        (
            'LabelTransfer.ipynb',
            dict(n_estimators=200,
                 random_seed=0,
                 n_splits=10,
                 n_jobs=40,
                 cluster_col='SubType'),
        ),
    ]

    for notebook_name, params in notebook_params:
        _prepare_notebook(notebook_name, params)

In [11]:
# Write the parameterised notebooks into output_dir; runner() passes
# prepare_only=True to papermill, so the notebooks are not executed here.
runner()