In [1]:
# Maps each study name to the list of region codes whose cells belong to it.
# Dict insertion order is preserved, so studies are iterated in the order
# written here. Codes are written as whitespace-separated strings and split
# into lists to keep each entry on one line.
#
# NOTE(review): '8H' under 'CA,DG' appears in no other list in this mapping,
# while '8E' under 'HPF' appears in none of the finer groups -- confirm that
# '8H' is intended and not a typo for '8E'.
study_design = {
    # The first four region lists contain nearly every code used by the
    # entries that follow them ('8H' being the exception noted above).
    'CNU': '3F 4D 4E 4G 4H 5E 5F 5H 5J'.split(),
    'HPF': '8E 8J 9J 9H 10E 10F 11E 11F'.split(),
    'Isocortex': (
        '1A 1B 2A 2B 2C 3A 3B '
        '3C 3D 4A 4B 4C 5A 5B '
        '5C 5D 6A 6B 6C 7B 8B'
    ).split(),
    'OLF': '1C 2D 2E 3E 4F 5G 6D'.split(),
    'ACA,PL,ILA': '5A 6A 2A 3A 4A'.split(),
    'ACB': '3F 4E 5F'.split(),
    'AI': '3D'.split(),
    'MOB,AON': '1C 2E'.split(),
    'CA,DG': '8H 9H 10E 10F 11E'.split(),
    'CP': '4D 5E'.split(),
    'DG,CA': '8J 9J 11F 11E 10F'.split(),
    'MO': '2C 3C 4B 5D 1A 2B 3B'.split(),
    'ORB': '1B'.split(),
    'LSX': '4G 5J'.split(),
    'PAL': '4H 5H'.split(),
    'PIR': '2D 3E 4F 5G 6D'.split(),
    'SS': '4C 5B 6B 7B 8B 5C 6C'.split(),
}

In [2]:
from papermill import execute_notebook, PapermillExecutionError
import pandas as pd
import pathlib
import numpy as np
import subprocess

In [3]:
# Path of the msgpack file holding the metadata table of all cells.
total_cell_meta_path = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
# NOTE(review): template_dir is not referenced by the cells visible here;
# confirm whether the template notebooks are meant to be read from it.
template_dir = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringRecipe/'
# Directory that is globbed for '*mcds' entries; kept as a Path object.
dataset_dir = pathlib.Path('/home/hanliu/project/mouse_rostral_brain/dataset/')


## Load data

In [4]:
# Read the metadata table of all cells.
# NOTE: pandas emits a deprecation warning for the msgpack reader (see the
# cell output); the on-disk format is left unchanged here.
total_cell_meta = pd.read_msgpack(total_cell_meta_path)
# Show the distinct MajorType labels present in the table.
pd.unique(total_cell_meta['MajorType'])

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


array(['MGE-Sst', 'CA3', 'CA1', 'CA3-St18', 'Unc5c', 'Gfra1', 'ODC', 'PC',
       'ANP', 'IT-L5', 'NP-L6', 'CGE-Lamp5', 'CT-L6', 'IG-CA2', 'DG-po',
       'DG', 'CGE-Vip', 'OPC', 'ASC', 'MGC', 'PAL-Inh', 'PT-L5',
       'MGE-Pvalb', 'VLMC', 'EC', 'VLMC-Pia', 'OLF', 'MSN-D2', 'L6b',
       'IT-L6', 'IT-L23', 'IT-L4', 'OLF-Exc', 'CLA', 'Foxp2', 'MSN-D1',
       'LSX-Inh', 'D1L-Fstl4', 'EP', 'D1L-PAL', 'Chd7'], dtype=object)

In [5]:
def _execute_step(notebook_name, study_dir, parameters):
    """Run one template notebook through papermill for a single study.

    Parameters
    ----------
    notebook_name : str
        File name of the template notebook. It is given to papermill as a
        relative path, and the executed copy is saved under the same name
        inside ``study_dir``.
    study_dir : pathlib.Path
        Directory of the study; also used as the working directory of the
        executed notebook.
    parameters : dict
        Values passed to the notebook as papermill parameters.
    """
    execute_notebook(
        str(notebook_name),
        str(study_dir / notebook_name),
        parameters=parameters,
        engine_name=None,
        prepare_only=False,
        kernel_name=None,
        progress_bar=True,
        log_output=False,
        start_timeout=60,
        report_mode=False,
        cwd=str(study_dir))


def runner(study_name, regions):
    """Run the three-step notebook pipeline for one study.

    The cells of the study are selected from the global ``total_cell_meta``
    table, written to ``CellMetadata.msg`` in the study directory, and the
    notebooks ``0-CellBasicQC.ipynb``, ``1-PrepareAdata.ipynb`` and
    ``2-DimensionReduction.ipynb`` are executed in that order. Afterwards the
    ``Adata`` directory inside the study directory is deleted.

    If the study directory already exists, the study is skipped entirely.
    This also applies to a directory left behind by a run that failed part
    way through; delete the directory to rerun such a study.

    Parameters
    ----------
    study_name : str
        Name of the study; used as the directory name and passed to step 0.
    regions : list of str
        Region codes. Cells are kept when their ``Region`` value is in this
        list, and dataset entries are used when the part of their file name
        before the first '-' is in this list.

    Returns
    -------
    None
    """
    study_dir = pathlib.Path(
        f'/home/hanliu/project/mouse_rostral_brain/study/BrainStructures/{study_name}')
    if study_dir.exists():
        print(study_name, 'exists')
        return

    # parents=True so that a missing parent directory is created as well.
    study_dir.mkdir(parents=True, exist_ok=True)

    # Select cells: in the requested regions and not labelled as outliers.
    # na=False treats a missing MajorType as "not an outlier".
    in_regions = total_cell_meta['Region'].isin(regions)
    is_outlier = total_cell_meta['MajorType'].str.contains(
        'Outlier', regex=False, na=False)
    select_cell_meta = total_cell_meta[in_regions & ~is_outlier]
    print('Cells:', select_cell_meta.shape[0])
    cell_meta_path = study_dir / 'CellMetadata.msg'
    select_cell_meta.to_msgpack(cell_meta_path)

    # 0-CellBasicQC.ipynb
    step_0_params = dict(
        study_name=study_name,
        cell_metadata_path=str(cell_meta_path),
    )
    _execute_step('0-CellBasicQC.ipynb', study_dir, step_0_params)

    # 1-PrepareAdata.ipynb
    # Sorted so the input order does not depend on the file system.
    mcds_path_list = [
        str(path)
        for path in sorted(dataset_dir.glob('*mcds'))
        if path.name.split('-')[0] in regions
    ]
    step_1_params = dict(
        # parameters cell
        in_memory=True,
        dask_distribute=False,

        # selected cell metadata path
        # (presumably written by the step 0 notebook -- confirm)
        cell_meta_path=str(study_dir / 'CellMetadata.AfterQC.msg'),

        # mcds_path
        mcds_path_list=mcds_path_list,
        clustering_feature='chrom100k',  # usually 100kb chromosome bins or genes

        # remove bad features
        black_list_region=None,
        exclude_chromosome=['chrY', 'chrM'],

        # preprocess parameters
        min_feature_cov=500,
        max_feature_cov=3000,
        ch_hvf_top=3000,
        min_ch_hvf_mean=0.5,
        max_ch_hvf_mean=2.5,
        cg_hvf_top=3000,
        min_cg_hvf_mean=0.5,
        max_cg_hvf_mean=1.2,

        # Passed as a notebook parameter. It used to be given to
        # execute_notebook() directly, where papermill treats unknown
        # keywords as engine options and never hands them to the notebook.
        generate_gene_rate=False)
    _execute_step('1-PrepareAdata.ipynb', study_dir, step_1_params)

    # 2-DimensionReduction.ipynb
    step_2_params = dict(
        ch_pc_components=20,
        cg_pc_components=10,
        pc_color_col=None,
        batch_correction_col=None,
        sigma=15,
        alpha=0.1,
        scanorama_k=25,
        n_neighbors=25,
        resolution=0.8,
        perplexity=50,
        metadata_category_cols=['Region'],
        metadata_continue_cols=['CG_RateAdj', 'CH_RateAdj', 'FinalReads'])
    _execute_step('2-DimensionReduction.ipynb', study_dir, step_2_params)

    # Delete the study's Adata directory once all steps have finished.
    subprocess.run(['rm', '-rf', str(study_dir / 'Adata')],
                   check=True)

In [None]:
# Process every study in the order given by study_design, printing the study
# name and its region list (one per line) before running it.
for name, region_codes in study_design.items():
    print(name, region_codes, sep='\n')
    runner(name, region_codes)

CNU
['3F', '4D', '4E', '4G', '4H', '5E', '5F', '5H', '5J']
CNU exists
HPF
['8E', '8J', '9J', '9H', '10E', '10F', '11E', '11F']
HPF exists
Isocortex
['1A', '1B', '2A', '2B', '2C', '3A', '3B', '3C', '3D', '4A', '4B', '4C', '5A', '5B', '5C', '5D', '6A', '6B', '6C', '7B', '8B']
Isocortex exists
OLF
['1C', '2D', '2E', '3E', '4F', '5G', '6D']
Cells: 17655


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  from ipykernel import kernelapp as app


HBox(children=(IntProgress(value=0, max=28), HTML(value='')))




HBox(children=(IntProgress(value=0, max=44), HTML(value='')))