<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prepare" data-toc-modified-id="Prepare-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prepare</a></span><ul class="toc-item"><li><span><a href="#Create-Study-Dir" data-toc-modified-id="Create-Study-Dir-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Create Study Dir</a></span></li><li><span><a href="#Select-Cells" data-toc-modified-id="Select-Cells-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Select Cells</a></span></li></ul></li><li><span><a href="#Step-0-Cell-Basic-QC" data-toc-modified-id="Step-0-Cell-Basic-QC-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Step 0 Cell Basic QC</a></span></li><li><span><a href="#Step-1-Prepare-Adata" data-toc-modified-id="Step-1-Prepare-Adata-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Step 1 Prepare Adata</a></span></li><li><span><a href="#Step-2-Dimension-Reduction" data-toc-modified-id="Step-2-Dimension-Reduction-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Step 2 Dimension Reduction</a></span></li><li><span><a href="#Step-3-Consensus-Clustering" data-toc-modified-id="Step-3-Consensus-Clustering-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Step 3 Consensus Clustering</a></span></li><li><span><a href="#Step-4-Marker-Identification" data-toc-modified-id="Step-4-Marker-Identification-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Step 4 Marker Identification</a></span></li><li><span><a href="#Step-5-Cluster-Manual-Annotation" data-toc-modified-id="Step-5-Cluster-Manual-Annotation-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Step 5 Cluster Manual Annotation</a></span></li></ul></div>

In [1]:
from papermill import execute_notebook, PapermillExecutionError
import pandas as pd
import pathlib

In [2]:
# Cluster label whose cells are selected for this study.
cluster_name = "L5-IT-Deptor"

# Metadata column that `cluster_name` is matched against when selecting cells.
cluster_col_name = "consensus_cluster_rescued_anno"

# Region codes; only dataset files whose name prefix is in this list are used.
regions = [
    "2C",
    "3C",
    "4B",
    "5D",
]

In [3]:
# Every location below lives under the same project tree; spell the prefix once.
_project_root = '/home/hanliu/project/mouse_rostral_brain'

# Input: metadata table covering all cells (msgpack file).
total_cell_meta = f'{_project_root}/study/MOp/ALL/cell_tidy_data.final_annotation.msg'
# Directory holding the template notebooks that papermill executes.
template_dir = f'{_project_root}/study/ClusteringRecipe/'
# Output directory for this cluster's study (one directory per cluster name).
study_dir = f'{_project_root}/study/MOp/{cluster_name}'
# Directory that is searched for the *mcds dataset files.
dataset_dir = f'{_project_root}/dataset/'

## Prepare

### Create Study Dir

In [4]:
# Load the full cell metadata table. `total_cell_meta` starts out as a path
# string and is rebound to the loaded DataFrame here; the isinstance guard
# makes the cell safe to re-run (a second run would otherwise hand a DataFrame
# to read_msgpack and fail).
if not isinstance(total_cell_meta, pd.DataFrame):
    total_cell_meta = pd.read_msgpack(total_cell_meta)

# Convert the configured locations to Path objects (a no-op on re-run).
study_dir = pathlib.Path(study_dir)
# parents=True: also create missing intermediate directories instead of
# raising FileNotFoundError when the parent does not exist yet.
study_dir.mkdir(parents=True, exist_ok=True)
dataset_dir = pathlib.Path(dataset_dir)
template_dir = pathlib.Path(template_dir)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Display the column labels of the loaded cell metadata table.
total_cell_meta.columns

Index(['AllcPath', 'CCC_Rate', 'CG_Rate', 'CG_RateAdj', 'CH_Rate',
       'CH_RateAdj', 'FinalReads', 'InputReads', 'MappedReads', 'Region',
       'index_name', 'uid', 'BamFilteringRate', 'MappingRate', 'Pos96',
       'Plate', 'Col96', 'Row96', 'Col384', 'Row384', 'FACS_Date', 'Slice',
       'PassFilter', 'leiden', 'pca_0', 'pca_1', 'umap_0', 'umap_1', 'tsne_0',
       'tsne_1', 'consensus_cluster', 'consensus_cluster_rescued',
       'consensus_cluster_rescued_anno'],
      dtype='object')

In [6]:
# Columns carried into the sub-cluster study. Compared with the full table this
# drops the previous clustering / embedding columns (leiden, pca_*, umap_*,
# tsne_*, consensus_cluster, consensus_cluster_rescued).
keep_cols = [
    'AllcPath', 'CCC_Rate', 'CG_Rate', 'CG_RateAdj', 'CH_Rate', 'CH_RateAdj',
    'FinalReads', 'InputReads', 'MappedReads', 'Region', 'index_name', 'uid',
    'BamFilteringRate', 'MappingRate', 'Pos96', 'Plate', 'Col96', 'Row96',
    'Col384', 'Row384', 'FACS_Date', 'Slice', 'PassFilter', 'consensus_cluster_rescued_anno'
]

# The cell selection filters on `cluster_col_name`; make sure that column
# survives the subset even when the configuration points at a column other
# than the hard-coded annotation column above.
if cluster_col_name not in keep_cols:
    keep_cols.append(cluster_col_name)

total_cell_meta = total_cell_meta.loc[:, keep_cols]

In [7]:
# Tally how many rows carry each value of the annotation column.
total_cell_meta['consensus_cluster_rescued_anno'].value_counts()

L23-IT-Cux2     2057
L6-CT-Foxp2     1622
L4-IT-Rorb      1582
L5-IT-Deptor     898
L6-IT-Sulf1      881
CGE-VipNdnf      625
MGE-Sst          471
MGE-Pvalb        460
NonN             428
L5-PT-Bcl6       354
L6-NP-Tshz2      275
L6b-Galnt10      141
Outlier           82
Name: consensus_cluster_rescued_anno, dtype: int64

### Select Cells

In [8]:
# Keep only the rows whose value in `cluster_col_name` equals `cluster_name`.
select_cell_meta = total_cell_meta[total_cell_meta[cluster_col_name] == cluster_name]

# Fail fast on an empty selection (e.g. a typo in `cluster_name`) instead of
# writing an empty metadata file and failing later inside a template notebook.
if select_cell_meta.empty:
    raise ValueError(
        f'No cells found with {cluster_col_name} == {cluster_name!r}; '
        'check cluster_name and cluster_col_name.')

# Write the selection into the study directory; this path is what step 0
# receives as `cell_metadata_path`.
cell_meta_path = study_dir / 'CellMetadata.msg'
select_cell_meta.to_msgpack(cell_meta_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  after removing the cwd from sys.path.


In [9]:
# Display the path the selected-cell metadata was written to.
cell_meta_path

PosixPath('/home/hanliu/project/mouse_rostral_brain/study/MOp/L5-IT-Deptor/CellMetadata.msg')

## Step 0 Cell Basic QC

In [10]:
# Parameters injected into 0-CellBasicQC.ipynb
step_0_params = {
    # study label derived from the selected cluster name
    'study_name': f'MOp-{cluster_name}',
    # metadata of the selected cells, passed as a plain string path
    'cell_metadata_path': str(cell_meta_path),
}

In [11]:
# Run the step 0 template with `step_0_params`; the executed copy is written
# to the study directory under the same file name.
input_path = template_dir.joinpath('0-CellBasicQC.ipynb')
output_path = study_dir.joinpath('0-CellBasicQC.ipynb')
execute_notebook(
    str(input_path),
    str(output_path),
    parameters=step_0_params,
    # working directory used while the notebook executes
    cwd=str(study_dir),
    # None: leave engine and kernel choice to papermill
    engine_name=None,
    kernel_name=None,
    # execute the notebook, not just parameterize it
    prepare_only=False,
    report_mode=False,
    # progress / logging / start-up behaviour
    progress_bar=True,
    log_output=False,
    start_timeout=60,
)
pass  # trailing statement so the call's return value is not shown as output

HBox(children=(IntProgress(value=0, max=28), HTML(value='')))




## Step 1 Prepare Adata

In [12]:
# Parameters injected into 1-PrepareAdata.ipynb
step_1_params = dict(
    # parameters cell
    in_memory=True,
    dask_distribute=False,

    # selected cell metadata path
    # (presumably written by the step 0 notebook into study_dir - confirm)
    cell_meta_path=str(study_dir / 'CellMetadata.AfterQC.msg'),

    # mcds_path: every '*mcds' entry directly under dataset_dir whose file name
    # prefix (text before the first '-') is one of `regions`.
    # Sorted because glob yields entries in filesystem-dependent order; sorting
    # keeps the list identical between runs and machines.
    mcds_path_list=sorted(
        str(i) for i in dataset_dir.glob('*mcds') if i.name.split('-')[0] in regions
    ),
    clustering_feature='chrom100k',  # usually 100kb chromosome bins or genes

    # remove bad features
    black_list_region=None,
    exclude_chromosome=['chrY', 'chrM'],

    # preprocess parameters
    min_feature_cov=500,
    max_feature_cov=3000,
    ch_hvf_top=3000,
    min_ch_hvf_mean=0.5,
    max_ch_hvf_mean=2.5,
    cg_hvf_top=3000,
    min_cg_hvf_mean=0.5,
    max_cg_hvf_mean=1.2)

In [13]:
# Run the step 1 template with `step_1_params`; the executed copy is written
# to the study directory under the same file name.
input_path = template_dir.joinpath('1-PrepareAdata.ipynb')
output_path = study_dir.joinpath('1-PrepareAdata.ipynb')
execute_notebook(
    str(input_path),
    str(output_path),
    parameters=step_1_params,
    # working directory used while the notebook executes
    cwd=str(study_dir),
    # None: leave engine and kernel choice to papermill
    engine_name=None,
    kernel_name=None,
    # execute the notebook, not just parameterize it
    prepare_only=False,
    report_mode=False,
    # progress / logging / start-up behaviour
    progress_bar=True,
    log_output=False,
    start_timeout=60,
)
pass  # trailing statement so the call's return value is not shown as output

HBox(children=(IntProgress(value=0, max=41), HTML(value='')))




## Step 2 Dimension Reduction

In [14]:
# Parameters injected into 2-DimensionReduction.ipynb
step_2_params = {
    # number of principal components requested for the ch and cg parts
    'ch_pc_components': 30,
    'cg_pc_components': 15,
    'pc_color_col': None,
    # no batch-correction column is set for this study
    'batch_correction_col': None,
    'sigma': 15,
    'alpha': 0.1,
    'scanorama_k': 25,
    'n_neighbors': 25,
    'resolution': 0.8,
    'perplexity': 50,
    # metadata columns handed over as categorical / continuous
    'metadata_category_cols': ['Region'],
    'metadata_continue_cols': ['CG_RateAdj', 'CH_RateAdj', 'FinalReads'],
}

In [15]:
# Run the step 2 template with `step_2_params`; the executed copy is written
# to the study directory under the same file name.
input_path = template_dir.joinpath('2-DimensionReduction.ipynb')
output_path = study_dir.joinpath('2-DimensionReduction.ipynb')
execute_notebook(
    str(input_path),
    str(output_path),
    parameters=step_2_params,
    # working directory used while the notebook executes
    cwd=str(study_dir),
    # None: leave engine and kernel choice to papermill
    engine_name=None,
    kernel_name=None,
    # execute the notebook, not just parameterize it
    prepare_only=False,
    report_mode=False,
    # progress / logging / start-up behaviour
    progress_bar=True,
    log_output=False,
    start_timeout=60,
)
pass  # trailing statement so the call's return value is not shown as output

HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




## Step 3 Consensus Clustering

In [16]:
# Parameters injected into 3-ConsensusClustering.ipynb
step_3_params = {
    'coord_base': 'umap',
    'n_jobs': 40,

    # --- cc.fit_predict ---
    'n_neighbors': 25,
    'metric': 'euclidean',
    'neighbor_kwds': None,
    'leiden_repeats': 200,
    'seed': 1,
    'leiden_resolution': 0.5,
    'leiden_kwds': None,
    'min_cluster_size': 10,
    'min_cluster_portion': 0.01,
    'min_samples': 1,
    'epsilon': 'auto',
    'hdbscan_kwds': None,

    # --- cc.supervise_training ---
    'x': None,
    'test_portion': 0.1,
    'n_estimators': 500,
    'n_splits': 10,
    'fbeta': 1,
    'average': 'weighted',
    'outlier_proba_cutoff': 0.8,
    'confusion_merge_cutoff': 0.2,
}

In [None]:
# Run the step 3 template with `step_3_params`; the executed copy is written
# to the study directory under the same file name.
input_path = template_dir.joinpath('3-ConsensusClustering.ipynb')
output_path = study_dir.joinpath('3-ConsensusClustering.ipynb')
execute_notebook(
    str(input_path),
    str(output_path),
    parameters=step_3_params,
    # working directory used while the notebook executes
    cwd=str(study_dir),
    # None: leave engine and kernel choice to papermill
    engine_name=None,
    kernel_name=None,
    # execute the notebook, not just parameterize it
    prepare_only=False,
    report_mode=False,
    # progress / logging / start-up behaviour
    progress_bar=True,
    log_output=False,
    start_timeout=60,
)
pass  # trailing statement so the call's return value is not shown as output

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))

## Step 4 Marker Identification

In [None]:
# Parameters injected into 4-MarkerIdentification.ipynb
step_4_params = {
    'mc_type': 'CHN',
    'load': True,

    # --- find marker gene ---
    'n_marker_genes': 1000,
    'gene_score_cutoff': 5,
    'abs_log_fold_change': 1,
    'n_genes': 10,
    'nrows': 2,
    'coord_base': 'umap',
}

In [None]:
# Run the step 4 template with `step_4_params`; the executed copy is written
# to the study directory under the same file name.
input_path = template_dir.joinpath('4-MarkerIdentification.ipynb')
output_path = study_dir.joinpath('4-MarkerIdentification.ipynb')
execute_notebook(
    str(input_path),
    str(output_path),
    parameters=step_4_params,
    # working directory used while the notebook executes
    cwd=str(study_dir),
    # None: leave engine and kernel choice to papermill
    engine_name=None,
    kernel_name=None,
    # execute the notebook, not just parameterize it
    prepare_only=False,
    report_mode=False,
    # progress / logging / start-up behaviour
    progress_bar=True,
    log_output=False,
    start_timeout=60,
)
pass  # trailing statement so the call's return value is not shown as output

## Step 5 Cluster Manual Annotation
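- Prepare only: the annotation notebook is parameterized and written to the study directory, but it is not executed here (`prepare_only=True`).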

In [None]:
# Parameters injected into 5-ClusterManualAnnotation.ipynb
step_5_params = {
    'load': False,
    'mc_type': 'CHN',
    'coord_base': 'umap',
}

In [None]:
# Parameterize the step 5 template with `step_5_params` and write it to the
# study directory under the same file name. prepare_only=True: the notebook is
# prepared but not executed.
input_path = template_dir.joinpath('5-ClusterManualAnnotation.ipynb')
output_path = study_dir.joinpath('5-ClusterManualAnnotation.ipynb')
execute_notebook(
    str(input_path),
    str(output_path),
    parameters=step_5_params,
    # working directory handed to papermill
    cwd=str(study_dir),
    # None: leave engine and kernel choice to papermill
    engine_name=None,
    kernel_name=None,
    # only prepare the notebook; do not run it
    prepare_only=True,
    report_mode=False,
    # progress / logging / start-up behaviour
    progress_bar=True,
    log_output=False,
    start_timeout=60,
)
pass  # trailing statement so the call's return value is not shown as output

In [None]:
%%javascript
// Ask the notebook front end to save this notebook, so the copy made in the
// next cell contains the latest state.
// NOTE(review): `IPython.notebook` looks like the classic Notebook front-end
// object; presumably not available in JupyterLab - confirm.
IPython.notebook.save_notebook()

In [None]:
import subprocess

# Archive this driver notebook next to the study outputs. The source path is
# relative, so it is resolved against the current working directory.
# check=True: raise CalledProcessError when `cp` fails (e.g. the notebook was
# renamed or the study directory is missing) instead of silently ignoring it.
subprocess.run(
    ['cp', 'papermill_sub_cluster_study.ipynb', str(study_dir / 'papermill_sub_cluster_study.ipynb')],
    check=True)