In [3]:
import pandas as pd
import pathlib
from papermill import execute_notebook

## Parameters

In [4]:
# Inputs: the per-cell table (msgpack) that is filtered below, and the RNA
# .h5ad file that is passed as `adata_path` to the RNA marker-selection notebook.
cell_tidy_data_path = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
rna_adata_path = '/home/hanliu/project/Linnarson_Mouse_Brain/raw/OLF.CPM.log1p.for_integration.h5ad'

# mc: values kept when filtering the 'CellClass' and 'MajorRegion' columns
# of the per-cell table.
cell_classes = ['Exc', 'Inh']
major_regions = ['OLF']

# rna: no RNA-specific parameters are defined here (placeholder statement).
pass

# output: directory that receives the filtered table and the prepared notebooks.
output_dir = '/home/hanliu/project/mouse_rostral_brain/study/IntegrationWithRNA/OLF'

In [5]:
# Normalise the output location to a Path and make sure it exists.
# parents=True also creates missing intermediate directories, so the cell does
# not raise FileNotFoundError when the parent directory is absent; exist_ok=True
# keeps re-runs harmless.
output_dir = pathlib.Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

# Directory containing the template notebooks. An empty string yields
# Path('.'), i.e. templates are looked up relative to the current working
# directory.
template_dir = pathlib.Path('')

## Make Study

In [6]:
# Load the per-cell table from `cell_tidy_data_path`.
# NOTE(review): the warning captured below suggests the pandas msgpack reader is
# deprecated — confirm the pinned pandas version still provides read_msgpack.
cell_tidy_data = pd.read_msgpack(cell_tidy_data_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Build one boolean mask per criterion, then keep only the rows that satisfy
# both the requested cell classes and the requested major regions.
in_selected_class = cell_tidy_data['CellClass'].isin(cell_classes)
in_selected_region = cell_tidy_data['MajorRegion'].isin(major_regions)
judge = in_selected_class & in_selected_region
cell_tidy_data = cell_tidy_data[judge]

# Persist the subset under the file name that runner() hands to the first
# template notebook as `cell_tidy_data_path`.
cell_tidy_data.to_msgpack(output_dir / 'cell_tidy_data.msg')

# Display the number of rows that survived the filter.
cell_tidy_data.shape[0]

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  after removing the cwd from sys.path.


16184

In [8]:
def _prepare_notebook(notebook_name, params):
    """Write a parameterized copy of one template notebook into ``output_dir``.

    The template is read from ``template_dir / notebook_name`` and the copy is
    written to ``output_dir / notebook_name``. ``prepare_only=True`` is passed
    to papermill, so the notebook is only parameterized and saved here, not
    executed.

    Parameters
    ----------
    notebook_name : str
        File name of the template notebook (identical for input and output).
    params : dict
        Parameter values forwarded to papermill as ``parameters``.
    """
    execute_notebook(str(template_dir / notebook_name),
                     str(output_dir / notebook_name),
                     parameters=params,
                     engine_name=None,
                     prepare_only=True,
                     kernel_name=None,
                     progress_bar=True,
                     log_output=False,
                     start_timeout=60,
                     report_mode=False,
                     cwd=str(output_dir))


def runner():
    """Prepare every notebook of the integration study in ``output_dir``.

    Builds the parameter set of each template notebook and writes the
    parameterized copies in a fixed order: mC feature-matrix preparation, mC
    marker selection, RNA marker selection, integration, label transfer.

    Relies on the notebook-level names ``template_dir``, ``output_dir`` and
    ``rna_adata_path``. Returns ``None``.
    """
    # Sorted so the list handed to the template does not depend on the
    # arbitrary order in which the filesystem yields glob matches.
    mcds_paths = sorted(
        str(path) for path in pathlib.Path(
            '/home/hanliu/project/mouse_rostral_brain/dataset/').glob('*mcds'))

    # prepare mC
    mc_feature_params = dict(
        # Relative path: it matches the file written into output_dir, which is
        # also passed to papermill as the working directory (cwd).
        cell_tidy_data_path='cell_tidy_data.msg',
        cluster_col='SubType',
        clustering_feature='gene',
        dask_distribute=True,
        in_memory=False,
        mcds_path_list=mcds_paths,
        exclude_chromosome=['chrM', 'chrY'],
        black_list_path=
        '/home/hanliu/project/mouse_rostral_brain/misc/mm10-blacklist.v2.bed.gz',
        min_feature_cov=30,
        max_feature_cov=8000,
        mc_type='CHN',
        filter_by_ncbi=True,
        ncbi_path='/home/hanliu/ref/ncbi/gene2ensembl.mouse.tsv.gz')

    mc_marker_params = dict(cluster_col='SubType',
                            min_cluster_cell_number=10,
                            exclude_str=['Outlier'],
                            adj_p_cutoff=1e-3,
                            log2fc_cutoff=1,
                            top_n=20,
                            cpu=10)

    rna_marker_params = dict(adata_path=rna_adata_path,
                             cluster_col='SubType',
                             cpu=10,
                             top_n=25,
                             adj_p_cutoff=1e-3,
                             log2fc_cutoff=0.7,
                             min_cluster_cell_number=10)

    integration_params = dict(
        # scanorama
        scanorama_dim=30,
        sigma=100,
        alpha=0,
        knn=20,

        # clustering
        k=30,
        n_pcs=30,
        n_jobs=40)

    label_transfer_params = dict(n_estimators=200,
                                 random_seed=0,
                                 n_splits=10,
                                 n_jobs=40,
                                 cluster_col='SubType')

    # (template file name, parameters), in the order the copies are written.
    steps = [
        ('mc.prepare_cell_feature_matrix.ipynb', mc_feature_params),
        ('mc.pairwise_marker_selection.ipynb', mc_marker_params),
        ('rna.linnarson.pairwise_marker_selection.ipynb', rna_marker_params),
        ('Integration.ipynb', integration_params),
        ('LabelTransfer.ipynb', label_transfer_params),
    ]
    for notebook_name, params in steps:
        _prepare_notebook(notebook_name, params)

In [9]:
# Write the parameterized copies of all template notebooks into output_dir.
runner()