In [2]:
from papermill import execute_notebook
import pandas as pd
import pathlib
import anndata

## Parameters

In [3]:
# --- Input / output locations -------------------------------------------------
# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running elsewhere.
cell_tidy_data_path = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
template_dir_path = '/home/hanliu/project/mouse_rostral_brain/study/FinalPredictionModelRecipe/'
output_dir = '/home/hanliu/project/mouse_rostral_brain/study/FinalPredictionModelRecipe/NeuronPrediction/'

# Columns of the tidy table that hold the cluster assignments.
cluster_levels = ['CellClass', 'MajorType', 'SubType']
# Clusters with more cells than this are randomly subsampled to this size.
downsample_large_cluster_to = 5000

# --- Feature preparation ------------------------------------------------------
# Path.glob yields entries in arbitrary order, so sort to keep the list (and
# therefore the parameters handed to the template notebooks) reproducible.
mcds_path_list = sorted(
    str(path)
    for path in pathlib.Path(
        '/home/hanliu/project/mouse_rostral_brain/dataset/').glob('*mcds'))

exclude_chromosome = ['chrM', 'chrY']
# Cells whose label (at any cluster level) contains one of these substrings
# are dropped before training.
exclude_cluster_str = ['Outlier', 'Exc', 'Inh']
black_list_path = '/home/hanliu/project/mouse_rostral_brain/misc/mm10-blacklist.v2.bed.gz'
clustering_feature = 'chrom100k'
min_feature_cov = 30
max_feature_cov = 300
mc_type = 'CGN'

# --- Pairwise marker identification -------------------------------------------
min_cluster_cell_number = 10
adj_p_cutoff = 1e-3
log2fc_cutoff = 0.5
top_n = 25
marker_cpu = 10  # memory intensive but fast, don't use too much

# --- RFECV and final model ----------------------------------------------------
test_portion = 0.1
random_seed = 0
n_estimators = 100
n_estimators_final = 500
training_cpu = 30
cv_splits = 5
cluster_order_path = None

In [4]:
# Turn the string parameters into Path objects so later cells can use the
# `/` operator, then make sure the output directory is present.
template_dir, output_dir = (pathlib.Path(location)
                            for location in (template_dir_path, output_dir))
output_dir.mkdir(exist_ok=True)

## Load tidy data

In [5]:
# Read the per-cell table from the msgpack file named in the parameters cell.
# pandas emits a warning here recommending pyarrow (see the cell output below).
cell_tidy_data = pd.read_msgpack(cell_tidy_data_path)
# Independent copy holding only the cluster-assignment columns listed in
# `cluster_levels`; used below for filtering and per-cluster downsampling.
cluster_table = cell_tidy_data[cluster_levels].copy()

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# Flag every label that contains at least one of the excluded substrings.
excluded_hits = cluster_table.applymap(
    lambda label: any(tag in label for tag in exclude_cluster_str))
# A cell is kept only when none of its cluster levels was flagged.
use_cells = excluded_hits.sum(axis=1) == 0

# Apply the same row mask to both tables so they stay in step.
cluster_table = cluster_table[use_cells].copy()
cell_tidy_data = cell_tidy_data[use_cells].copy()

## Train one level

In [12]:
def _run_template(notebook_name, params, output_dir, prepare_only=True):
    """Hand one template notebook to papermill.

    Parameters
    ----------
    notebook_name : str
        File name of the template inside ``template_dir``; the same file name
        is used for the notebook written into ``output_dir``.
    params : dict
        Values passed to papermill as the notebook parameters.
    output_dir : pathlib.Path
        Destination directory, also used as the working directory (``cwd``).
    prepare_only : bool
        Forwarded unchanged to ``execute_notebook`` (papermill's flag that
        decides whether the notebook is actually executed).
    """
    execute_notebook(str(template_dir / notebook_name),
                     str(output_dir / notebook_name),
                     parameters=params,
                     engine_name=None,
                     prepare_only=prepare_only,
                     kernel_name=None,
                     progress_bar=True,
                     log_output=False,
                     start_timeout=60,
                     report_mode=False,
                     cwd=str(output_dir))


def one_level_training(cell_tidy_data_path,
                       cluster_col,
                       output_dir,
                       prepare_only=True):
    """Run the three-notebook training recipe for one cluster level.

    The templates ``prepare_cell_feature_matrix.ipynb``,
    ``pairwise_marker_selection.ipynb`` and ``RFECV.ipynb`` are taken from the
    notebook-level ``template_dir`` and parameterized from the notebook-level
    settings (feature, marker and model parameters).

    Parameters
    ----------
    cell_tidy_data_path : str
        Path of the tidy cell table passed to the feature-matrix notebook.
    cluster_col : str
        Name of the cluster column to train on.
    output_dir : str or pathlib.Path
        Directory that receives the parameterized notebooks; it is also their
        working directory.
    prepare_only : bool, default True
        Forwarded to papermill for all three notebooks. The default keeps the
        previous hard-coded behaviour.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``Markers/cluster_markers.h5ad`` is not present under
        ``output_dir`` after the marker-selection step.
    """
    # Accept plain strings as well as Path objects.
    output_dir = pathlib.Path(output_dir)

    # Step 1: cell-by-feature matrix.
    _run_template('prepare_cell_feature_matrix.ipynb',
                  dict(cell_tidy_data_path=cell_tidy_data_path,
                       cluster_col=cluster_col,
                       dask_distribute=True,
                       in_memory=False,
                       mcds_path_list=mcds_path_list,
                       exclude_chromosome=exclude_chromosome,
                       black_list_path=black_list_path,
                       clustering_feature=clustering_feature,
                       min_feature_cov=min_feature_cov,
                       max_feature_cov=max_feature_cov,
                       mc_type=mc_type),
                  output_dir,
                  prepare_only=prepare_only)

    # Step 2: pairwise marker selection.
    _run_template('pairwise_marker_selection.ipynb',
                  dict(cluster_col=cluster_col,
                       min_cluster_cell_number=min_cluster_cell_number,
                       exclude_str=exclude_cluster_str,
                       adj_p_cutoff=adj_p_cutoff,
                       log2fc_cutoff=log2fc_cutoff,
                       top_n=top_n,
                       cpu=marker_cpu),
                  output_dir,
                  prepare_only=prepare_only)

    # The marker table is presumably written by the marker-selection notebook
    # when it is executed (TODO confirm). Fail with an explicit message when it
    # is absent instead of surfacing an opaque reader error.
    marker_path = output_dir / 'Markers/cluster_markers.h5ad'
    if not marker_path.exists():
        raise FileNotFoundError(
            f'{marker_path} does not exist; with prepare_only={prepare_only} '
            'the marker selection notebook may not have been executed yet.')

    adata = anndata.read_h5ad(marker_path)
    if adata.shape[1] == 0:
        # No marker columns: write the finish flag so the guarded call in the
        # last cell skips this level on re-run, and stop before RFECV.
        with open(output_dir / 'finish_flag', 'w') as f:
            f.write('OMG')
        print('No marker found, skip model training')
        return

    # Step 3: RFECV feature selection and final model.
    _run_template('RFECV.ipynb',
                  dict(cluster_col=cluster_col,
                       test_portion=test_portion,
                       random_seed=random_seed,
                       n_estimators=n_estimators,
                       n_estimators_final=n_estimators_final,
                       n_jobs=training_cpu,
                       n_splits=cv_splits,
                       cluster_order_path=cluster_order_path),
                  output_dir,
                  prepare_only=prepare_only)
    return

## First level

In [13]:
def downsample(sub_df, max_cells=None, seed=None):
    """Return the row labels of ``sub_df``, randomly subsampled if too large.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Rows belonging to one cluster (e.g. one group of a ``groupby``).
    max_cells : int, optional
        Largest number of rows to keep. When omitted, the notebook-level
        ``downsample_large_cluster_to`` is used.
    seed : int, optional
        ``random_state`` passed to ``DataFrame.sample``. When omitted, the
        notebook-level ``random_seed`` is used.

    Returns
    -------
    pandas.Series
        The index labels of the retained rows. Frames with at most
        ``max_cells`` rows are returned in full and in their original order.
    """
    # Fall back to the notebook-level settings so existing one-argument calls
    # (such as ``groupby(...).apply(downsample)``) behave exactly as before.
    if max_cells is None:
        max_cells = downsample_large_cluster_to
    if seed is None:
        seed = random_seed

    if sub_df.shape[0] > max_cells:
        sub_df = sub_df.sample(max_cells, random_state=seed)
    return pd.Series(sub_df.index)

In [14]:
cluster_col = 'SubType'

# first level
# Collect the (possibly subsampled) cell ids of every cluster and flatten
# them into a single index.
sampled_ids = cluster_table.groupby(cluster_col).apply(downsample)
total_cells = pd.Index(sampled_ids.values.flat)

this_output_dir = output_dir / 'NonNeuronPrediction'
this_output_dir.mkdir(exist_ok=True)

# Save the selected cells next to the notebooks and point the path variable
# at that file so the training step reads the subset.
subset_path = this_output_dir / 'cell_tidy_data.msg'
cell_tidy_data.reindex(total_cells).to_msgpack(subset_path)

cell_tidy_data_path = str(subset_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  if __name__ == '__main__':


In [15]:
# Skip the level when its finish flag is already present in the output folder.
already_finished = (this_output_dir / 'finish_flag').exists()
if not already_finished:
    one_level_training(cell_tidy_data_path=cell_tidy_data_path,
                       cluster_col=cluster_col,
                       output_dir=this_output_dir)