In [None]:
from papermill import execute_notebook, PapermillExecutionError
import pandas as pd
from pathlib import Path
import numpy as np

In [None]:
# Template notebooks that make up the pipeline, listed in execution order.
# NOTE(review): the file names (spelling included) presumably must match the
# files inside `template_dirn` exactly -- confirm before renaming any of them.
step_notebooks = [
    '01-prepare-anndata.ipynb',
    '02-precluster-for-enriched-features.ipynb',
    '03-dimension-reduction.ipynb',
    '04-concensus-clustering.ipynb',
    '05-marker-identification.ipynb',
    '06-cluster-annotatoin.ipynb',
]
# Per-step switches, matched to step_notebooks by position.
# prepare_only is forwarded to papermill's execute_notebook(prepare_only=...);
# a True entry in step_skip bypasses that notebook entirely.
step_prepare_only = [False, False, False, False, False, False]
step_skip = [False, False, False, False, False, False]

# Cell selection: {annotation column: iterable of accepted values}.
# Every entry must match for a cell to be kept; an empty dict keeps all cells.
selection_dict = {}

# Batch correction settings (forwarded to step 3).
correct_batch_col = 'Donor'
correct_method = 'harmony'

# crucial parameters
# These are plain scalars on purpose: a trailing comma after the value would
# turn each one into a 1-tuple, and that tuple is what the step notebooks
# would then receive.
clustering_name = 'L1'
# this parameter is the final target that limits the total number of clusters
# Higher accuracy means more conservative clustering results and fewer clusters
target_accuracy = 0.96
min_cluster_size = 20
n_neighbors = 25
leiden_resolution = 1

# Forwarded to step 6 as `mc_type`.
mc_type = 'CHN'

# Marker thresholds (forwarded to step 5).
top_n_markers = 200
auroc_cutoff = 0.7
adj_p_cutoff = 0.01
fc_cutoff = 0.8

# Forwarded to steps 1, 5 and 6 as `mcds_path_list`; fill in before running.
mcds_path_list = []

# Pickled pandas tables: per-cell metadata (needs a 'PassFilter' column) and
# the cell annotation table that `selection_dict` filters on.
total_cell_meta_path = '~/test/cell_ori_meta.pdpkl'
annotation_table_path = '~/cell_annotation_meta.pdpkl'

# study_dirn: output directory, also used as the working directory of every
# executed notebook. template_dirn: directory holding the template notebooks.
study_dirn = '~/clustering-level1'
template_dirn = '~/template/'

# Forwarded to step 6.
auto_annot_prefix = ''
old_annot_path = None

# Forwarded to step 4.
plot_merge_steps = True

In [None]:
# Parameters injected into the first notebook of step_notebooks.
step_1_params = {
    # relative path; the notebooks are executed with cwd set to the study dir
    'metadata_path': './CellMetadata.AfterQC.pdpkl',
    'mcds_path_list': mcds_path_list,
    # Dimension names used for clustering
    'obs_dim': 'cell',
    'var_dim': 'chrom100k',
    # coverage bounds
    'min_cov': 250,
    'max_cov': 3000,
    # blacklist region file and its associated fraction
    'black_list_path': '~/refs/human/hg38/blacklist/hg38-blacklist.v2.bed.gz',
    'black_list_f': 0.2,
    'exclude_chromosome': ['chrM', 'chrY'],
}


In [None]:
# Parameters injected into the second notebook of step_notebooks.
step_2_params = {
    'mch_adata_path': 'mCH.HVF.h5ad',
    'mcg_adata_path': 'mCG.HVF.h5ad',
    # Cluster Enriched Features analysis
    'top_n_enriched_features': 200,
    'alpha': 0.05,
    'stat_plot': True,
    # A pre-calculated cluster column may be supplied here.
    # With None, a basic clustering is performed using the parameters below.
    'cluster_col': None,
    # Only used when cluster_col is None
    'n_neighbors': n_neighbors,
    'leiden_resolution': leiden_resolution,
    'cluster_plot': True,
    'min_cluster_size': min_cluster_size,
}


In [None]:
# Parameters injected into the third notebook of step_notebooks.
step_3_params = {
    'metadata_path': './CellMetadata.AfterQC.pdpkl',
    # HVF mC Fraction AnnData Files
    'ch_adata_path': 'mCH.HVF.h5ad',
    'cg_adata_path': 'mCG.HVF.h5ad',
    # batch correction
    'correct_batch_col': correct_batch_col,
    'correct_method': correct_method,
    # feature type to use
    #   HVF: all highly variable features
    #   CEF: cluster enriched features
    'feature_type': 'CEF',
    'pre_cluster_name': 'leiden',
    # n_components: with 'auto', a Kolmogorov-Smirnov test compares adjacent
    # PCs and the cut is made when P > p_cutoff
    'n_components': 'auto',
    # ks test p value cutoff, only applies when n_components == 'auto'
    'p_cutoff': 0.1,
    # downsample large clusters
    'max_cell_prop': 0.05,
    'interactive_downsample': 2000,
    'interactive_plot': False,
    'min_cluster_size': min_cluster_size,
}

In [None]:
# Parameters injected into the fourth notebook of step_notebooks.
step_4_params = {
    'clustering_name': clustering_name,
    # target_accuracy is the final target that limits the total number of
    # clusters: higher accuracy means more conservative clustering results and
    # fewer clusters
    'target_accuracy': target_accuracy,
    'min_cluster_size': min_cluster_size,
    'n_neighbors': n_neighbors,
    'leiden_resolution': leiden_resolution,
    # inputs
    'metadata_path': './CellMetadata.AfterQC.pdpkl',
    'adata_path': './adata.with_coords.h5ad',
    'coord_base': 'tsne',
    # Other ConsensusClustering parameters
    'metric': 'euclidean',
    'consensus_rate': 0.7,
    'leiden_repeats': 500,
    'random_state': 0,
    'train_frac': 0.5,
    'train_max_n': 500,
    'max_iter': 50,
    'n_jobs': 40,
    # Dendrogram via Multiscale Bootstrap Resampling
    'nboot': 10000,
    'method_dist': 'correlation',
    'method_hclust': 'average',
    # plotting
    'plot_type': 'static',
    'plot_merge_steps': plot_merge_steps,
}


In [None]:
# Parameters injected into the fifth notebook of step_notebooks.
step_5_params = {
    'adata_path': './adata.with_coords.h5ad',
    'cluster_col': clustering_name,
    # NOTE(review): hard-coded None here, while step 6 receives the notebook-level
    # `mc_type` setting -- confirm that this difference is intended.
    'mc_type': None,
    # marker selection thresholds
    'top_n_markers': top_n_markers,
    'auroc_cutoff': auroc_cutoff,
    'adj_p_cutoff': adj_p_cutoff,
    'fc_cutoff': fc_cutoff,
    'max_cluster_cells': 2000,
    'max_other_fold': 5,
    # gene-level inputs
    'gene_annotation_path': '~/refs/human/hg38/gencode/v33/gencode.v33.basic.annotation.gene.flat.tsv.gz',
    'obs_dim': 'cell',
    'var_dim': 'gene',
    'chrom_to_remove': ['chrM'],
    # coverage thresholds
    'min_cov': 5,
    'min_cov_ratio': 0.002,
    'mcds_path_list': mcds_path_list,
}

In [None]:
# Parameters injected into the sixth notebook of step_notebooks.
step_6_params = {
    'cluster_col': clustering_name,
    'mc_type': mc_type,
    'metadata_path': './CellMetadata.AfterQC.pdpkl',
    'adata_path': './adata.with_coords.h5ad',
    # gene-level inputs
    'gene_annotation_path': '~/refs/human/hg38/gencode/v33/gencode.v33.basic.annotation.gene.flat.tsv.gz',
    'obs_dim': 'cell',
    'var_dim': 'gene',
    'plot_top_n_markers': 10,
    'n_samples': 20000,
    'mcds_path_list': mcds_path_list,
    # optional previous annotation and prefix for automatic annotation
    'old_annot_path': old_annot_path,
    'auto_annot_prefix': auto_annot_prefix,
}


In [None]:
# Parameter dicts in pipeline order; the execution loop zips this list with
# step_notebooks, so position i here feeds notebook i.
step_params = [
    step_1_params, step_2_params, step_3_params,
    step_4_params, step_5_params, step_6_params,
]

In [None]:
# Path() keeps a leading "~" literal, so expand it explicitly. Without this,
# mkdir() and the notebooks' working directory would point at a directory
# literally named "~" relative to the current directory.
template_dir = Path(template_dirn).expanduser()

study_dir = Path(study_dirn).expanduser()
# parents=True also creates missing intermediate directories;
# exist_ok=True makes re-running this cell harmless.
study_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the per-cell metadata and keep only rows flagged in 'PassFilter'.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
total_cell_meta = (
    pd.read_pickle(total_cell_meta_path)
    .loc[lambda meta: meta['PassFilter']]
)
# Bring the annotation table into the same row order as the filtered metadata.
cell_annotation = (
    pd.read_pickle(annotation_table_path)
    .reindex(total_cell_meta.index)
)


In [None]:
# Boolean row mask over total_cell_meta.
if not selection_dict:
    # No criteria: keep every cell. An explicit bool dtype matters here,
    # because an empty list would otherwise become a float64 array and be
    # interpreted as a column selection instead of a row mask.
    judge = np.ones(len(total_cell_meta), dtype=bool)
else:
    # One membership test per annotation column; a cell is kept only when it
    # satisfies all of them.
    judge = np.all(
        [cell_annotation[col].isin(vals) for col, vals in selection_dict.items()],
        axis=0,
    )


In [None]:
# Write the selected cells into the study directory. The step notebooks run
# with that directory as cwd and read this file via the relative path
# './CellMetadata.AfterQC.pdpkl'.
cell_meta_path = study_dir / 'CellMetadata.AfterQC.pdpkl'
select_cell_meta = total_cell_meta[judge]
select_cell_meta.to_pickle(cell_meta_path)

In [None]:
# Run (or just prepare) every template notebook in order, writing the result
# into the study directory under the same file name.
for notebook, prepare_only, skip, params in zip(step_notebooks, step_prepare_only, step_skip, step_params):
    if skip:
        print(f'{notebook} : skipped')
        continue

    source_nb = str(template_dir/notebook)
    output_nb = str(study_dir/notebook)
    execute_notebook(
        source_nb,
        output_nb,
        parameters=params,
        nest_asyncio=True,
        engine_name=None,
        prepare_only=prepare_only,
        kernel_name=None,
        progress_bar=True,
        log_output=False,
        start_timeout=60,
        report_mode=False,
        # relative paths inside the step parameters resolve against this dir
        cwd=str(study_dir)
    )
    print(f'{notebook} : prepared' if prepare_only else f'{notebook} : executed')
