## Marker Selection

In [1]:
import pathlib
from concurrent.futures import ProcessPoolExecutor, as_completed
from itertools import combinations

import anndata
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import seaborn as sns
import random
from cemba_data.tools.hdf5.anndata import rank_features_groups

## Parameter

In [2]:
cell_tidy_data_path = '/home/hanliu/project/allen/mouse_scrna/raw/Cortex_HPF/Cortex_HPF.74967.cell_tidy_data.msg'
adata_path = '/home/hanliu/project/allen/mouse_scrna/dataset/Cortex_HPF.74967.no_outlier.exon+intron.CPM.log1p.h5ad'

cluster_col = 'MajorType'
use_clusters = [
    'Pvalb', 'L4 IT', 'Vip', 'L2/3 IT Otof', 'Lamp5', 'NP', 'Sst', 'L4/5 IT',
    'L5 IT', 'L6 CT', 'L6 IT', 'Sst Chodl', 'Car3', 'Sncg', 'Lamp5 Lhx6',
    'Serpinf1', 'L5 ET', 'L6b', 'L2/3 IT Cxcl14', 'Meis2', 'CR',
    'RSP/ACA L4/5 IT', 'PIR Six3', 'Sncg/Ndnf HPF', 'RSP_ACA IT Scnn1a',
    'IT RHP Dcn', 'L2/3 IT Plch1', 'L2/3 IT Cdc14a', 'CA1sp', 'DG',
    'CA1sp/SUB-sp Kcnip1', 'CA3sp', 'CA2sp/IG', 'Ly6g6e', 'SUB-Sp Ndst4',
    'L2/3 IT Ndst4 Endou', 'RHP Cplx3', 'POST-PRE-PAR Ptgfr'
]

cpu = 5
top_n = 1000
adj_p_cutoff = 1e-3
log2fc_cutoff = 1
min_cluster_cell_number = 10

### Stable Parameter

In [3]:
max_test_cell_population = 1000
random_seed = 0
chunk_size = 100
exclude_str = []

## Load Data

### Cell Tidy Data and selection

In [4]:
cell_tidy_data = pd.read_msgpack(cell_tidy_data_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


#### Region and CellClass selection

In [5]:
if use_clusters is not None:
    judge = cell_tidy_data[cluster_col].isin(use_clusters)
    cell_tidy_data = cell_tidy_data[judge].copy()

cell_to_use_in_integration = cell_tidy_data.index

#### Cluster Size Selection

In [6]:
cluster_series = cell_tidy_data[cluster_col].astype(str)
cluster_counts = cluster_series.value_counts()


def check_cluster(cluster, count):
    if count < min_cluster_cell_number:
        return False
    for exclude in exclude_str:
        if exclude in cluster:
            return False
    return True


unique_clusters = [
    cluster for cluster, count in cluster_counts.items()
    if check_cluster(cluster, count)
]
cluster_pairs = list(combinations(unique_clusters, 2))
random.shuffle(cluster_pairs)  # shuffle make parallel more even

print(len(unique_clusters), 'pass filter.')
print(len(cluster_pairs), 'pairwise comparison to test.')

38 pass filter.
703 pairwise comparison to test.


In [7]:
judge = judge & cell_tidy_data[cluster_col].isin(unique_clusters)
cell_tidy_data = cell_tidy_data[judge].copy()

  


In [8]:
adata = anndata.read_h5ad(adata_path)
adata = adata[cell_tidy_data.index, :].copy()
adata

AnnData object with n_obs × n_vars = 72893 × 45768 
    obs: 'cluster_color', 'cluster_order', 'cluster_label', 'class_color', 'class_order', 'class_label', 'subclass_color', 'subclass_order', 'subclass_label', 'full_genotype_color', 'full_genotype_order', 'full_genotype_label', 'donor_sex_color', 'donor_sex_order', 'donor_sex_label', 'region_color', 'region_order', 'region_label', 'cortical_layer_color', 'cortical_layer_order', 'cortical_layer_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_accession_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_alt_alias_label', 'cell_type_designation_color', 'cell_type_designation_order', 'cell_type_designation_label', 'external_donor_name_color', 'external_donor_name_order', 'external_donor_name_label', 'facs_population_plan_color', 'facs_population_plan_order', 'facs_population_plan_label', 'injection_materials_col

In [9]:
adata.write_h5ad('TEMP_ADATA.h5ad')

## Pairwise test

In [10]:
def get_sig_features(rank_gene_dict):
    pvals_adj = pd.DataFrame(rank_gene_dict['pvals_adj'])
    names = pd.DataFrame(rank_gene_dict['names'])
    logfoldchanges = pd.DataFrame(rank_gene_dict['logfoldchanges'])
    gene_set = set(
        names.where((pvals_adj < adj_p_cutoff)
                    & (logfoldchanges.abs() > log2fc_cutoff)).values.flat)
    return gene_set


def pairwise_tests(pairs):
    adata_path = 'TEMP_ADATA.h5ad'
    adata = anndata.read_h5ad(adata_path)
    total_markers = set()
    pair_marker_count = {}
    for pair in pairs:
        cluster_a, cluster_b = pair
        cluster_a_cells = cluster_series[cluster_series == cluster_a]
        if cluster_a_cells.size > max_test_cell_population:
            cluster_a_cells = cluster_a_cells.sample(max_test_cell_population,
                                                     random_state=random_seed)
        cluster_a_cells = cluster_a_cells.index

        cluster_b_cells = cluster_series[cluster_series == cluster_b]
        if cluster_b_cells.size > max_test_cell_population:
            cluster_b_cells = cluster_b_cells.sample(max_test_cell_population,
                                                     random_state=random_seed)
        cluster_b_cells = cluster_b_cells.index
        cells = cluster_a_cells | cluster_b_cells

        pair_adata = adata[cells, :].copy()
        pair_adata.obs['cluster'] = pair_adata.obs[cluster_col].astype(
            str).astype('category')

        sc.tl.rank_genes_groups(pair_adata,
                                groupby='cluster',
                                n_genes=top_n,
                                method='wilcoxon')
        gene_set = get_sig_features(pair_adata.uns['rank_genes_groups'])
        # drop na
        gene_set = set([i for i in gene_set if isinstance(i, str)])

        total_markers.update(gene_set)
        pair_marker_count[pair] = len(gene_set)
    return total_markers, pair_marker_count

In [11]:
total_markers = set()
pair_marker_counts = {}
with ProcessPoolExecutor(cpu) as executor:
    futures = []
    for chunk_start in range(0, len(cluster_pairs), chunk_size):
        pair_chunk = cluster_pairs[chunk_start:chunk_start + chunk_size]
        future = executor.submit(pairwise_tests, pair_chunk)
        futures.append(future)

    for future in as_completed(futures):
        genes, pair_marker_count = future.result()
        total_markers.update(genes)
        pair_marker_counts.update(pair_marker_count)

## Save results

In [13]:
total_markers = set([i for i in total_markers if isinstance(i, str)])
with open('rna.cluster_markers.txt', 'w') as f:
    f.write('\n'.join(total_markers))

In [14]:
pair_marker_counts = pd.Series(pair_marker_counts)
marker_counts = pair_marker_counts.reset_index()
marker_counts.columns = ['ClusterA', 'ClusterB', 'GeneCount']
marker_counts.to_csv('rna.cluster_pair_marker_counts.csv', index=None)

In [15]:
marker_counts[marker_counts['GeneCount'] < 3]

Unnamed: 0,ClusterA,ClusterB,GeneCount


In [16]:
import subprocess
subprocess.run(['rm', '-f', 'TEMP_ADATA.h5ad'])

CompletedProcess(args=['rm', '-f', 'TEMP_ADATA.h5ad'], returncode=0)

In [17]:
marker_counts.describe()

Unnamed: 0,GeneCount
count,703.0
mean,39.813656
std,0.747826
min,32.0
25%,40.0
50%,40.0
75%,40.0
max,40.0
