## Marker Selection

In [1]:
import pathlib
from concurrent.futures import ProcessPoolExecutor, as_completed
from itertools import combinations
import numpy as np
import anndata
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import seaborn as sns
import xarray as xr
import subprocess
from sklearn.metrics import roc_auc_score
from cemba_data.tools.hdf5.anndata import rank_features_groups
import warnings
warnings.filterwarnings('ignore')

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)
  'DataArray', pd.Series, pd.DataFrame, pd.Panel]:


## Parameter

In [2]:
# Default values for the tunable settings of this notebook. The following
# "# Parameters" cell assigns every one of these names again, so these values
# only matter if that cell is not executed.
# NOTE: cluster_col, tidy_data_path, use_clusters and cpu are NOT defined here;
# they are only set in the following parameter cell.

# Glob pattern of the MCDS files opened with xr.open_mfdataset.
mcds_pattern = '/home/hanliu/project/mouse_rostral_brain/study/Level1-CellClass/ALL_manual/Adata/GeneWithSlop2kb.gene_da_rate.*.mcds'
# Clusters with fewer cells than this are excluded from pairwise testing.
min_cluster_cell_number = 10
# Genes must have an adjusted p-value below this cutoff to be kept.
adj_p_cutoff = 1e-1
# Passed as n_genes to sc.tl.rank_genes_groups.
top_n = 1000
# Upper bound on the number of cells sampled per cluster.
max_test_cell_population = 100
# NOTE(review): the parallel driver cell reassigns chunk_size = cpu, so this
# value does not appear to take effect there.
chunk_size = 1
# Minimum absolute difference of cluster means for a gene to be tested.
delta_rate_cutoff = 0.3
# Genes must have a (direction-agnostic) AUROC above this cutoff to be kept.
auroc_cutoff = 0.8

In [None]:
# Parameters
# NOTE(review): this looks like an injected-parameters cell (e.g. from
# papermill) - confirm. Names already set in the default cell above are
# overwritten here.

# Column of the cell table that holds the cluster label.
cluster_col = "MajorSpatial"
# msgpack file with the per-cell table, read with pd.read_msgpack.
tidy_data_path = "SpetialCellMeta.ITSpatial.msg"
# Only cells whose cluster label is in this list are kept (None keeps all).
use_clusters = ["IT-L4+MOs", "IT-L23+MOs", "IT-L5+MOs", "IT-L6+MOs", "IT-L5+ORB", "IT-L23+ORB", "IT-L5+PFC", "IT-L6+PFC", "IT-L23+PFC", "IT-L4+MOp", "IT-L23+MOp", "IT-L5+MOp", "IT-L6+MOp", "IT-L6+AI", "IT-L5+AI", "IT-L23+AI", "IT-L6+ACA", "IT-L23+ACA", "IT-L5+ACA", "IT-L23+SSp", "IT-L5+SSp", "IT-L4+SSp", "IT-L6+SSp", "IT-L4+SSs", "IT-L6+SSs", "IT-L5+SSs", "IT-L23+SSs"]
# Glob pattern of the MCDS files opened with xr.open_mfdataset.
mcds_pattern = "/home/hanliu/project/mouse_rostral_brain/study/Level1-CellClass/ALL_manual/Adata/GeneWithSlop2kb.gene_da_rate.*.mcds"
# Clusters with fewer cells than this are excluded from pairwise testing.
min_cluster_cell_number = 10
# Genes must have an adjusted p-value below this cutoff to be kept.
adj_p_cutoff = 0.005
# Passed as n_genes to sc.tl.rank_genes_groups.
top_n = 10000
# Number of worker processes given to ProcessPoolExecutor.
cpu = 30
# Upper bound on the number of cells sampled per cluster.
max_test_cell_population = 2000
# NOTE(review): the parallel driver cell reassigns chunk_size = cpu, so this
# value does not appear to take effect there.
chunk_size = 50
# Minimum absolute difference of cluster means for a gene to be tested.
delta_rate_cutoff = 0.3
# Genes must have a (direction-agnostic) AUROC above this cutoff to be kept.
auroc_cutoff = 0.8


### Stable Parameter

In [4]:
# Fixed seed, passed as random_state when clusters are down-sampled so the
# sampled cells are the same on every run.
random_seed = 0

## Cell Tidy Data

In [5]:
# Load the per-cell table.
# NOTE: pd.read_msgpack was deprecated in pandas 0.25 and removed in 1.0, so
# this notebook needs an older pandas to run as written.
cell_tidy_data = pd.read_msgpack(tidy_data_path)

# Optionally restrict the table to the requested clusters.
if use_clusters is not None:
    cell_tidy_data = cell_tidy_data[cell_tidy_data[cluster_col].isin(
        use_clusters)]
# Number of cells remaining after the filter.
cell_tidy_data.shape[0]

95815

In [6]:
# Cap every cluster at max_test_cell_population cells: smaller clusters are
# kept whole, the others are sampled reproducibly with random_seed.
records = []
for cluster, sub_df in cell_tidy_data.groupby(cluster_col):
    if sub_df.shape[0] < max_test_cell_population:
        records.append(sub_df)
    else:
        records.append(
            sub_df.sample(max_test_cell_population, random_state=random_seed))
# NOTE: cell_tidy_data is overwritten with the down-sampled table; the full
# table is no longer available after this cell.
cell_tidy_data = pd.concat(records)
# Cells per cluster after down-sampling.
cell_tidy_data[cluster_col].value_counts()

LSX-Inh      1000
MSN-D1       1000
PAL-Inh      1000
CT-L6        1000
CGE-Lamp5    1000
PT-L5        1000
IT-L6        1000
MSN-D2       1000
L6b          1000
CGE-Vip      1000
IT-L4        1000
NP-L6        1000
OLF          1000
IT-L5        1000
MGE-Sst      1000
CA1          1000
DG           1000
Foxp2        1000
OLF-Exc      1000
MGE-Pvalb    1000
IT-L23       1000
D1L-Fstl4    1000
CA3          1000
CLA           945
IG-CA2        745
EP            666
D1L-PAL       495
CA3-St18      408
Chd7          386
Unc5c         219
DG-po         145
Gfra1         110
Name: MajorType, dtype: int64

In [7]:
# Cluster label of every cell (as str) and the number of cells per label.
cluster_series = cell_tidy_data[cluster_col].astype(str)
cluster_counts = cluster_series.value_counts()


def check_cluster(cluster, count):
    """Tell whether a cluster is large enough to take part in pairwise tests.

    ``cluster`` is accepted for the caller's convenience but is not inspected;
    only ``count`` is compared with the notebook-level
    ``min_cluster_cell_number``.

    Returns ``False`` when ``count`` is below the minimum, ``True`` otherwise.
    """
    return not count < min_cluster_cell_number


# Keep the clusters that pass check_cluster (enough cells).
unique_clusters = [
    cluster for cluster, count in cluster_counts.items()
    if check_cluster(cluster, count)
]
# Every unordered pair of kept clusters is one comparison to run.
cluster_pairs = list(combinations(unique_clusters, 2))

print(len(unique_clusters), 'pass filter.')
print(len(cluster_pairs), 'pairwise comparison to test.')

32 pass filter.
496 pairwise comparison to test.


## Gene meta

In [8]:
# Gene annotation table, indexed by the 'gene_id' column (index renamed to
# 'gene' below).
gene_meta = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusterMethylMarker/gencode.vM22.annotation.gene.flat.filtered_white_genes.tsv.gz',
    index_col='gene_id',
    sep='\t')
gene_meta.index.name = 'gene'
# gene name -> gene id. Series.items() replaces the deprecated iteritems()
# (removed in pandas 2.0) and yields the same (index, value) pairs. If a gene
# name occurs more than once, the last id wins.
gene_name_to_id = {v: k for k, v in gene_meta['gene_name'].items()}
# gene id with the part after the first '.' stripped -> full gene id.
gene_idbase_to_id = {i.split('.')[0]: i for i in gene_meta.index}

## Adata

In [9]:
# Open all MCDS files matching the pattern as one dataset.
gene_mcds = xr.open_mfdataset(mcds_pattern)
# Genes present both in the dataset and in the annotation table.
# Index.intersection() is used instead of `&`: using `&` on Index objects as a
# set operation is deprecated and becomes an element-wise logical AND in
# pandas 2.
use_gene = gene_mcds.get_index('gene').intersection(gene_meta.index)
# NOTE: gene_meta is overwritten with the subset shared with the dataset.
gene_meta = gene_meta.reindex(use_gene)

In [10]:
# Display the opened dataset (dimensions, coordinates, data variables).
gene_mcds

<xarray.Dataset>
Dimensions:           (cell: 104340, gene: 55487, mc_type: 2)
Coordinates:
  * mc_type           (mc_type) object 'CGN' 'CHN'
    strand_type       <U4 'both'
  * gene              (gene) object 'ENSMUSG00000102693.1' ... 'ENSMUSG00000064372.1'
    geneslop2k_chrom  (gene) object 'chr1' 'chr1' 'chr1' ... 'chrM' 'chrM'
    geneslop2k_start  (gene) int64 3071252 3100015 3203900 ... 12144 13288 13355
    geneslop2k_end    (gene) int64 3076321 3104124 3673497 ... 16299 16299 16299
  * cell              (cell) object '1A_M_0' '1A_M_1' ... '8J_M_1292'
Data variables:
    gene_da           (cell, gene, mc_type) float64 dask.array<shape=(104340, 55487, 2), chunksize=(10000, 55487, 2)>

In [11]:
# Reduce the dataset to the 'gene_da' variable for mc_type 'CHN', restricted
# to the cells of cell_tidy_data and the genes of gene_meta.
# NOTE: gene_mcds is rebound from the full dataset to this selection, so the
# cell cannot be re-run without re-running the cell that opens the dataset.
gene_mcds = gene_mcds['gene_da'].sel(mc_type='CHN',
                                     cell=cell_tidy_data.index,
                                     gene=gene_meta.index)
gene_mcds

<xarray.DataArray 'gene_da' (cell: 27119, gene: 50231)>
dask.array<shape=(27119, 50231), dtype=float64, chunksize=(2, 50231)>
Coordinates:
    mc_type           <U3 'CHN'
    strand_type       <U4 'both'
  * gene              (gene) object 'ENSMUSG00000102693.1' ... 'ENSMUSG00000064372.1'
    geneslop2k_chrom  (gene) object 'chr1' 'chr1' 'chr1' ... 'chrM' 'chrM'
    geneslop2k_start  (gene) int64 3071252 3100015 3203900 ... 12144 13288 13355
    geneslop2k_end    (gene) int64 3076321 3104124 3673497 ... 16299 16299 16299
  * cell              (cell) object '9H_M_1742' '9H_M_2429' ... '9J_M_572'

In [9]:
# Write the cell table to 'TEMP.msg'; calculate_single_pair reads this exact
# file name.
cell_tidy_data.to_msgpack('TEMP.msg')

In [None]:
# Write the selected gene matrix to 'TEMP.nc', the file later assigned to
# data_path and opened by calculate_single_pair.
gene_mcds.to_netcdf('TEMP.nc')

## Pairwise test

In [9]:
def get_sig_features(rank_gene_dict):
    """Turn the 'pvals_adj' and 'names' entries of a result dict into tables.

    Despite the name, no significance filtering happens here; the two entries
    are only wrapped in DataFrames.

    Returns
    -------
    tuple
        ``(pvals_adj, names)`` as two ``pd.DataFrame`` objects.
    """
    wanted_keys = ('pvals_adj', 'names')
    return tuple(pd.DataFrame(rank_gene_dict[key]) for key in wanted_keys)


def get_delta(cluster, gene, cluster_mean_gene_df=None):
    """Return the mean of ``cluster`` minus the mean of the other cluster(s).

    Parameters
    ----------
    cluster
        Column label of the cluster of interest.
    gene
        Row label of the gene.
    cluster_mean_gene_df
        Gene-by-cluster table of mean values. Optional so existing
        ``get_delta(cluster, gene)`` calls keep working: when omitted, a
        module-level variable named ``cluster_mean_gene_df`` is used.

    Returns
    -------
    The value in column ``cluster`` minus the mean of all remaining columns
    for that gene. With exactly two columns this is the plain difference
    between the two clusters.

    Raises
    ------
    NameError
        If no table is passed and no module-level ``cluster_mean_gene_df``
        exists.
    """
    if cluster_mean_gene_df is None:
        try:
            cluster_mean_gene_df = globals()['cluster_mean_gene_df']
        except KeyError:
            raise NameError(
                "name 'cluster_mean_gene_df' is not defined; "
                "pass the cluster mean table explicitly") from None
    row = cluster_mean_gene_df.loc[gene]
    cluster_value = row[cluster]
    # Label-based drop + mean replaces the positional row[0] lookup; for the
    # two-cluster case the mean of the single remaining value is that value.
    other_mean = row.drop(cluster).mean()
    return cluster_value - other_mean


def calculate_single_pair(data_path, pair):
    """Find marker genes between two clusters and save them as msgpack.

    Parameters
    ----------
    data_path
        Path of the netCDF file with the cell-by-gene matrix (opened with
        ``xr.open_dataarray``).
    pair
        The two cluster labels ``(cluster_a, cluster_b)`` to compare.

    Returns
    -------
    None
        The filtered result table is written to
        ``TEMP/{cluster_a}/{cluster_b}.msg``. If that file already exists the
        function returns immediately without recomputing.

    Notes
    -----
    Reads the notebook-level names ``cluster_col``, ``delta_rate_cutoff``,
    ``top_n``, ``adj_p_cutoff``, ``auroc_cutoff`` and ``gene_meta``, calls
    ``get_sig_features`` and ``get_auroc``, and always loads the cell table
    from the hard-coded file ``'TEMP.msg'``.
    """
    cluster_a, cluster_b = pair
    output_dir = pathlib.Path(f'TEMP/{cluster_a}')
    # parents=True: do not depend on the caller having created TEMP/ first.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f'{cluster_b}.msg'
    if output_path.exists():
        # A result from an earlier run is reused as is.
        return

    # Cells that belong to one of the two clusters.
    this_tidy_data = pd.read_msgpack('TEMP.msg')
    this_tidy_data = this_tidy_data[this_tidy_data[cluster_col].isin(pair)]

    # Load the matrix rows of these cells into memory; the context manager
    # closes the netCDF file handle instead of leaking it in the worker.
    with xr.open_dataarray(data_path) as data_array:
        mcds = data_array.sel(cell=this_tidy_data.index).load()
    adata = anndata.AnnData(X=mcds.values,
                            obs=pd.DataFrame([], mcds.get_index('cell')),
                            var=pd.DataFrame([], mcds.get_index('gene')))
    adata.obs['cluster'] = this_tidy_data[cluster_col].astype('category')

    # Mirror the values around 1 (x -> 2 - x). Per the original author's note
    # the normalized values are centered on 1, hence the pivot.
    adata.X = (adata.X - 1) * -1 + 1

    # Per-cluster mean of every gene (genes as rows, clusters as columns).
    records = {}
    for cluster, sub_df in adata.obs.groupby('cluster'):
        sub_adata = adata[sub_df.index, :]
        gene_mean = sub_adata.X.mean(axis=0)
        records[cluster] = pd.Series(gene_mean, index=sub_adata.var_names)
    cluster_mean_gene_df = pd.DataFrame(records)

    # Only test genes whose cluster means differ by more than the cutoff.
    cluster_delta = (cluster_mean_gene_df[cluster_a] -
                     cluster_mean_gene_df[cluster_b]).abs()
    delta_judge = cluster_delta > delta_rate_cutoff
    # Explicit copy: rank_genes_groups stores its result in .uns, which must
    # not be done on a view of the full AnnData.
    use_adata = adata[:, delta_judge].copy()

    sc.tl.rank_genes_groups(use_adata,
                            groupby='cluster',
                            n_genes=top_n,
                            method='wilcoxon')
    pvals_adj, names = get_sig_features(
        use_adata.uns['rank_genes_groups'])

    # One table per cluster: its ranked genes and adjusted p-values, labelled
    # with the direction of the comparison.
    results = []
    for col in use_adata.obs['cluster'].unique():
        if col not in pair:
            continue
        df = pd.DataFrame({
            'pvals_adj': pvals_adj[col].tolist(),
            'gene_id': names[col].tolist()
        })
        df['cluster_from'] = col
        df['cluster_to'] = cluster_a if col == cluster_b else cluster_b
        results.append(df)

    # Combine both directions, annotate, then filter by adjusted p-value.
    total_results = pd.concat(results)
    total_results['gene_name'] = total_results['gene_id'].map(
        gene_meta['gene_name'])
    total_results['-lgp'] = -np.log10(total_results['pvals_adj'])
    # p == 0 gives inf; cap it at 1000.
    total_results['-lgp'] = total_results['-lgp'].replace(np.inf, 1000)
    total_results = total_results[
        total_results['pvals_adj'] < adj_p_cutoff].copy()

    # Filter by AUROC. The values are assigned positionally: a row-wise
    # DataFrame.apply returns a DataFrame (not a Series) when no row is left,
    # and its result would be aligned on the duplicated index produced by the
    # concat above.
    total_results['AUROC'] = np.asarray([
        get_auroc(gene_id, cluster_from, use_adata)
        for gene_id, cluster_from in zip(total_results['gene_id'],
                                         total_results['cluster_from'])
    ], dtype=float)
    total_results = total_results[total_results['AUROC'] > auroc_cutoff]
    total_results.to_msgpack(output_path)


def get_auroc(gene_id, cluster, adata):
    """Direction-agnostic AUROC of one gene for separating ``cluster``.

    The gene's per-cell values (``adata.obs_vector(gene_id)``) are scored
    against the labels ``adata.obs['cluster'] == cluster`` with
    ``roc_auc_score``. The result is folded around 0.5, so a raw score ``s``
    and ``1 - s`` give the same value and the return is never below 0.5.
    """
    gene_values = adata.obs_vector(gene_id)
    in_cluster = adata.obs['cluster'] == cluster
    raw_auc = roc_auc_score(in_cluster, gene_values)
    # Distance from the chance level 0.5, shifted back up by 0.5.
    return 0.5 + abs(raw_auc - 0.5)

In [10]:
# netCDF file handed to calculate_single_pair as its data_path argument.
data_path = 'TEMP.nc'

## Run pairwise marker

In [11]:
# Run calculate_single_pair for every cluster pair in worker processes.
# The pairs are processed chunk by chunk; each chunk gets its own process pool
# and must finish completely before the next chunk starts.

# NOTE(review): pair_marker_counts is never filled or read in this cell.
pair_marker_counts = {}
# Number of finished pairs, used only for progress printing.
n = 0
# NOTE(review): this overwrites the chunk_size set in the parameter cells, so
# the configured value is ignored - confirm this is intended.
chunk_size = cpu
for chunk_start in range(0, len(cluster_pairs), chunk_size):
    pairs = cluster_pairs[chunk_start : chunk_start + chunk_size]
    
    with ProcessPoolExecutor(cpu) as executor:
        # Make sure the output root exists before any task is submitted.
        temp_dir = 'TEMP'
        pathlib.Path(temp_dir).mkdir(exist_ok=True)
        futures = []
        for pair in pairs:
            future = executor.submit(calculate_single_pair, data_path, pair)
            futures.append(future)
    
        for future in as_completed(futures):
            n += 1
            if n % 100 == 0:
                print(n)
            # result() re-raises any exception raised inside the worker.
            future.result()

100
200
300
400


## Aggregate DEG

In [12]:
# Collect every per-pair result file below TEMP/ and stack them into one table.
# NOTE: temp_dir was a str in the driver cell and is rebound to a Path here.
temp_dir = pathlib.Path('TEMP')
# Recursive glob; matches any file whose name ends in 'msg'.
deg_list = list(temp_dir.glob('**/*msg'))

df_list = []
for path in deg_list:
    df_list.append(pd.read_msgpack(path))

# pd.concat raises ValueError if no result file was found.
total_markers = pd.concat(df_list)

In [16]:
# Sanity check: every (gene_id, cluster_from, cluster_to) combination must
# occur at most once in the aggregated table.
assert total_markers.set_index(['gene_id', 'cluster_from', 'cluster_to']).index.duplicated().sum() == 0

## Save results

In [17]:
# Save the aggregated marker table.
total_markers.to_msgpack('TotalPairwiseMarker.msg')

In [21]:
# Number of marker rows per directed (cluster_from, cluster_to) pair.
marker_counts = total_markers.set_index(['cluster_from', 'cluster_to']).index.value_counts()
# Rebuild the index as a MultiIndex so reset_index yields one column per level.
marker_counts.index = pd.MultiIndex.from_tuples(marker_counts.index.tolist())
marker_counts = marker_counts.reset_index()
marker_counts.columns = ['ClusterA', 'ClusterB', 'GeneCount']
# index=None: the row index is not written to the CSV.
marker_counts.to_csv('Cluster_pair_marker_counts.csv', index=None)

In [23]:
# Show directed pairs with fewer than 10 markers.
# NOTE: marker_counts is built with value_counts, so a pair with no markers at
# all has no row and cannot appear in this view.
marker_counts[marker_counts['GeneCount'] < 10]

Unnamed: 0,ClusterA,ClusterB,GeneCount


In [29]:
# Remove the intermediate files. The shell glob deletes everything in the
# working directory whose name starts with 'TEMP' (the TEMP/ result tree,
# TEMP.msg, TEMP.nc and any other match), so cached per-pair results are gone
# after this cell.
subprocess.run('rm -rf TEMP*', shell=True)

CompletedProcess(args='rm -rf TEMP*', returncode=0)

In [25]:
# Write the distinct marker gene ids, one per line.
with open('TotalGeneID.txt', 'w') as f:
    for g in total_markers['gene_id'].unique():
        f.write(f'{g}\n')

In [26]:
# Number of distinct marker gene ids across all pairs.
total_markers['gene_id'].unique().size

15561