## Marker Selection

In [1]:
import pathlib
from concurrent.futures import ProcessPoolExecutor, as_completed
from itertools import combinations
import numpy as np
import anndata
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import seaborn as sns
import xarray as xr
import subprocess
from sklearn.metrics import roc_auc_score
from cemba_data.tools.hdf5.anndata import rank_features_groups
import warnings
warnings.filterwarnings('ignore')

## Parameter

In [2]:
# Default values for the tunable parameters of this notebook.
# NOTE(review): every name assigned here is assigned again in the "# Parameters"
# cell that follows, so those later values are the ones actually used.

# Glob pattern passed to xr.open_mfdataset to locate the gene-level MCDS files.
mcds_pattern = '/home/hanliu/project/mouse_rostral_brain/study/Level1-CellClass/ALL_manual/Adata/GeneWithSlop2kb.gene_da_rate.*.mcds'
# Clusters with fewer cells than this are excluded from the pairwise tests.
min_cluster_cell_number = 10
# A gene is kept only if its adjusted p-value is below this cutoff.
adj_p_cutoff = 1e-3
# Passed as n_genes to sc.tl.rank_genes_groups.
top_n = 30000
# Clusters with at least this many cells are randomly down-sampled to this size.
max_test_cell_population = 1000
# Number of cluster pairs submitted to each ProcessPoolExecutor batch.
chunk_size = 100
# Compared against the absolute difference of the two cluster means.
# NOTE(review): the resulting mask is computed in calculate_single_pair but
# never applied there.
delta_rate_cutoff = 0.1
# A gene is kept only if its direction-agnostic AUROC exceeds this value.
auroc_cutoff = 0.8

In [3]:
# Parameters
# NOTE(review): this cell re-assigns the defaults defined just above and adds
# cluster_col, use_clusters and cpu; it looks like a papermill-injected
# parameter cell -- TODO confirm.

# Column of the cell table that holds the cluster label of each cell.
cluster_col = "SubType"
# Cluster labels to keep; the filter is skipped when this is None.
use_clusters = ["ODC odc-small", "PC pc-all", "ODC odc-large", "ANP anp-dg", "OPC opc-large", "ASC cortex-olf", "MGC mgc-all", "ASC str-hpf", "ASC mid", "VLMC Mapk4", "EC Abhd2", "VLMC-Pia vlmc-pia-all", "VLMC Col4a1", "OPC opc-small", "ANP anp-olf-cnu", "EC Sema3g"]
mcds_pattern = "/home/hanliu/project/mouse_rostral_brain/study/Level1-CellClass/ALL_manual/Adata/GeneWithSlop2kb.gene_da_rate.*.mcds"
min_cluster_cell_number = 10
adj_p_cutoff = 0.001
top_n = 30000
# Number of worker processes given to ProcessPoolExecutor.
cpu = 30
max_test_cell_population = 2000
chunk_size = 30
delta_rate_cutoff = 0.05
auroc_cutoff = 0.8


### Stable Parameter

In [4]:
# Passed as random_state to DataFrame.sample when large clusters are down-sampled.
random_seed = 0

## Cell Tidy Data

In [5]:
# Msgpack file holding the per-cell table; it is loaded with pd.read_msgpack.
tidy_data_path = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'

In [6]:
# Load the per-cell table, then optionally restrict it to the requested clusters.
# NOTE(review): pd.read_msgpack only exists in pandas < 1.0.
cell_tidy_data = pd.read_msgpack(tidy_data_path)

if use_clusters is not None:
    in_requested_cluster = cell_tidy_data[cluster_col].isin(use_clusters)
    cell_tidy_data = cell_tidy_data[in_requested_cluster]
# Number of cells remaining after the filter.
cell_tidy_data.shape[0]

8167

In [7]:
# Cap every cluster at max_test_cell_population cells: smaller clusters are kept
# whole, the others are randomly sampled with a fixed seed.
records = [
    sub_df if sub_df.shape[0] < max_test_cell_population else sub_df.sample(
        max_test_cell_population, random_state=random_seed)
    for _, sub_df in cell_tidy_data.groupby(cluster_col)
]
cell_tidy_data = pd.concat(records)
# Cells per cluster after down-sampling.
cell_tidy_data[cluster_col].value_counts()

ODC odc-large            2000
ASC str-hpf              1075
ODC odc-small             866
MGC mgc-all               853
ASC mid                   734
OPC opc-large             688
ASC cortex-olf            406
VLMC Mapk4                244
VLMC-Pia vlmc-pia-all     225
ANP anp-olf-cnu           210
VLMC Col4a1               175
PC pc-all                 170
ANP anp-dg                121
OPC opc-small              89
EC Abhd2                   83
EC Sema3g                  12
Name: SubType, dtype: int64

In [8]:
# Cluster label of every cell as plain strings, and the number of cells per label.
cluster_series = cell_tidy_data[cluster_col].astype(str)
cluster_counts = cluster_series.value_counts()


def check_cluster(cluster, count):
    """Tell whether a cluster has enough cells to be tested.

    ``cluster`` is accepted but not used; only ``count`` is compared against
    the module-level ``min_cluster_cell_number``.

    Returns ``False`` when ``count`` is below the minimum, ``True`` otherwise.
    """
    return not (count < min_cluster_cell_number)


# Keep the clusters that pass the size filter and enumerate every unordered pair.
unique_clusters = [
    label for label, n_cells in cluster_counts.items()
    if check_cluster(label, n_cells)
]
cluster_pairs = [*combinations(unique_clusters, 2)]

n_clusters, n_pairs = len(unique_clusters), len(cluster_pairs)
print(n_clusters, 'pass filter.')
print(n_pairs, 'pairwise comparison to test.')

16 pass filter.
120 pairwise comparison to test.


## Gene meta

In [9]:
# Gene annotation table, indexed by the 'gene_id' column (index renamed 'gene').
gene_meta = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusterMethylMarker/gencode.vM22.annotation.gene.flat.filtered_white_genes.tsv.gz',
    index_col='gene_id',
    sep='\t')
gene_meta.index.name = 'gene'
# gene name -> gene id. If a name occurs more than once the last row wins.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
gene_name_to_id = {
    gene_name: gene_id
    for gene_id, gene_name in gene_meta['gene_name'].items()
}
# gene id with everything after the first '.' stripped -> full gene id.
gene_idbase_to_id = {i.split('.')[0]: i for i in gene_meta.index}

## Adata

In [10]:
# Open every file matching mcds_pattern as one dataset.
gene_mcds = xr.open_mfdataset(mcds_pattern)
# Genes present in both the dataset and the annotation table.
# Index.intersection is the explicit set operation; using `&` between two
# Index objects for this purpose is deprecated in pandas.
use_gene = gene_mcds.get_index('gene').intersection(gene_meta.index)
gene_meta = gene_meta.reindex(use_gene)

In [11]:
# Display the opened dataset.
gene_mcds

In [12]:
# Keep the 'gene_da' variable only, restricted to mc_type 'CGN', the retained
# cells and the retained genes.
# NOTE(review): this rebinds gene_mcds to the selected array, so re-running this
# cell without re-opening the dataset will presumably fail on the ['gene_da']
# lookup -- confirm.
gene_mcds = gene_mcds['gene_da'].sel(mc_type='CGN',
                                     cell=cell_tidy_data.index,
                                     gene=gene_meta.index)
gene_mcds

In [13]:
# Scratch copy of the cell table; calculate_single_pair re-reads it from 'TEMP.msg'.
cell_tidy_data.to_msgpack('TEMP.msg')

In [14]:
# Scratch copy of the selected array; later cells open it again via data_path.
gene_mcds.to_netcdf('TEMP.nc')

## Pairwise test

In [10]:
def get_sig_features(rank_gene_dict):
    """Turn the 'pvals_adj' and 'names' entries of a mapping into DataFrames.

    Parameters
    ----------
    rank_gene_dict
        Mapping with the keys ``'pvals_adj'`` and ``'names'``; each value is
        passed to ``pd.DataFrame`` unchanged.

    Returns
    -------
    tuple
        ``(pvals_adj, names)`` as two DataFrames.
    """
    frames = {
        key: pd.DataFrame(rank_gene_dict[key])
        for key in ('pvals_adj', 'names')
    }
    return frames['pvals_adj'], frames['names']


def get_delta(cluster, gene, cluster_mean_df=None):
    """Difference between one cluster's mean for a gene and the other cluster's.

    Parameters
    ----------
    cluster
        Column label of the cluster of interest.
    gene
        Row label of the gene.
    cluster_mean_df
        DataFrame of per-cluster means with genes as rows and exactly two
        cluster columns. When omitted, a module-level ``cluster_mean_gene_df``
        is used, which is what the original two-argument form relied on.

    Returns
    -------
    The value of ``cluster`` minus the value of the other cluster.

    Raises
    ------
    NameError
        If no frame is passed and no module-level ``cluster_mean_gene_df`` exists.
    KeyError
        If ``gene`` or ``cluster`` is not a label of the frame.
    ValueError
        If the frame does not hold exactly one cluster besides ``cluster``.
    """
    if cluster_mean_df is None:
        try:
            cluster_mean_df = globals()['cluster_mean_gene_df']
        except KeyError:
            raise NameError(
                "name 'cluster_mean_gene_df' is not defined; "
                "pass cluster_mean_df explicitly") from None

    row = cluster_mean_df.loc[gene]
    # drop() returns a new Series, so the caller's frame is left untouched.
    others = row.drop(cluster)
    if others.shape[0] != 1:
        raise ValueError(
            f'expected exactly one other cluster, found {others.shape[0]}')
    # .iloc makes the positional access explicit on a label-indexed Series.
    return row[cluster] - others.iloc[0]


def calculate_single_pair(data_path, pair):
    """Find marker genes between two clusters and cache the table on disk.

    Parameters
    ----------
    data_path
        Path of the file opened with ``xr.open_dataarray``. The array is
        selected by ``cell`` and must expose ``cell`` and ``gene`` indexes.
    pair
        Two cluster labels, ``(cluster_a, cluster_b)``.

    Returns
    -------
    None
        The result is written to ``TEMP/{cluster_a}/{cluster_b}.msg``. When
        that file already exists the function returns without recomputing.

    Notes
    -----
    Reads the module-level names ``cluster_col``, ``top_n``, ``adj_p_cutoff``,
    ``auroc_cutoff``, ``delta_rate_cutoff`` and ``gene_meta``, calls
    ``get_sig_features`` and ``get_auroc``, and loads the cell table from the
    hard-coded file ``'TEMP.msg'``.
    """
    cluster_a, cluster_b = pair
    output_dir = pathlib.Path(f'TEMP/{cluster_a}')
    # parents=True so the function does not depend on the caller having
    # created the top-level TEMP directory beforehand.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f'{cluster_b}.msg'
    if output_path.exists():
        # Result cached by a previous run.
        return

    # Cells belonging to either cluster of the pair.
    this_tidy_data = pd.read_msgpack('TEMP.msg')
    this_tidy_data = this_tidy_data[this_tidy_data[cluster_col].isin(pair)]

    mcds = xr.open_dataarray(data_path).sel(cell=this_tidy_data.index).load()
    adata = anndata.AnnData(X=mcds.values,
                            obs=pd.DataFrame([], mcds.get_index('cell')),
                            var=pd.DataFrame([], mcds.get_index('gene')))
    adata.obs['cluster'] = this_tidy_data[cluster_col].astype('category')

    # Mirror every value around 1 (x -> 2 - x).
    # NOTE(review): the original comment says the normalized values are
    # centered at 1; presumably the mirroring makes lower input values rank
    # higher in the test below -- confirm.
    adata.X = (adata.X - 1) * -1 + 1

    # Per-cluster mean of every gene (computed on the mirrored values).
    records = {}
    for cluster, sub_df in adata.obs.groupby('cluster'):
        sub_adata = adata[sub_df.index, :]
        gene_mean = sub_adata.X.mean(axis=0)
        records[cluster] = pd.Series(gene_mean, index=sub_adata.var_names)
    cluster_mean_gene_df = pd.DataFrame(records)

    # Absolute difference of the two cluster means against delta_rate_cutoff.
    cluster_delta = (cluster_mean_gene_df[cluster_a] -
                     cluster_mean_gene_df[cluster_b]).abs()
    delta_judge = cluster_delta > delta_rate_cutoff
    # NOTE(review): the delta mask is computed but deliberately not applied
    # (the column subset is commented out), so every gene goes into the test.
    use_adata = adata  # [:, delta_judge]

    sc.tl.rank_genes_groups(use_adata,
                            groupby='cluster',
                            n_genes=top_n,
                            method='wilcoxon')
    pvals_adj, names = get_sig_features(use_adata.uns['rank_genes_groups'])

    # One table per cluster of the pair: genes ranked for that cluster
    # ('cluster_from') against the other one ('cluster_to').
    results = []
    for col in use_adata.obs['cluster'].unique():
        if col not in pair:
            continue
        df = pd.DataFrame({
            'pvals_adj': pvals_adj[col].tolist(),
            'gene_id': names[col].tolist()
        })
        df['cluster_from'] = col
        df['cluster_to'] = cluster_a if col == cluster_b else cluster_b
        results.append(df)

    # Combine both directions and filter by adjusted p-value.
    total_results = pd.concat(results)
    total_results['gene_name'] = total_results['gene_id'].map(
        gene_meta['gene_name'])
    total_results['-lgp'] = -np.log10(total_results['pvals_adj'])
    # A p-value of exactly 0 gives +inf; store 1000 instead.
    total_results['-lgp'] = total_results['-lgp'].replace(np.inf, 1000)
    total_results = total_results[
        total_results['pvals_adj'] < adj_p_cutoff].copy()

    if total_results.shape[0] > 0:
        # Keep only genes whose AUROC exceeds auroc_cutoff.
        total_results['AUROC'] = total_results[[
            'gene_id', 'cluster_from'
        ]].apply(lambda i: get_auroc(i['gene_id'], i['cluster_from'], use_adata),
                 axis=1)
        total_results = total_results[total_results['AUROC'] > auroc_cutoff]
    total_results.to_msgpack(output_path)
    return


def get_auroc(gene_id, cluster, adata):
    """Direction-agnostic AUROC of one gene for separating one cluster.

    Parameters
    ----------
    gene_id
        Variable name handed to ``adata.obs_vector``.
    cluster
        Label compared with ``adata.obs['cluster']`` to build the binary target.
    adata
        Object providing ``obs_vector`` and an ``obs['cluster']`` column.

    Returns
    -------
    ``abs(auc - 0.5) + 0.5``, so the value never falls below 0.5.
    """
    is_target_cluster = adata.obs['cluster'] == cluster
    gene_values = adata.obs_vector(gene_id)
    raw_auc = roc_auc_score(is_target_cluster, gene_values)
    # Fold around 0.5 so that both directions of separation score equally.
    return abs(raw_auc - 0.5) + 0.5

In [11]:
# File handed to calculate_single_pair and opened with xr.open_dataarray.
data_path = 'TEMP.nc'

## Run pairwise marker

In [12]:
# Run calculate_single_pair for every cluster pair, chunk_size pairs at a time,
# using a fresh pool of `cpu` worker processes for each chunk.
pair_marker_counts = {}
n = 0
for chunk_start in range(0, len(cluster_pairs), chunk_size):
    chunk_stop = chunk_start + chunk_size
    pairs = cluster_pairs[chunk_start:chunk_stop]
    with ProcessPoolExecutor(cpu) as executor:
        temp_dir = 'TEMP'
        pathlib.Path(temp_dir).mkdir(exist_ok=True)
        futures = [
            executor.submit(calculate_single_pair, data_path, pair)
            for pair in pairs
        ]

        for future in as_completed(futures):
            n += 1
            # Progress report every 100 finished pairs.
            if n % 100 == 0:
                print(n)
            # Re-raises any exception that occurred in the worker.
            future.result()

100


## Aggregate DEG

In [14]:
# Collect every per-pair result file below TEMP/ into one table.
temp_dir = pathlib.Path('TEMP')
# glob() yields paths in filesystem order; sorting makes the row order of the
# combined table (and of the files saved from it) reproducible between runs.
deg_list = sorted(temp_dir.glob('**/*msg'))

df_list = [pd.read_msgpack(path) for path in deg_list]

# The original row labels are kept, so index values repeat across files.
total_markers = pd.concat(df_list)

## Add cluster mean

In [15]:
# Mean value of every gene in every cluster. Unlike calculate_single_pair,
# the values are used as stored, without mirroring around 1.
mcds = xr.open_dataarray(data_path).load()
gene_df = mcds.to_pandas()
# Group by the configured cluster column rather than a hard-coded 'SubType',
# so the row labels match the cluster names stored in total_markers.
cluster_mean = gene_df.groupby(cell_tidy_data[cluster_col]).mean()

In [28]:
# Look up, for every marker row, the mean of its gene in both clusters.
for rate_col, cluster_key in (('cluster_from_rate', 'cluster_from'),
                              ('cluster_to_rate', 'cluster_to')):
    total_markers[rate_col] = total_markers.apply(
        lambda row, key=cluster_key: cluster_mean.at[row[key], row['gene_id']],
        axis=1)

In [29]:
# Effect sizes of cluster_from relative to cluster_to.
from_rate = total_markers['cluster_from_rate']
to_rate = total_markers['cluster_to_rate']
total_markers['log2fc'] = np.log2(from_rate / to_rate)
total_markers['delta'] = from_rate - to_rate
# Keep rows where cluster_from is lower than cluster_to by more than 0.3.
# NOTE(review): -0.3 is hard-coded here and is unrelated to delta_rate_cutoff.
strong_negative_delta = total_markers['delta'] < -0.3
total_markers = total_markers[strong_negative_delta].copy()

In [30]:
# Each (gene, cluster_from, cluster_to) combination must occur at most once.
assert total_markers.set_index(['gene_id', 'cluster_from', 'cluster_to']).index.is_unique

## Save results

In [31]:
# Save the filtered marker table.
# NOTE(review): to_msgpack only exists in pandas < 1.0.
total_markers.to_msgpack('TotalPairwiseMarker.msg')

In [32]:
# Number of marker genes for every ordered (cluster_from, cluster_to) pair.
pair_index = total_markers.set_index(['cluster_from', 'cluster_to']).index
marker_counts = pair_index.value_counts()
pair_tuples = marker_counts.index.tolist()
marker_counts.index = pd.MultiIndex.from_tuples(pair_tuples)
marker_counts = marker_counts.reset_index()
marker_counts.columns = ['ClusterA', 'ClusterB', 'GeneCount']
marker_counts.to_csv('Cluster_pair_marker_counts.csv', index=None)

In [33]:
# Ordered cluster pairs with fewer than 10 marker genes.
marker_counts[marker_counts['GeneCount'] < 10]

Unnamed: 0,ClusterA,ClusterB,GeneCount
219,ANP anp-olf-cnu,EC Sema3g,9
220,ASC str-hpf,ANP anp-olf-cnu,3
221,ASC str-hpf,ANP anp-dg,2
222,OPC opc-small,OPC opc-large,1
223,ANP anp-olf-cnu,ASC cortex-olf,1
224,ANP anp-olf-cnu,ASC mid,1


In [34]:
# Remove only the scratch artifacts this notebook creates: the TEMP/ result
# directory, the cell table copy and the array copy. Naming them explicitly
# and skipping the shell avoids the 'TEMP*' glob, which would also delete any
# unrelated file or directory whose name starts with TEMP.
# check=True raises if rm exits with a non-zero status.
subprocess.run(['rm', '-rf', 'TEMP', 'TEMP.msg', 'TEMP.nc'], check=True)

CompletedProcess(args='rm -rf TEMP*', returncode=0)

In [35]:
# Write every distinct marker gene id to a text file, one id per line.
with open('TotalGeneID.txt', 'w') as f:
    f.writelines(f'{gene_id}\n'
                 for gene_id in total_markers['gene_id'].unique())

In [36]:
# Number of distinct marker gene ids.
total_markers['gene_id'].unique().size

1976