## Marker Selection

In [1]:
import pathlib
from concurrent.futures import ProcessPoolExecutor, as_completed
from itertools import combinations
import numpy as np
import anndata
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import seaborn as sns
import xarray as xr
import subprocess
from sklearn.metrics import roc_auc_score
from cemba_data.tools.hdf5.anndata import rank_features_groups
import warnings
warnings.filterwarnings('ignore')

## Parameter

In [2]:
# Default parameters. Every name used further down is defined here so the
# notebook also runs when no parameter cell is injected; the "# Parameters"
# cell below re-assigns these names and takes precedence when present.
cluster_col = 'SubType'  # column of the cell table that holds the cluster label
use_clusters = None  # None keeps every cluster, otherwise a list of labels to keep
mcds_pattern = '/home/hanliu/project/mouse_rostral_brain/study/Level1-CellClass/ALL_manual/Adata/GeneWithSlop2kb.gene_da_rate.*.mcds'
min_cluster_cell_number = 10  # clusters with fewer cells are not tested
adj_p_cutoff = 1e-3  # keep genes with adjusted p-value below this
top_n = 30000  # n_genes passed to sc.tl.rank_genes_groups
cpu = 1  # worker processes of the ProcessPoolExecutor
max_test_cell_population = 1000  # clusters at or above this size are sampled to it
chunk_size = 100  # cluster pairs submitted per executor
delta_rate_cutoff = 0.5  # minimum absolute difference of cluster means
auroc_cutoff = 0.8  # keep genes with AUROC above this

In [3]:
# Parameters
# NOTE(review): this looks like a machine-injected parameter cell (e.g. from
# papermill) — confirm. It re-assigns the defaults of the previous cell, so
# these are the values actually used by the rest of the notebook.
cluster_col = "SubType"
use_clusters = ["CA3 Cadm2", "CA1 Chrm3", "CA3-St18 Tead1", "Gfra1 Gfra1", "IT-L5 Etv1", "CA1 Ptprg", "NP-L6 Cntnap4", "CA3-St18 Nuak1", "CT-L6 Megf9", "IG-CA2 Chrm3", "IG-CA2 Peak1", "DG-po Calb2", "DG dg-all", "CA1 Kif26a", "CA3 Efnb2", "CA1 Ak5", "DG-po Bcl11a", "PT-L5 Tenm2", "CA1 Lingo2", "CA3-St18 Epha5", "IG-CA2 Xpr1", "DG-po Kctd8", "CT-L6 Il1rap", "L6b Adcy8", "IT-L6 Man1c1", "NP-L6 Olfml2b", "PT-L5 Abca12", "PT-L5 Nectin1", "IT-L23 Cux1", "IT-L23 Foxp1", "IT-L4 Shc3", "IT-L5 Cdh8", "IT-L5 Grik3", "PT-L5 Tmtc2", "IT-L23 Tenm2", "NP-L6 Cntnap5a", "CT-L6 Hcrtr2", "PT-L5 Plcb4", "IT-L23 Ptprt", "CT-L6 Map4", "NP-L6 Boc", "PT-L5 Kcnh1", "OLF-Exc Bmpr1b", "PT-L5 Astn2", "IT-L6 Fstl4", "CLA Bcl11a", "NP-L6 Cyp7b1", "CLA Cdh8", "IT-L6 Cadps2", "PT-L5 Ptprt", "NP-L6 Kcnab1", "IT-L6 Oxr1", "OLF-Exc Pld5", "OLF-Exc Lrrtm3", "OLF-Exc Cdh9", "OLF-Exc Unc13c", "L6b Nrp2", "OLF-Exc Sgcd", "OLF-Exc Rmst", "PT-L5 Unc5b", "L6b Pkhd1", "L6b Kcnk2", "IT-L4 Astn2", "CLA Nrp2", "EP Tspan5", "OLF-Exc Cux2", "EP Rgs8", "EP Adcy8"]
mcds_pattern = "/home/hanliu/project/mouse_rostral_brain/study/Level1-CellClass/ALL_manual/Adata/GeneWithSlop2kb.gene_da_rate.*.mcds"
min_cluster_cell_number = 10
adj_p_cutoff = 0.001
top_n = 30000
cpu = 15
max_test_cell_population = 2000
chunk_size = 15
delta_rate_cutoff = 0.3
auroc_cutoff = 0.8


### Stable Parameter

In [4]:
# Fixed seed, passed as `random_state` when large clusters are down-sampled.
random_seed = 0

## Cell Tidy Data

In [5]:
# Location of the cell table (one row per cell) that is read with
# pd.read_msgpack in the next cell.
tidy_data_path = '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'

In [6]:
# Read the cell table; when a cluster whitelist is given, keep only the cells
# whose label in `cluster_col` is on it. The cell ends by showing the number
# of remaining cells.
cell_tidy_data = pd.read_msgpack(tidy_data_path)

if use_clusters is not None:
    in_use_clusters = cell_tidy_data[cluster_col].isin(use_clusters)
    cell_tidy_data = cell_tidy_data[in_use_clusters]
len(cell_tidy_data)

67324

In [7]:
# Cap the size of every cluster: groups smaller than
# `max_test_cell_population` are kept whole, all others are randomly sampled
# down to exactly that many cells (reproducible through `random_seed`).
records = [
    sub_df if sub_df.shape[0] < max_test_cell_population else sub_df.sample(
        max_test_cell_population, random_state=random_seed)
    for _, sub_df in cell_tidy_data.groupby(cluster_col)
]
cell_tidy_data = pd.concat(records)
cell_tidy_data[cluster_col].value_counts()

CA1 Chrm3       2000
IT-L23 Cux1     2000
IT-L4 Astn2     2000
CT-L6 Il1rap    2000
IT-L5 Cdh8      2000
                ... 
DG-po Bcl11a      55
CA1 Lingo2        54
DG-po Calb2       51
DG-po Kctd8       39
L6b Pkhd1         37
Name: SubType, Length: 68, dtype: int64

In [8]:
cluster_series = cell_tidy_data[cluster_col].astype(str)
cluster_counts = cluster_series.value_counts()


def check_cluster(cluster, count):
    """Tell whether a cluster has enough cells to take part in the tests.

    ``cluster`` is accepted but not inspected; only ``count`` is compared
    with the module-level ``min_cluster_cell_number``.
    """
    return not (count < min_cluster_cell_number)


# Clusters that pass the size filter, in the order given by value_counts().
unique_clusters = []
for cluster, count in cluster_counts.items():
    if check_cluster(cluster, count):
        unique_clusters.append(cluster)
# Every unordered pair of passing clusters is tested once.
cluster_pairs = list(combinations(unique_clusters, 2))

print(len(unique_clusters), 'pass filter.')
print(len(cluster_pairs), 'pairwise comparison to test.')

68 pass filter.
2278 pairwise comparison to test.


## Gene meta

In [9]:
# Gene annotation table, indexed by gene id (index renamed to 'gene').
gene_meta = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusterMethylMarker/gencode.vM22.annotation.gene.flat.filtered_white_genes.tsv.gz',
    index_col='gene_id',
    sep='\t')
gene_meta.index.name = 'gene'
# gene name -> gene id. Series.items() is used instead of Series.iteritems(),
# which was removed in pandas 2.0. If a gene name occurs more than once, the
# last gene id wins.
gene_name_to_id = {v: k for k, v in gene_meta['gene_name'].items()}
# part of the gene id before the first '.' -> full gene id
gene_idbase_to_id = {i.split('.')[0]: i for i in gene_meta.index}

## Adata

In [10]:
# if dask_distribute:
#     from dask.distributed import Client
#     client = Client(dashboard_address=':5555')

In [11]:
# Open all files matching the pattern as one dataset and restrict the gene
# annotation to the genes present in both the dataset and the annotation.
gene_mcds = xr.open_mfdataset(mcds_pattern)
# Index.intersection() is the explicit set operation; `index & index` is
# deprecated as a set intersection in newer pandas.
use_gene = gene_mcds.get_index('gene').intersection(gene_meta.index)
gene_meta = gene_meta.reindex(use_gene)

In [12]:
# Display the opened dataset.
gene_mcds

In [13]:
# Keep only the 'gene_da' variable for mc_type 'CHN', the sampled cells and
# the annotated genes. Note that this rebinds `gene_mcds` to the selection.
selection = dict(mc_type='CHN',
                 cell=cell_tidy_data.index,
                 gene=gene_meta.index)
gene_mcds = gene_mcds['gene_da'].sel(**selection)
gene_mcds

In [14]:
# Write the sampled cell table to TEMP.msg; calculate_single_pair reads this
# file back inside each worker.
cell_tidy_data.to_msgpack('TEMP.msg')

In [15]:
# Write the selected array to TEMP.nc, the file later handed to the workers
# as `data_path`.
gene_mcds.to_netcdf('TEMP.nc')

## Pairwise test

In [10]:
def get_sig_features(rank_gene_dict):
    """Turn the 'pvals_adj' and 'names' entries of a ranking result into frames.

    Returns a tuple ``(pvals_adj, names)`` of two DataFrames built from
    ``rank_gene_dict['pvals_adj']`` and ``rank_gene_dict['names']``.
    """
    frames = {
        key: pd.DataFrame(rank_gene_dict[key])
        for key in ('pvals_adj', 'names')
    }
    return frames['pvals_adj'], frames['names']


def get_delta(cluster, gene, mean_df=None):
    """Return the mean of ``gene`` in ``cluster`` minus its mean in the other cluster.

    Parameters
    ----------
    cluster
        Column label of the cluster of interest.
    gene
        Row label of the gene.
    mean_df
        Gene-by-cluster table of mean values with exactly two cluster
        columns. When omitted, a module-level ``cluster_mean_gene_df`` is
        used, as in the original implementation.

    Returns
    -------
    Difference ``mean_df.loc[gene, cluster] - mean_df.loc[gene, other]``.
    """
    if mean_df is None:
        # Backward-compatible fallback; raises NameError when no such
        # module-level table exists.
        mean_df = cluster_mean_gene_df
    row = mean_df.loc[gene].copy()
    cluster_value = row.pop(cluster)
    # Only two clusters: after the pop, the single remaining entry is the
    # other cluster. Use positional access explicitly, because row[0] on a
    # label-indexed Series is deprecated.
    other_mean = row.iloc[0]
    return cluster_value - other_mean


def calculate_single_pair(data_path, pair):
    """Call marker genes between one pair of clusters and save them to disk.

    The result is written to ``TEMP/{cluster_a}/{cluster_b}.msg``. If that
    file already exists the pair is skipped, so a re-run does not recompute
    finished pairs.

    Besides its arguments the function reads the module-level names
    ``cluster_col``, ``delta_rate_cutoff``, ``top_n``, ``adj_p_cutoff``,
    ``auroc_cutoff`` and ``gene_meta``, and the cell table stored in
    ``TEMP.msg``.

    Parameters
    ----------
    data_path
        Path of the netCDF file opened with ``xr.open_dataarray``; it must
        have ``cell`` and ``gene`` indexes.
    pair
        Two cluster labels ``(cluster_a, cluster_b)``.

    Returns
    -------
    None
    """
    cluster_a, cluster_b = pair
    output_dir = pathlib.Path(f'TEMP/{cluster_a}')
    # parents=True: do not depend on the caller having created TEMP/ first.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f'{cluster_b}.msg'
    if output_path.exists():
        return

    # cells of the two clusters only
    this_tidy_data = pd.read_msgpack('TEMP.msg')
    this_tidy_data = this_tidy_data[this_tidy_data[cluster_col].isin(pair)]

    mcds = xr.open_dataarray(data_path).sel(cell=this_tidy_data.index).load()
    adata = anndata.AnnData(X=mcds.values,
                            obs=pd.DataFrame([], mcds.get_index('cell')),
                            var=pd.DataFrame([], mcds.get_index('gene')))
    adata.obs['cluster'] = this_tidy_data[cluster_col].astype('category')

    # Mirror the values around 1 (x -> 2 - x). The original note says the
    # normalized values are centered at 1.
    adata.X = (adata.X - 1) * -1 + 1

    # mean value of every gene in each cluster (genes x clusters)
    records = {}
    for cluster, sub_df in adata.obs.groupby('cluster'):
        sub_adata = adata[sub_df.index, :]
        gene_mean = sub_adata.X.mean(axis=0)
        records[cluster] = pd.Series(gene_mean, index=sub_adata.var_names)
    cluster_mean_gene_df = pd.DataFrame(records)

    # only test genes whose cluster means differ by more than the cutoff
    cluster_delta = (cluster_mean_gene_df[cluster_a] -
                     cluster_mean_gene_df[cluster_b]).abs()
    delta_judge = cluster_delta > delta_rate_cutoff
    if not delta_judge.any():
        # No gene to rank: store an empty table with the usual columns so the
        # pair counts as done and aggregation still works.
        pd.DataFrame(columns=[
            'pvals_adj', 'gene_id', 'cluster_from', 'cluster_to', 'gene_name',
            '-lgp', 'AUROC'
        ]).to_msgpack(output_path)
        return
    # Explicit copy: rank_genes_groups stores its result in .uns, which should
    # not be written on a view of `adata`.
    use_adata = adata[:, delta_judge].copy()

    sc.tl.rank_genes_groups(use_adata,
                            groupby='cluster',
                            n_genes=top_n,
                            method='wilcoxon')
    pvals_adj, names = get_sig_features(
        use_adata.uns['rank_genes_groups'])

    # one table per cluster of the pair: ranked genes of `cluster_from`
    # compared against `cluster_to`
    results = []
    for col in use_adata.obs['cluster'].unique():
        if col not in pair:
            continue
        df = pd.DataFrame({
            'pvals_adj': pvals_adj[col].tolist(),
            'gene_id': names[col].tolist()
        })
        df['cluster_from'] = col
        df['cluster_to'] = cluster_a if col == cluster_b else cluster_b
        results.append(df)

    # get total results filter by adj_p
    total_results = pd.concat(results)
    total_results['gene_name'] = total_results['gene_id'].map(
        gene_meta['gene_name'])
    total_results['-lgp'] = -np.log10(total_results['pvals_adj'])
    # p == 0 gives inf; cap it at a large finite value
    total_results['-lgp'] = total_results['-lgp'].replace(np.inf, 1000)
    total_results = total_results[
        total_results['pvals_adj'] < adj_p_cutoff].copy()

    # judge by auroc; a list (positional assignment) also works when no row
    # is left, where a row-wise DataFrame.apply would not return a Series
    total_results['AUROC'] = [
        get_auroc(gene_id, cluster_from, use_adata)
        for gene_id, cluster_from in zip(total_results['gene_id'],
                                         total_results['cluster_from'])
    ]
    total_results = total_results[total_results['AUROC'] > auroc_cutoff]

    # Per-cluster rates are not stored here.
    total_results.to_msgpack(output_path)
    return


def get_auroc(gene_id, cluster, adata):
    """Direction-free AUROC of one gene for separating ``cluster`` from the rest.

    The values of ``gene_id`` across all observations are scored against the
    boolean label ``adata.obs['cluster'] == cluster``. The raw AUROC is then
    folded around 0.5, so the returned value lies in [0.5, 1] whichever
    direction the gene separates the groups.
    """
    is_cluster = adata.obs['cluster'] == cluster
    gene_values = adata.obs_vector(gene_id)
    raw_auc = roc_auc_score(is_cluster, gene_values)
    return 0.5 + abs(raw_auc - 0.5)

In [11]:
# netCDF file passed to calculate_single_pair for every cluster pair.
data_path = 'TEMP.nc'

## Run pairwise marker

In [14]:
# Process the cluster pairs in chunks of `chunk_size`. Each chunk gets its own
# pool of `cpu` worker processes, and the pool is closed before the next chunk
# starts. `n` counts finished pairs and is printed every 100 pairs.
pair_marker_counts = {}
n = 0
chunk_starts = range(0, len(cluster_pairs), chunk_size)
for chunk_start in chunk_starts:
    chunk_stop = chunk_start + chunk_size
    pairs = cluster_pairs[chunk_start:chunk_stop]
    with ProcessPoolExecutor(cpu) as executor:
        temp_dir = 'TEMP'
        pathlib.Path(temp_dir).mkdir(exist_ok=True)
        futures = [
            executor.submit(calculate_single_pair, data_path, pair)
            for pair in pairs
        ]

        for future in as_completed(futures):
            # result() re-raises any exception from the worker
            future.result()
            n += 1
            if n % 100 == 0:
                print(n)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200


## Aggregate DEG

In [15]:
# Collect every per-pair result file below TEMP/ and stack them into a single
# table.
temp_dir = pathlib.Path('TEMP')
deg_list = list(temp_dir.glob('**/*msg'))

df_list = [pd.read_msgpack(path) for path in deg_list]

total_markers = pd.concat(df_list)

## Annotate rate

In [29]:
# Table of mean values, looked up as [cluster, gene_id].
gene_mean_mch = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/SubType.gene_mean_mch.msg')


def _lookup_rate(rate_table, row_labels, col_labels):
    """Return ``rate_table.at[r, c]`` for every aligned (r, c) pair as an array.

    Vectorized replacement for a row-wise ``apply`` with ``.at``: labels are
    translated to positions once, then the values are taken in a single
    indexing step. Raises KeyError when a label is missing, like ``.at``.
    """
    row_pos = rate_table.index.get_indexer(np.asarray(row_labels))
    col_pos = rate_table.columns.get_indexer(np.asarray(col_labels))
    if (row_pos < 0).any() or (col_pos < 0).any():
        # get_indexer marks unknown labels with -1, which would otherwise
        # silently pick the last row/column.
        raise KeyError('cluster or gene_id not found in gene_mean_mch')
    return rate_table.values[row_pos, col_pos]


# Value of each marker gene in its own cluster and in the cluster it is
# compared to.
total_markers['cluster_from_rate'] = _lookup_rate(
    gene_mean_mch, total_markers['cluster_from'], total_markers['gene_id'])
total_markers['cluster_to_rate'] = _lookup_rate(
    gene_mean_mch, total_markers['cluster_to'], total_markers['gene_id'])

In [41]:
# Effect sizes between the two clusters of each marker row.
total_markers['log2fc'] = np.log2(total_markers['cluster_from_rate'] /
                                  total_markers['cluster_to_rate'])
total_markers['delta'] = (total_markers['cluster_from_rate'] -
                          total_markers['cluster_to_rate'])
# Keep rows where the rate in cluster_from is lower than in cluster_to by more
# than the configured cutoff. `delta_rate_cutoff` replaces the hard-coded 0.3,
# so this filter follows the parameter cell.
total_markers = total_markers[
    total_markers['delta'] < -delta_rate_cutoff].copy()

## Save results

In [42]:
# Save the filtered marker table.
total_markers.to_msgpack('TotalPairwiseMarker.msg')

In [43]:
# Number of marker rows per ordered (cluster_from, cluster_to) pair, turned
# into a three-column table and saved as CSV.
marker_counts = total_markers.set_index(['cluster_from', 'cluster_to']).index.value_counts()
# rebuild the index as a MultiIndex so reset_index() yields one column per level
marker_counts.index = pd.MultiIndex.from_tuples(marker_counts.index.tolist())
marker_counts = marker_counts.reset_index()
marker_counts.columns = ['ClusterA', 'ClusterB', 'GeneCount']
# NOTE(review): index=None is presumably meant as index=False (do not write
# the row index) — confirm.
marker_counts.to_csv('Cluster_pair_marker_counts.csv', index=None)

In [44]:
# Show the ordered cluster pairs with fewer than 10 markers.
marker_counts[marker_counts['GeneCount'] < 10]

Unnamed: 0,ClusterA,ClusterB,GeneCount
4552,PT-L5 Ptprt,PT-L5 Tenm2,7
4553,CT-L6 Megf9,CT-L6 Il1rap,6
4554,NP-L6 Olfml2b,NP-L6 Cntnap5a,3
4555,CT-L6 Il1rap,CT-L6 Megf9,1


In [45]:
# Remove everything matching TEMP* in the working directory (TEMP.msg, TEMP.nc
# and the TEMP/ result folder). check=True raises CalledProcessError if the
# command fails, instead of ignoring the return code.
subprocess.run('rm -rf TEMP*', shell=True, check=True)

CompletedProcess(args='rm -rf TEMP*', returncode=0)

In [46]:
# Write the distinct marker gene ids to a text file, one id per line.
with open('TotalGeneID.txt', 'w') as f:
    f.writelines(f'{g}\n' for g in total_markers['gene_id'].unique())

In [47]:
# Number of distinct marker genes.
total_markers['gene_id'].unique().size

28721

In [51]:
# Number of marker rows in which each gene appears.
counts = total_markers['gene_id'].value_counts()

In [56]:
# Genes that appear in more than 20 marker rows.
counts[counts > 20]

ENSMUSG00000024642.17    1595
ENSMUSG00000030583.16    1555
ENSMUSG00000021338.17    1549
ENSMUSG00000021700.10    1548
ENSMUSG00000033306.15    1546
                         ... 
ENSMUSG00000077392.1       21
ENSMUSG00000062012.14      21
ENSMUSG00000021022.9       21
ENSMUSG00000106380.1       21
ENSMUSG00000062310.7       21
Name: gene_id, Length: 18323, dtype: int64