In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import scipy.stats as stats
import seaborn as sns
from statsmodels.stats.multitest import multipletests

## Parameters

In [66]:
# Relevant-score cutoff: DMRs scoring above rs_cutoff / below -rs_cutoff for a
# node are assigned to that node's "left" / "right" DMR set.
rs_cutoff = 0.3
min_dmr_to_test = 1000 # minimum number of DMRs required, on either side

# motif enrichment
# NOTE(review): or_cutoff and neg_lgp_cutoff are not referenced in the cells
# visible here — presumably consumed downstream; confirm before removing.
or_cutoff = 1.6
neg_lgp_cutoff = 10
# Multiplied with each motif's maximum score to get the masking threshold.
# Despite the name this is a fraction of the max, not a quantile.
mask_quantile_to_max = 0.8

## Load Data

### Motif gene

In [44]:
# Load the motif -> gene annotation table, using the first CSV column as index.
# NOTE(review): absolute, user-specific path; consider a configurable data dir.
motif_gene_anno = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/MotifClustering/JASPAR2020_CORE_vertebrates_non-redundant.mouse_genes.with_motif_group.199.csv', 
    index_col=0
)
motif_gene_anno.head()

Unnamed: 0_level_0,motif_name,motif_genes,gene_ids,gene_names,motif_group
motif_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MA0006.1,Ahr::Arnt,"Ahr,Arnt","ENSMUSG00000019256.17,ENSMUSG00000015522.18","Ahr,Arnt",MotifGroup178
MA0854.1,Alx1,Alx1,ENSMUSG00000036602.14,Alx1,MotifGroup3
MA0634.1,ALX3,ALX3,ENSMUSG00000014603.3,Alx3,MotifGroup3
MA0853.1,Alx4,Alx4,ENSMUSG00000040310.12,Alx4,MotifGroup3
MA0007.3,Ar,Ar,ENSMUSG00000046532.8,Ar,MotifGroup32


### Node Data

In [4]:
# Load the node DMR results. Later cells iterate adata.obs_names as nodes and
# use adata.var_names as DMR ids.
adata = anndata.read_h5ad('NodeDMRResults.h5ad')

In [12]:
# Keep only the DMRs (columns of adata) that are non-zero in at least one row.
use_dmr = adata.var_names[
    np.asarray((adata.X != 0).sum(axis=0)).ravel() != 0
]

### DMR Bed

In [14]:
# Open the DMR info HDF5 store and keep only the 'bed' rows for the DMRs in use.
# NOTE(review): absolute, user-specific path; consider a configurable data dir.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5') as hdf:
    dmr_bed_df = hdf['bed'].loc[use_dmr]

### DMR annot

In [21]:
# Load the motif scan results (one row per DMR, one column per motif).
dmr_annot = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/MotifScan.h5ad')

# Per-motif masking threshold: a fixed fraction of that motif's maximum score,
# computed over all rows before the table is restricted to `use_dmr`.
motif_cutoff = pd.Series(
    dmr_annot.X.max(axis=0).toarray().ravel() * mask_quantile_to_max,
    index=dmr_annot.var_names,
)

# Restrict to the DMRs in use; copy so the result is a standalone object.
dmr_annot = dmr_annot[use_dmr, :].copy()
dmr_annot

AnnData object with n_obs × n_vars = 664466 × 719 
    obs: 'chrom', 'start', 'end'

## Refilter scores

In [22]:
# only keep value larger than the cutoff for each motif
dmr_annot.X = dmr_annot.X.multiply(
    (dmr_annot.X >
     motif_cutoff[dmr_annot.var_names].values[None, :]).astype(int)).tocsr()

## Test for each node

In [29]:
# Example node id, apparently left over from interactive testing of the
# function below; the per-node loop later rebinds `node`.
node = '134'

In [68]:
def node_dmr_motif_enrichment(node, max_workers=40):
    """Test each motif for differential occurrence between a node's two DMR sets.

    DMRs whose relevant score for ``node`` is above ``rs_cutoff`` form the
    "left" set and those below ``-rs_cutoff`` form the "right" set. For every
    motif in ``dmr_annot`` a 2x2 table (hit / no hit x left / right) is tested
    with a two-sided Fisher's exact test, and the p-values are corrected across
    motifs with Benjamini-Hochberg FDR.

    Reads the notebook-level globals ``adata``, ``dmr_annot``, ``rs_cutoff``
    and ``min_dmr_to_test``.

    Parameters
    ----------
    node
        Key passed to ``adata.var_vector``; the notebook passes entries of
        ``adata.obs_names``.
    max_workers : int, default 40
        Number of worker processes used to run the Fisher tests.

    Returns
    -------
    pandas.DataFrame
        One row per motif with ``-lgp > 1`` (adjusted p < 0.1), with columns
        ``oddsratio``, ``p_value``, ``adj_p``, ``-lgp``, the four cell counts,
        the two hit rates and ``Node``. An empty frame with the same columns is
        returned when either side has fewer than ``min_dmr_to_test`` DMRs.
    """
    result_columns = [
        'oddsratio', 'p_value', 'adj_p', '-lgp', 'left_hit', 'left_no_hit',
        'right_hit', 'right_no_hit', 'left_hit_rate', 'right_hit_rate', 'Node'
    ]

    # relevant score of every DMR for this node
    node_dmr_relevant_scores = pd.Series(adata.var_vector(node),
                                         index=adata.var_names)

    # split DMRs into the left / right side by the sign of the score
    left_dmr = node_dmr_relevant_scores[node_dmr_relevant_scores > rs_cutoff]
    right_dmr = node_dmr_relevant_scores[node_dmr_relevant_scores < -rs_cutoff]

    # Skip the node unless BOTH sides have enough DMRs. (The original checked
    # the left side twice, so a tiny or empty right side was still tested.)
    # Checked before slicing dmr_annot so skipped nodes cost nothing.
    if (left_dmr.size < min_dmr_to_test) or (right_dmr.size < min_dmr_to_test):
        return pd.DataFrame([], columns=result_columns)

    # motif hit annotation of each side
    left_dmr_annot = dmr_annot[left_dmr.index, :]
    right_dmr_annot = dmr_annot[right_dmr.index, :]

    # Motif occurrence per side: a DMR counts once if its score is > 0,
    # regardless of how large the score is.
    motif_ids = dmr_annot.var_names
    left_hits = (left_dmr_annot.X > 0).sum(axis=0).A1
    right_hits = (right_dmr_annot.X > 0).sum(axis=0).A1
    left_total = left_dmr_annot.shape[0]
    right_total = right_dmr_annot.shape[0]

    # 2x2 contingency table per motif: rows = left/right, cols = hit/no hit
    tables = {
        motif: [[n_left, left_total - n_left],
                [n_right, right_total - n_right]]
        for motif, n_left, n_right in zip(motif_ids, left_hits, right_hits)
    }

    # run the Fisher tests in parallel
    results = {}
    with ProcessPoolExecutor(max_workers) as executor:
        futures = {
            executor.submit(stats.fisher_exact, table,
                            alternative='two-sided'): motif
            for motif, table in tables.items()
        }
        for future in as_completed(futures):
            odds, p = future.result()
            results[futures[future]] = {'oddsratio': odds, 'p_value': p}
    motif_enrich_df = pd.DataFrame(results).T

    # p value correction
    _, adj_p, _, _ = multipletests(motif_enrich_df['p_value'], method='fdr_bh')
    motif_enrich_df['adj_p'] = adj_p
    # adj_p == 0 gives +inf; cap it at 300. The divide-by-zero warning is
    # expected in that case and silenced here.
    with np.errstate(divide='ignore'):
        neg_lgp = -np.log10(motif_enrich_df['adj_p'])
    motif_enrich_df['-lgp'] = neg_lgp.replace(np.inf, 300)

    # assemble the per-motif counts and hit rates
    records = {}
    for motif, table in tables.items():
        left_hit, left_no_hit = table[0]
        right_hit, right_no_hit = table[1]
        records[motif] = dict(left_hit=left_hit,
                              left_no_hit=left_no_hit,
                              right_hit=right_hit,
                              right_no_hit=right_no_hit,
                              left_hit_rate=left_hit / left_total,
                              right_hit_rate=right_hit / right_total)
    counts = pd.DataFrame(records).T
    # results arrive in completion order, so align on the motif index
    motif_enrich_df = pd.concat([motif_enrich_df, counts], axis=1, sort=True)
    motif_enrich_df['Node'] = node

    # apply a minimum filter: keep motifs with adjusted p < 0.1
    motif_enrich_df = motif_enrich_df[motif_enrich_df['-lgp'] > 1]

    return motif_enrich_df

In [73]:
# Run the enrichment for every node, logging how many motifs pass per node.
total_data = []
for node in adata.obs_names:
    data = node_dmr_motif_enrichment(node)
    total_data.append(data)
    print(node, len(data))

68 0
69 0
70 0
71 0
72 0
73 0
74 55
75 0
76 0
77 0
78 46
79 0
80 0
81 0
82 0
83 0
84 0
85 0
86 0
87 0
88 104
89 111
90 0
91 65
92 290
93 177
94 0
95 173
96 0
97 149
98 0
99 0
100 0
101 246
102 36
103 171
104 0
105 219
106 105
107 274
108 0
109 164
110 0
111 287
112 164
113 240
114 225
115 248
116 334
117 376


  result = getattr(ufunc, method)(*inputs, **kwargs)


118 210
119 310


  result = getattr(ufunc, method)(*inputs, **kwargs)


120 333
121 250
122 198
123 246
124 359
125 336
126 223
127 286
128 273
129 329
130 385
131 382


  result = getattr(ufunc, method)(*inputs, **kwargs)


132 494
133 336
134 299


In [74]:
# Stack the per-node tables and add the log2 odds ratio as an extra column.
# NOTE(review): this rebinds `total_data` from a list to a DataFrame, so the
# cell cannot be re-run without re-running the loop above.
total_data = pd.concat(total_data).assign(
    log2odds=lambda frame: np.log2(frame['oddsratio']))
# NOTE(review): DataFrame.to_msgpack is deprecated (see the warning below) and
# removed in pandas 1.0; changing the format would also affect any reader of
# this file, so it is left as is.
total_data.to_msgpack('NodeMotifEnrichment.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  


In [None]:
# Number of motifs per node passing |log2 odds| > 0.5 and -lgp > 4, reported in
# the order of adata.obs_names (nodes without any passing motif show NaN).
total_data.loc[
    (total_data['log2odds'].abs() > 0.5) & (total_data['-lgp'] > 4),
    'Node',
].value_counts().reindex(adata.obs_names)

In [1]:
import pandas as pd

In [2]:
# Load the pairwise marker table.
# NOTE(review): pd.read_msgpack is deprecated (see the warning below) and
# removed in pandas 1.0; the path is also absolute and user-specific.
markers = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusterMethylMarker/ExcSubTypePairwiseMarker/TotalPairwiseMarker.msg'
)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Marker rows for the 'CA3 Cadm2' -> 'CA1 Chrm3' cluster pair.
markers.loc[
    markers['cluster_from'].eq('CA3 Cadm2')
    & markers['cluster_to'].eq('CA1 Chrm3')
]

Unnamed: 0,pvals_adj,gene_id,cluster_from,cluster_to,gene_name,-lgp,AUROC,cluster_from_rate,cluster_to_rate,log2fc,delta
0,0.000000e+00,ENSMUSG00000030839.12,CA3 Cadm2,CA1 Chrm3,Sergef,1000.000000,0.999509,0.805568,2.599935,-1.690397,-1.794367
1,0.000000e+00,ENSMUSG00000051331.15,CA3 Cadm2,CA1 Chrm3,Cacna1c,1000.000000,0.999443,0.548179,1.384185,-1.336318,-0.836006
2,0.000000e+00,ENSMUSG00000042757.16,CA3 Cadm2,CA1 Chrm3,Tmem108,1000.000000,0.999369,0.526803,1.385357,-1.394922,-0.858554
3,0.000000e+00,ENSMUSG00000032017.14,CA3 Cadm2,CA1 Chrm3,Grik4,1000.000000,0.999266,0.364851,1.673336,-2.197347,-1.308485
4,0.000000e+00,ENSMUSG00000056222.15,CA3 Cadm2,CA1 Chrm3,Spock1,1000.000000,0.998970,0.651296,2.076884,-1.673035,-1.425588
5,0.000000e+00,ENSMUSG00000058145.16,CA3 Cadm2,CA1 Chrm3,Adamts17,1000.000000,0.998558,0.884658,1.676737,-0.922464,-0.792078
6,0.000000e+00,ENSMUSG00000021221.15,CA3 Cadm2,CA1 Chrm3,Dpf3,1000.000000,0.998058,0.560739,1.451610,-1.372253,-0.890871
7,0.000000e+00,ENSMUSG00000005089.15,CA3 Cadm2,CA1 Chrm3,Slc1a2,1000.000000,0.997807,0.412615,1.467578,-1.830568,-1.054963
8,0.000000e+00,ENSMUSG00000046318.16,CA3 Cadm2,CA1 Chrm3,Ccbe1,1000.000000,0.997372,0.545062,1.452975,-1.414517,-0.907913
9,0.000000e+00,ENSMUSG00000001741.12,CA3 Cadm2,CA1 Chrm3,Il16,1000.000000,0.997072,0.568977,1.680193,-1.562184,-1.111216
