In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import scipy.stats as stats
import seaborn as sns
from statsmodels.stats.multitest import multipletests

## Parameters

In [2]:
# relevance score cutoff: DMRs scoring above +rs_cutoff or below -rs_cutoff
# for a node are assigned to that node's two sides
rs_cutoff = 0.3
min_dmr_to_test = 1000 # minimum number of DMRs required on either side for a node to be tested

# motif enrichment
# NOTE(review): or_cutoff and neg_lgp_cutoff are not read by any cell visible
# here - presumably they are consumed further down the notebook; confirm.
or_cutoff = 1.6
neg_lgp_cutoff = 10
# fraction of each motif's maximum score; scores not above
# (max score * this fraction) are masked out before testing
mask_quantile_to_max = 0.8

## Load Data

### Motif gene

In [3]:
# Motif-to-gene annotation table; the first CSV column (motif_uid) is used as
# the row index.
motif_gene_anno = pd.read_csv(
    filepath_or_buffer='/home/hanliu/project/mouse_rostral_brain/study/MotifClustering/JASPAR2020_CORE_vertebrates_non-redundant.mouse_genes.with_motif_group.199.csv',
    index_col=0,
)
# Preview the first rows only.
motif_gene_anno.head()

Unnamed: 0_level_0,motif_name,motif_genes,gene_ids,gene_names,motif_group
motif_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MA0006.1,Ahr::Arnt,"Ahr,Arnt","ENSMUSG00000019256.17,ENSMUSG00000015522.18","Ahr,Arnt",MotifGroup178
MA0854.1,Alx1,Alx1,ENSMUSG00000036602.14,Alx1,MotifGroup3
MA0634.1,ALX3,ALX3,ENSMUSG00000014603.3,Alx3,MotifGroup3
MA0853.1,Alx4,Alx4,ENSMUSG00000040310.12,Alx4,MotifGroup3
MA0007.3,Ar,Ar,ENSMUSG00000046532.8,Ar,MotifGroup32


### Node Data

In [4]:
# Load the node-level DMR results. Later cells iterate adata.obs_names as node
# ids and use adata.var_names as DMR ids.
adata = anndata.read_h5ad('NodeDMRResults.h5ad')

In [5]:
# For every DMR (column of X), count the nodes that carry a non-zero value,
# then keep only the DMRs that are non-zero in at least one node.
nonzero_nodes_per_dmr = (adata.X != 0).sum(axis=0)
use_dmr = adata.var_names[(nonzero_nodes_per_dmr != 0).A1]

### DMR Bed

In [6]:
# Read the table stored under key 'bed' and keep only the rows of the DMRs
# listed in use_dmr (same order as use_dmr).
# NOTE(review): no mode is passed to HDFStore, so pandas' default open mode
# applies; mode='r' may be preferable for this read-only access - confirm.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5') as hdf:
    dmr_bed_df = hdf['bed'].loc[use_dmr]

### DMR annot

In [7]:
dmr_annot = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/MotifScan.h5ad')

# Per-motif masking threshold: a fixed fraction (mask_quantile_to_max) of the
# motif's highest score. It is derived from the full matrix, i.e. before the
# DMR subset is taken below.
motif_max_score = dmr_annot.X.max(axis=0).todense().A1
motif_cutoff = pd.Series(motif_max_score * mask_quantile_to_max,
                         index=dmr_annot.var_names)

# Restrict the annotation to the DMRs in use_dmr; copy so the result is an
# independent object rather than a view of the loaded one.
dmr_annot = dmr_annot[use_dmr, :].copy()
dmr_annot

AnnData object with n_obs × n_vars = 297427 × 719 
    obs: 'chrom', 'start', 'end'

## Refilter scores

In [8]:
# only keep value larger than the cutoff for each motif
dmr_annot.X = dmr_annot.X.multiply(
    (dmr_annot.X >
     motif_cutoff[dmr_annot.var_names].values[None, :]).astype(int)).tocsr()

## Test for each node

In [9]:
# Example node id - presumably kept for stepping through the enrichment
# function by hand.
# NOTE(review): the per-node loop further down rebinds `node`, so this value
# does not influence the saved results.
node = '134'

In [10]:
def node_dmr_motif_enrichment(node, n_workers=40):
    """Test every motif for differential occurrence between a node's two DMR sets.

    DMRs whose relevance score for ``node`` is greater than ``rs_cutoff`` form
    the "left" set and those whose score is less than ``-rs_cutoff`` form the
    "right" set. For each motif in ``dmr_annot.var_names`` a 2x2 table
    ``[[left_hit, left_no_hit], [right_hit, right_no_hit]]`` is built, where a
    "hit" is a DMR with a value > 0 for that motif in ``dmr_annot.X``. Each
    table is tested with a two-sided Fisher's exact test and the p-values are
    corrected with the Benjamini-Hochberg procedure.

    Reads the notebook-level names ``adata``, ``dmr_annot``, ``rs_cutoff`` and
    ``min_dmr_to_test``.

    Parameters
    ----------
    node
        Key handed to ``adata.var_vector``; the notebook passes entries of
        ``adata.obs_names``.
    n_workers : int, default 40
        Number of worker processes used to run the Fisher's exact tests.

    Returns
    -------
    pandas.DataFrame
        One row per motif with ``-lgp`` > 1 and the columns ``oddsratio``,
        ``p_value``, ``adj_p``, ``-lgp``, ``left_hit``, ``left_no_hit``,
        ``right_hit``, ``right_no_hit``, ``left_hit_rate``,
        ``right_hit_rate`` and ``Node``. An empty frame with the same columns
        is returned when either side has fewer than ``min_dmr_to_test`` DMRs.
    """
    result_columns = [
        'oddsratio', 'p_value', 'adj_p',
        '-lgp', 'left_hit', 'left_no_hit',
        'right_hit', 'right_no_hit',
        'left_hit_rate', 'right_hit_rate',
        'Node'
    ]

    # get node relevance score of every DMR
    node_dmr_relevance_scores = pd.Series(adata.var_vector(node),
                                          index=adata.var_names)

    # get node left / right DMRs (strongly positive vs. strongly negative score)
    left_dmr = node_dmr_relevance_scores[node_dmr_relevance_scores > rs_cutoff]
    right_dmr = node_dmr_relevance_scores[
        node_dmr_relevance_scores < -rs_cutoff]

    # If EITHER side has too few DMRs, skip and return an empty record.
    # (The previous condition compared left_dmr.size twice, so the right side
    # was never checked.) Done before slicing dmr_annot to avoid useless work.
    if (left_dmr.size < min_dmr_to_test) or (right_dmr.size < min_dmr_to_test):
        return pd.DataFrame([], columns=result_columns)

    # get dmr motif hits annotation
    left_dmr_annot = dmr_annot[left_dmr.index, :]
    right_dmr_annot = dmr_annot[right_dmr.index, :]

    # Motif occurrence: number of DMRs with a value > 0 for each motif; the
    # magnitude of the value is not used here.
    motif_ids = dmr_annot.var_names
    left = (left_dmr_annot.X > 0).sum(axis=0)
    left_total = left_dmr_annot.shape[0]

    right = (right_dmr_annot.X > 0).sum(axis=0)
    right_total = right_dmr_annot.shape[0]

    # one 2x2 contingency table per motif
    tables = {}
    for motif, _left, _right in zip(motif_ids, left.A1, right.A1):
        tables[motif] = [[_left, left_total - _left],
                         [_right, right_total - _right]]

    # do test: one Fisher's exact test per motif, spread over worker processes
    results = {}
    with ProcessPoolExecutor(n_workers) as executor:
        futures = {
            motif: executor.submit(stats.fisher_exact,
                                   table,
                                   alternative='two-sided')
            for motif, table in tables.items()
        }
        # collect in submission order so the row order is deterministic
        for motif, future in futures.items():
            odds, p = future.result()
            results[motif] = {'oddsratio': odds, 'p_value': p}
    motif_enrich_df = pd.DataFrame(results).T

    # p value correction
    _, adj_p, _, _ = multipletests(motif_enrich_df['p_value'], method='fdr_bh')
    motif_enrich_df['adj_p'] = adj_p
    # log10(0) is -inf; cap it at -300 before negating so that '-lgp' tops out
    # at 300 instead of +inf
    lgp = np.log10(motif_enrich_df['adj_p']).replace(-np.inf, -300)
    motif_enrich_df['-lgp'] = -lgp

    # assemble final results: raw counts and per-side hit rates of every motif
    records = {}
    for motif, table in tables.items():
        left_hit, left_no_hit = table[0]
        right_hit, right_no_hit = table[1]
        records[motif] = dict(left_hit=left_hit,
                              left_no_hit=left_no_hit,
                              right_hit=right_hit,
                              right_no_hit=right_no_hit,
                              left_hit_rate=left_hit / left_total,
                              right_hit_rate=right_hit / right_total)
    counts = pd.DataFrame(records).T
    motif_enrich_df = pd.concat([motif_enrich_df, counts], axis=1, sort=True)
    motif_enrich_df['Node'] = node

    # apply a minimum filter (adjusted p below 0.1)
    motif_enrich_df = motif_enrich_df[motif_enrich_df['-lgp'] > 1]

    return motif_enrich_df

In [11]:
# Run the enrichment test node by node, collecting one table per node.
total_data = []
for node in adata.obs_names:
    data = node_dmr_motif_enrichment(node)
    total_data.append(data)
    # progress log: node id and number of motifs that passed the filter
    print(node, len(data))

77 0
78 0
79 0
80 0
81 0
82 0
83 0
84 0
85 0
86 0
87 0
88 0
89 0
90 0
91 0
92 0
93 6
94 0
95 0
96 29
97 0
98 0
99 16
100 0
101 0
102 0
103 0
104 0
105 0
106 0
107 0
108 0
109 27
110 25
111 0
112 51
113 0
114 44
115 12
116 0
117 0
118 0
119 99
120 0
121 0
122 0
123 103
124 141
125 45
126 0
127 24
128 113
129 149
130 132
131 0
132 0
133 64
134 266
135 163
136 87
137 196
138 69
139 0
140 164
141 285
142 39
143 229
144 16
145 276
146 67
147 166
148 278
149 79
150 123
151 84
152 225


In [12]:
# Stack the per-node tables into one frame and append the log2 odds ratio.
total_data = pd.concat(total_data).assign(
    log2odds=lambda df: np.log2(df['oddsratio']))
# NOTE(review): the captured warning below this cell comes from to_msgpack,
# which pandas has deprecated; switching formats would also require updating
# whatever reads 'NodeMotifEnrichment.msg', so it is left as is here.
total_data.to_msgpack('NodeMotifEnrichment.msg')

  result = getattr(ufunc, method)(*inputs, **kwargs)
It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
# Motif rows with |log2 odds| > 0.5 and -lgp > 4.
strong_hit_mask = (total_data['log2odds'].abs() > 0.5) & (total_data['-lgp'] > 4)
# Number of such rows per node.
strong_hits_per_node = total_data.loc[strong_hit_mask, 'Node'].value_counts()
# Align to the node order of adata (NaN = node has no such row) and show
# everything after the first 20 nodes.
strong_hits_per_node.reindex(adata.obs_names)[20:]

index
97       NaN
98       NaN
99       1.0
100      NaN
101      NaN
102      NaN
103      NaN
104      NaN
105      NaN
106      NaN
107      NaN
108      NaN
109      NaN
110      NaN
111      NaN
112      6.0
113      NaN
114      NaN
115      NaN
116      NaN
117      NaN
118      NaN
119     21.0
120      NaN
121      NaN
122      NaN
123      2.0
124     35.0
125      3.0
126      NaN
127      NaN
128     17.0
129     24.0
130      9.0
131      NaN
132      NaN
133      5.0
134     52.0
135     19.0
136     17.0
137     69.0
138      1.0
139      NaN
140     36.0
141     76.0
142      4.0
143     22.0
144      NaN
145     74.0
146      3.0
147     43.0
148    128.0
149     13.0
150     16.0
151      7.0
152     67.0
Name: Node, dtype: float64