In [27]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import json
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import scipy.stats as stats
import seaborn as sns
import pathlib
from statsmodels.stats.multitest import multipletests

In [28]:
or_cutoff = 1.6
neg_lgp_cutoff = 2
mask_quantile_to_max = 0.8

gene_cluster = '0'

In [29]:
output_dir = 'MotifEnrichment'
output_dir = pathlib.Path(output_dir)
output_dir.mkdir(exist_ok=True)

## DMR hits

In [3]:
with open('GeneCluster.relatedDMR.index.json') as f:
    use_dmrs = json.load(f)[gene_cluster]

In [4]:
gene_clusters = anndata.read_h5ad('GeneClustering.h5ad').obs['leiden']
use_genes = gene_clusters[gene_clusters == gene_cluster].index

In [5]:
print(use_genes.size, 'genes in gene cluster', gene_cluster)
print(len(use_dmrs), 'related DMRs')

60 genes in gene cluster 0
703 related DMRs


In [6]:
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5') as hdf:
    dmr_bed_df = hdf['bed'].loc[use_dmrs].copy()
dmr_bed_df.shape

(703, 4)

In [7]:
dmr_annot = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/MotifScan.h5ad'
)
# mask small motif scores
motif_cutoff = pd.Series(dmr_annot.X.max(axis=0).todense().A1 * mask_quantile_to_max, index=dmr_annot.var_names)

In [8]:
dmr_annot = dmr_annot[use_dmrs, :].copy()
dmr_annot

AnnData object with n_obs × n_vars = 703 × 719 
    obs: 'chrom', 'start', 'end'

## Background Hits

In [9]:
background_motif_hits = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/BackgroundDMR/MotifScan.h5ad'
)

In [10]:
dmr_bed = pybedtools.BedTool().from_dataframe(dmr_bed_df)
bg_bed = pybedtools.BedTool().from_dataframe(
    background_motif_hits.obs.reset_index().iloc[:, [1, 2, 3, 0]])

In [11]:
# exclude background that overlap with DMR
bg_no_overlap = bg_bed.intersect(dmr_bed, v=True)
use_bg = bg_no_overlap.to_dataframe().iloc[:, -1].values
background_motif_hits = background_motif_hits[use_bg, :]

# make sure col in same order
background_motif_hits = background_motif_hits[:, dmr_annot.var_names].copy()
background_motif_hits

AnnData object with n_obs × n_vars = 346752 × 719 
    obs: 'chrom', 'start', 'end'

## Redo motif score filter

In [12]:
# only keep value larger than the cutoff for each motif
dmr_annot.X = dmr_annot.X.multiply(
    (dmr_annot.X >
     motif_cutoff[dmr_annot.var_names].values[None, :]).astype(int)).tocsr()

In [13]:
# only keep value larger than the cutoff for each motif
background_motif_hits.X = background_motif_hits.X.multiply(
    (background_motif_hits.X >
     motif_cutoff[background_motif_hits.var_names].values[None, :]).astype(int)).tocsr()

## Motif hits contingency table

In [14]:
motif_ids = dmr_annot.var_names

# calculate motif occurence, not considering hits here
pos = (dmr_annot[:, motif_ids].X > 0).sum(axis=0)
pos_total = dmr_annot.shape[0]

neg = (background_motif_hits.X > 0).sum(axis=0)
neg_total = background_motif_hits.shape[0]

In [15]:
tables = {}
for motif, _pos, _neg in zip(motif_ids, pos.A1, neg.A1):
    table = [[_pos, pos_total - _pos], [_neg, neg_total - _neg]]
    tables[motif] = table

In [16]:
results = {}
with ProcessPoolExecutor(40) as executor:
    fs = {}
    for motif, t in tables.items():
        f = executor.submit(stats.fisher_exact, t, alternative='greater')
        fs[f] = motif

    for f in as_completed(fs):
        motif = fs[f]
        odds, p = f.result()
        results[motif] = {'oddsratio': odds, 'p_value': p}
motif_enrich_df = pd.DataFrame(results).T

_, p, _, _ = multipletests(motif_enrich_df['p_value'], method='fdr_bh')
motif_enrich_df['adj_p'] = p

motif_enrich_df['-lgp'] = -np.log10(motif_enrich_df['adj_p']).replace(
    -np.inf, -300)

records = {}
for motif, t in tables.items():
    tp, tn = t[0]
    fp, fn = t[1]
    tp_rate = tp / pos_total
    fp_rate = fp / neg_total
    records[motif] = dict(tp=tp,
                          tn=tn,
                          fp=fp,
                          fn=fn,
                          tp_rate=tp_rate,
                          fp_rate=fp_rate)
counts = pd.DataFrame(records).T
motif_enrich_df = pd.concat([motif_enrich_df, counts], axis=1, sort=True)

In [17]:
motif_enrich_df['GeneCluster'] = gene_cluster
motif_enrich_df['DMRType'] = 'Hypo'

In [18]:
motif_enrich_df = motif_enrich_df[motif_enrich_df['oddsratio'] > 1].copy()

## Add gene info

In [19]:
motif_gene_anno = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/MotifClustering/JASPAR2020_CORE_vertebrates_non-redundant.mouse_genes.with_motif_group.199.csv', 
    index_col=0
)
motif_gene_anno.head()

Unnamed: 0_level_0,motif_name,motif_genes,gene_ids,gene_names,motif_group
motif_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MA0006.1,Ahr::Arnt,"Ahr,Arnt","ENSMUSG00000019256.17,ENSMUSG00000015522.18","Ahr,Arnt",MotifGroup178
MA0854.1,Alx1,Alx1,ENSMUSG00000036602.14,Alx1,MotifGroup3
MA0634.1,ALX3,ALX3,ENSMUSG00000014603.3,Alx3,MotifGroup3
MA0853.1,Alx4,Alx4,ENSMUSG00000040310.12,Alx4,MotifGroup3
MA0007.3,Ar,Ar,ENSMUSG00000046532.8,Ar,MotifGroup32


In [20]:
motif_enrich_df = pd.concat([motif_enrich_df, motif_gene_anno.reindex(motif_enrich_df.index)], axis=1)

In [30]:
motif_enrich_df.to_msgpack(output_dir / f'{gene_cluster}.Hypo.motif_enrichment.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.


In [22]:
# final filter
filtered_motif_df = motif_enrich_df[(motif_enrich_df['oddsratio'] > or_cutoff)
                                    &
                                    (motif_enrich_df['-lgp'] > neg_lgp_cutoff)]
filtered_motif_df.shape[0]

8

In [23]:
filtered_motif_df

Unnamed: 0,oddsratio,p_value,adj_p,-lgp,tp,tn,fp,fn,tp_rate,fp_rate,GeneCluster,DMRType,motif_name,motif_genes,gene_ids,gene_names,motif_group
MA0631.1,21.781406,5e-06,0.000907,3.042374,5.0,698.0,114.0,346638.0,0.007112,0.000329,0,Hypo,Six3,Six3,ENSMUSG00000038805.10,Six3,MotifGroup93
MA0662.1,1.753441,0.000105,0.007518,2.123916,57.0,646.0,16613.0,330139.0,0.081081,0.04791,0,Hypo,MIXL1,MIXL1,ENSMUSG00000026497.7,Mixl1,MotifGroup3
MA0668.1,2.362828,7.6e-05,0.006062,2.217383,27.0,676.0,5764.0,340988.0,0.038407,0.016623,0,Hypo,NEUROD2,NEUROD2,ENSMUSG00000038255.6,Neurod2,MotifGroup5
MA0675.1,1.78447,3e-06,0.000907,3.042374,83.0,620.0,24198.0,322554.0,0.118065,0.069785,0,Hypo,NKX6-2,NKX6-2,ENSMUSG00000041309.17,Nkx6-2,MotifGroup3
MA0707.1,1.794309,1e-06,0.000907,3.042374,87.0,616.0,25302.0,321450.0,0.123755,0.072969,0,Hypo,MNX1,MNX1,ENSMUSG00000001566.9,Mnx1,MotifGroup3
MA0718.1,1.769509,7.4e-05,0.006062,2.217383,58.0,645.0,16769.0,329983.0,0.082504,0.04836,0,Hypo,RAX,RAX,ENSMUSG00000024518.4,Rax,MotifGroup3
MA0797.1,8.978211,1.9e-05,0.002314,2.635578,7.0,696.0,388.0,346364.0,0.009957,0.001119,0,Hypo,TGIF2,TGIF2,ENSMUSG00000062175.13,Tgif2,MotifGroup69
MA1571.1,6.001521,4e-06,0.000907,3.042374,11.0,692.0,916.0,345836.0,0.015647,0.002642,0,Hypo,TGIF2LX,TGIF2LX,ENSMUSG00000100133.1,Tgif2lx1,MotifGroup69
