In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import json
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import scipy.stats as stats
import seaborn as sns
import pathlib
from statsmodels.stats.multitest import multipletests

In [2]:
# Default parameters for this notebook.

# Gene cluster to analyse (string label, compared against cluster labels below)
gene_cluster = '0'

# A motif's score in a region is kept only if it exceeds this fraction of the
# motif's maximum score across all scanned DMRs.
mask_quantile_to_max = 0.8

# Thresholds of the final filter: minimum odds ratio and minimum -log10(adjusted p)
or_cutoff = 1.6
neg_lgp_cutoff = 2

In [3]:
# Parameters
# Run-specific values; these rebind the defaults assigned in the previous cell.
gene_cluster = "8"
mask_quantile_to_max = 0.8
or_cutoff = 1.6
neg_lgp_cutoff = 3


In [4]:
# Directory (relative to the working directory) that receives the result files;
# created on first run, reused afterwards.
output_dir = pathlib.Path('MotifEnrichment')
output_dir.mkdir(exist_ok=True)

## DMR hits

In [5]:
# The JSON file is keyed by gene-cluster label; take the entry for the cluster
# under analysis (used below as the DMR ids related to that cluster).
with open('GeneCluster.relatedDMR.index.json') as index_file:
    related_dmrs_by_cluster = json.load(index_file)
use_dmrs = related_dmrs_by_cluster[gene_cluster]

In [6]:
# Leiden cluster label per observation of the gene clustering result
gene_clusters = anndata.read_h5ad('GeneClustering.h5ad').obs['leiden']

# Index entries whose label equals the selected gene cluster
in_selected_cluster = gene_clusters == gene_cluster
use_genes = gene_clusters.loc[in_selected_cluster].index

In [7]:
# Quick sanity summary of the inputs selected for this cluster
n_cluster_genes = use_genes.size
n_related_dmrs = len(use_dmrs)
print(n_cluster_genes, 'genes in gene cluster', gene_cluster)
print(n_related_dmrs, 'related DMRs')

44 genes in gene cluster 8
7953 related DMRs


In [8]:
# Load the BED-style coordinates of the cluster's related DMRs.
# The store is opened read-only: nothing is written here, and HDFStore's
# default append mode would create an empty file if the path were wrong and
# takes a write-capable handle on a file that other runs may be reading.
with pd.HDFStore(
        '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5',
        mode='r') as hdf:
    # .copy() detaches the selected rows from the frame read from the store
    dmr_bed_df = hdf['bed'].loc[use_dmrs].copy()
dmr_bed_df.shape

(7953, 4)

In [9]:
# Motif scan scores: regions are observations, motifs are variables
dmr_motif_scan_path = '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/MotifScan.h5ad'
dmr_annot = anndata.read_h5ad(dmr_motif_scan_path)

# Per-motif masking threshold: a fixed fraction of the motif's maximum score,
# taken over every region in the file (computed before any subsetting).
per_motif_max = dmr_annot.X.max(axis=0).todense().A1
motif_cutoff = pd.Series(per_motif_max * mask_quantile_to_max,
                         index=dmr_annot.var_names)

In [10]:
# Restrict the motif scan to the DMRs related to the selected gene cluster.
# The name is rebound to an independent copy, so the full scan loaded above is
# no longer referenced by `dmr_annot` after this cell (re-running this cell
# alone therefore subsets the already-subsetted object).
dmr_annot = dmr_annot[use_dmrs, :].copy()
dmr_annot

AnnData object with n_obs × n_vars = 7953 × 719 
    obs: 'chrom', 'start', 'end'

## Background Hits

In [11]:
# Motif scan of the background regions (the comparison group for enrichment)
background_scan_path = '/home/hanliu/project/mouse_rostral_brain/DMR/BackgroundDMR/MotifScan.h5ad'
background_motif_hits = anndata.read_h5ad(background_scan_path)

In [12]:
# Intervals of the cluster's DMRs
dmr_bed = pybedtools.BedTool().from_dataframe(dmr_bed_df)

# Background intervals: after reset_index() the region id sits in column 0,
# so [1, 2, 3, 0] moves the three obs columns first and the id last.
bg_intervals = background_motif_hits.obs.reset_index().iloc[:, [1, 2, 3, 0]]
bg_bed = pybedtools.BedTool().from_dataframe(bg_intervals)

In [13]:
# Remove background regions that overlap any DMR of this cluster
# (v=True reports only the bg_bed intervals without an overlap in dmr_bed).
bg_no_overlap = bg_bed.intersect(dmr_bed, v=True)

# The region id was placed in the last column when bg_bed was built
use_bg = bg_no_overlap.to_dataframe().iloc[:, -1].values

# Keep the surviving regions and put the motif columns in the same order as
# the DMR matrix, so both matrices can be compared column by column.
background_motif_hits = background_motif_hits[use_bg, :][:, dmr_annot.var_names].copy()
background_motif_hits

AnnData object with n_obs × n_vars = 345786 × 719 
    obs: 'chrom', 'start', 'end'

## Redo motif score filter

In [14]:
# Only keep scores strictly larger than the per-motif cutoff; everything else
# becomes zero.
# The filter is applied to the stored entries of the CSR matrix instead of
# comparing the sparse matrix with a dense row, which would materialise dense
# (regions x motifs) intermediates for the comparison and the integer mask.
_scores = dmr_annot.X.tocsr(copy=True)
_cutoffs = motif_cutoff[dmr_annot.var_names].values
# In CSR layout `indices` gives the column (motif) of every stored entry, so
# `_cutoffs[_scores.indices]` is the cutoff that applies to each stored score.
_scores.data[_scores.data <= _cutoffs[_scores.indices]] = 0
# Drop the entries that were just zeroed from the sparse structure
_scores.eliminate_zeros()
dmr_annot.X = _scores

In [15]:
# Only keep scores strictly larger than the per-motif cutoff; everything else
# becomes zero (same rule as for the DMR matrix).
# The filter is applied to the stored entries of the CSR matrix instead of
# comparing the sparse matrix with a dense row, which would materialise dense
# (regions x motifs) intermediates for the comparison and the integer mask.
_bg_scores = background_motif_hits.X.tocsr(copy=True)
_bg_cutoffs = motif_cutoff[background_motif_hits.var_names].values
# In CSR layout `indices` gives the column (motif) of every stored entry, so
# `_bg_cutoffs[_bg_scores.indices]` is the cutoff for each stored score.
_bg_scores.data[_bg_scores.data <= _bg_cutoffs[_bg_scores.indices]] = 0
# Drop the entries that were just zeroed from the sparse structure
_bg_scores.eliminate_zeros()
background_motif_hits.X = _bg_scores

## Motif hits contingency table

In [16]:
motif_ids = dmr_annot.var_names

# Number of regions in each group
pos_total, neg_total = dmr_annot.shape[0], background_motif_hits.shape[0]

# Motif occurrence per group: a region counts once for a motif when it has a
# non-zero (post-masking) score, no matter how many hits it carries.
pos = (dmr_annot[:, motif_ids].X > 0).sum(axis=0)
neg = (background_motif_hits.X > 0).sum(axis=0)

In [17]:
# One 2x2 contingency table per motif:
#   rows    -> [cluster DMRs, background regions]
#   columns -> [regions with the motif, regions without the motif]
tables = {
    motif_id: [[n_pos, pos_total - n_pos], [n_neg, neg_total - n_neg]]
    for motif_id, n_pos, n_neg in zip(motif_ids, pos.A1, neg.A1)
}

In [18]:
# One-sided Fisher's exact test per motif (is the motif more frequent in the
# cluster DMRs than in the background?), evaluated in worker processes.
results = {}
with ProcessPoolExecutor(40) as executor:
    future_to_motif = {
        executor.submit(stats.fisher_exact, table, alternative='greater'): motif_id
        for motif_id, table in tables.items()
    }
    # Futures finish in arbitrary order; the mapping recovers the motif id
    for future in as_completed(future_to_motif):
        odds, p_value = future.result()
        results[future_to_motif[future]] = {'oddsratio': odds, 'p_value': p_value}
motif_enrich_df = pd.DataFrame(results).T

# Benjamini-Hochberg correction across all tested motifs
motif_enrich_df['adj_p'] = multipletests(motif_enrich_df['p_value'],
                                         method='fdr_bh')[1]

# -log10(adjusted p); an adjusted p of exactly 0 would give +inf and is capped at 300
neg_log_adj_p = -np.log10(motif_enrich_df['adj_p'])
motif_enrich_df['-lgp'] = neg_log_adj_p.replace(np.inf, 300)

# Raw counts behind every test, taken from the contingency tables:
#   tp / tn -> cluster DMRs with / without the motif
#   fp / fn -> background regions with / without the motif
#   tp_rate, fp_rate -> fraction of each group carrying the motif
records = {
    motif_id: dict(tp=dmr_with,
                   tn=dmr_without,
                   fp=bg_with,
                   fn=bg_without,
                   tp_rate=dmr_with / pos_total,
                   fp_rate=bg_with / neg_total)
    for motif_id, ((dmr_with, dmr_without), (bg_with, bg_without)) in tables.items()
}
counts = pd.DataFrame(records).T

# Column-wise join of statistics and counts, aligned on the motif index
motif_enrich_df = pd.concat([motif_enrich_df, counts], axis=1, sort=True)

In [19]:
# Tag every row with the analysed gene cluster and the DMR type
motif_enrich_df = motif_enrich_df.assign(GeneCluster=gene_cluster,
                                         DMRType='Hypo')

In [20]:
# Keep only motifs that are more frequent in the cluster DMRs than in the
# background (odds ratio above 1); the copy makes the result independent.
has_positive_enrichment = motif_enrich_df['oddsratio'] > 1
motif_enrich_df = motif_enrich_df.loc[has_positive_enrichment].copy()

## Add gene info

In [21]:
# Motif annotation table; the first CSV column becomes the index
motif_gene_anno_path = '/home/hanliu/project/mouse_rostral_brain/study/MotifClustering/JASPAR2020_CORE_vertebrates_non-redundant.mouse_genes.with_motif_group.199.csv'
motif_gene_anno = pd.read_csv(motif_gene_anno_path, index_col=0)
motif_gene_anno.head()

Unnamed: 0_level_0,motif_name,motif_genes,gene_ids,gene_names,motif_group
motif_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MA0006.1,Ahr::Arnt,"Ahr,Arnt","ENSMUSG00000019256.17,ENSMUSG00000015522.18","Ahr,Arnt",MotifGroup178
MA0854.1,Alx1,Alx1,ENSMUSG00000036602.14,Alx1,MotifGroup3
MA0634.1,ALX3,ALX3,ENSMUSG00000014603.3,Alx3,MotifGroup3
MA0853.1,Alx4,Alx4,ENSMUSG00000040310.12,Alx4,MotifGroup3
MA0007.3,Ar,Ar,ENSMUSG00000046532.8,Ar,MotifGroup32


In [22]:
# Append the annotation columns; reindexing first puts the annotation rows in
# the same order as the enrichment table (motifs without annotation get NaN).
anno_for_tested_motifs = motif_gene_anno.reindex(motif_enrich_df.index)
motif_enrich_df = pd.concat([motif_enrich_df, anno_for_tested_motifs], axis=1)

In [23]:
# Save the enrichment table (already restricted to odds ratio > 1) for this cluster.
# NOTE(review): this call emits the warning shown below this cell; `to_msgpack`
# is deprecated in pandas. The format is kept because readers of the `.msg`
# files are not visible here; migrate writer and readers together.
enrichment_path = output_dir / f'{gene_cluster}.Hypo.motif_enrichment.msg'
motif_enrich_df.to_msgpack(enrichment_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.


In [24]:
# Final filter: a motif must pass both the odds-ratio cutoff and the
# -log10(adjusted p) cutoff; the cell displays how many motifs remain.
passes_odds_ratio = motif_enrich_df['oddsratio'] > or_cutoff
passes_significance = motif_enrich_df['-lgp'] > neg_lgp_cutoff
filtered_motif_df = motif_enrich_df[passes_odds_ratio & passes_significance]
filtered_motif_df.shape[0]

10

In [25]:
# Display the motifs that passed the final odds-ratio and -lgp filter
filtered_motif_df

Unnamed: 0,oddsratio,p_value,adj_p,-lgp,tp,tn,fp,fn,tp_rate,fp_rate,GeneCluster,DMRType,motif_name,motif_genes,gene_ids,gene_names,motif_group
MA0623.2,1.770442,6.294511e-11,3.771461e-09,8.42349,160.0,7793.0,3964.0,341822.0,0.020118,0.011464,8,Hypo,NEUROG1,NEUROG1,ENSMUSG00000048904.5,Neurog1,MotifGroup5
MA0668.1,1.786405,1.938195e-15,2.787124e-13,12.554844,233.0,7720.0,5745.0,340041.0,0.029297,0.016614,8,Hypo,NEUROD2,NEUROD2,ENSMUSG00000038255.6,Neurod2,MotifGroup5
MA0669.1,1.705954,1.862617e-07,8.928146e-06,5.049239,113.0,7840.0,2897.0,342889.0,0.014208,0.008378,8,Hypo,NEUROG2,NEUROG2,ENSMUSG00000027967.8,Neurog2,MotifGroup5
MA0678.1,1.789233,1.337725e-07,6.870175e-06,5.163032,99.0,7854.0,2419.0,343367.0,0.012448,0.006996,8,Hypo,OLIG2,OLIG2,ENSMUSG00000039830.9,Olig2,MotifGroup5
MA0818.1,1.806176,1.047009e-11,6.843634e-10,9.164713,163.0,7790.0,3960.0,341826.0,0.020495,0.011452,8,Hypo,BHLHE22,BHLHE22,ENSMUSG00000025128.7,Bhlhe22,MotifGroup5
MA0826.1,1.709228,1.163413e-06,4.64719e-05,4.33281,97.0,7856.0,2480.0,343306.0,0.012197,0.007172,8,Hypo,OLIG1,OLIG1,ENSMUSG00000046160.6,Olig1,MotifGroup5
MA0827.1,1.946115,5.322447e-15,6.378065e-13,12.195311,175.0,7778.0,3952.0,341834.0,0.022004,0.011429,8,Hypo,OLIG3,OLIG3,ENSMUSG00000045591.6,Olig3,MotifGroup5
MA1109.1,2.042273,1.383616e-45,3.316067e-43,42.479377,528.0,7425.0,11635.0,334151.0,0.06639,0.033648,8,Hypo,NEUROD1,NEUROD1,ENSMUSG00000034701.9,Neurod1,MotifGroup5
MA1568.1,2.040932,2.019096e-05,0.0006912999,3.160333,44.0,7909.0,940.0,344846.0,0.005533,0.002718,8,Hypo,TCF21(var.2),TCF21,ENSMUSG00000045680.8,Tcf21,MotifGroup5
MA1642.1,2.02049,1.073178e-70,7.716151e-68,67.112599,883.0,7070.0,20130.0,325656.0,0.111027,0.058215,8,Hypo,NEUROG2(var.2),NEUROG2,ENSMUSG00000027967.8,Neurog2,MotifGroup5
