In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import scipy.stats as stats
import seaborn as sns
from statsmodels.stats.multitest import multipletests
import pathlib
import subprocess
import warnings
warnings.filterwarnings('ignore')
pathlib.Path('bed_tmp').mkdir(exist_ok=True)
pybedtools.set_tempdir('bed_tmp')

In [2]:
cluster_type = 'SubType'
clusters = ['PT-L5_Unc5b']
dmr_types = ['Hyper', 'Hypo']

# DMR that overlap (1) with any of these feature cols will be included in analysis
use_cols = None

or_cutoff = 1.3
neg_lgp_cutoff = 10

In [3]:
def get_data_by_cluster(dmr_annot, total_background_motif_hits, bg_bed, 
                        dmr_type, cluster, use_cols=None):
    dmr_bed_df = pd.read_csv(
        f'/home/hanliu/project/mouse_rostral_brain/study/DMRAccessibility/{cluster_type}/OpenDMR/{cluster}.{dmr_type}DMR.Open.bed',
        header=None,
        sep='\t',
        index_col=-1,
        names=['chrom', 'start', 'end', 'SubDMR'])
    dmr_bed = pybedtools.BedTool().from_dataframe(dmr_bed_df)

    this_dmr_annot = dmr_annot[dmr_bed_df.index, :]
    motif_ids = this_dmr_annot.var_names[this_dmr_annot.var['FeatureType'] ==
                                    'MotifHits']

    if use_cols is not None:
        dmr_judge = (this_dmr_annot[:, use_cols].X.sum(axis=1) > 0).A1

        this_dmr_annot = this_dmr_annot[dmr_judge, :]
        
    # exclude background that overlap with DMR
    bg_no_overlap = bg_bed.intersect(dmr_bed, v=True)
    use_bg = bg_no_overlap.to_dataframe().iloc[:, -1].values
    background_motif_hits = total_background_motif_hits[use_bg, :]

    # make sure col in same order
    background_motif_hits = background_motif_hits[:, motif_ids]
    
    return this_dmr_annot.copy(), background_motif_hits.copy()


def motif_enrichment(dmr_annot, background_motif_hits, cluster, dmr_type):
    
    motif_ids = dmr_annot.var_names[dmr_annot.var['FeatureType'] == 'MotifHits']
    # calculate motif occurence, not considering hits here
    pos = (dmr_annot[:, motif_ids].X > 0).sum(axis=0)
    pos_total = dmr_annot.shape[0]
    
    neg = (background_motif_hits.X > 0).sum(axis=0)
    neg_total = background_motif_hits.shape[0]
    
    # prepare tables
    tables = {}
    for motif, _pos, _neg in zip(motif_ids, pos.A1, neg.A1):
        table = [[_pos, pos_total - _pos], [_neg, neg_total - _neg]]
        tables[motif] = table
        
    results = {}
    with ProcessPoolExecutor(40) as executor:
        fs = {}
        for motif, t in tables.items():
            f = executor.submit(stats.fisher_exact, t, alternative='greater')
            fs[f] = motif
    
        for f in as_completed(fs):
            motif = fs[f]
            odds, p = f.result()
            results[motif] = {'oddsratio': odds, 'p_value': p}
    motif_enrich_df = pd.DataFrame(results).T
    
    _, p, _, _ = multipletests(motif_enrich_df['p_value'], method='fdr_bh')
    motif_enrich_df['adj_p'] = p
    
    motif_enrich_df['-lgp'] = -np.log10(motif_enrich_df['adj_p']).replace(
        -np.inf, -300)
    
    records = {}
    for motif, t in tables.items():
        tp, tn = t[0]
        fp, fn = t[1]
        tp_rate = tp / pos_total
        fp_rate = fp / neg_total
        records[motif] = dict(tp=tp,
                              tn=tn,
                              fp=fp,
                              fn=fn,
                              tp_rate=tp_rate,
                              fp_rate=fp_rate)
    counts = pd.DataFrame(records).T
    motif_enrich_df = pd.concat([motif_enrich_df, counts], axis=1, sort=True)
    
    motif_enrich_df[cluster_type] = cluster
    motif_enrich_df['DMRType'] = dmr_type
    
    # final filter
    filtered_motif_df = motif_enrich_df[(motif_enrich_df['oddsratio'] > or_cutoff)
                                        &
                                        (motif_enrich_df['-lgp'] > neg_lgp_cutoff)]
    print(filtered_motif_df.shape[0])
    
    motif_enrich_df.to_msgpack(f'{cluster}.{dmr_type}.motif_enrichment.msg')

## DMR hits

In [4]:
total_dmr_annot = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubDMRAnnotation/DMRAnnotation.h5ad'
)

## Background Hits

In [5]:
total_background_motif_hits = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/MotifScan/BackgroundAdultTissueDMR.MotifHits.with_region_bed.h5ad'
)
bg_bed = pybedtools.BedTool().from_dataframe(
    total_background_motif_hits.obs.reset_index().iloc[:, [1, 2, 3, 0]])

## Get cluster data

In [6]:
dmr_type = dmr_types[0]
cluster = clusters[0]

for cluster in clusters:
    for dmr_type in dmr_types:
        print(cluster, dmr_type)
        output_path = f'{cluster}.{dmr_type}.motif_enrichment.msg'
        if pathlib.Path(output_path).exists():
            continue
        dmr_data, bg_data = get_data_by_cluster(total_dmr_annot, total_background_motif_hits, bg_bed, 
                                                dmr_type, cluster, use_cols=use_cols)
        motif_enrichment(dmr_data, bg_data, cluster, dmr_type)

PT-L5_Unc5b Hyper
48
PT-L5_Unc5b Hypo
77


In [7]:
subprocess.run(['rm', '-f', 'bed_tmp'])

CompletedProcess(args=['rm', '-f', 'bed_tmp'], returncode=1)

In [8]:
with open('final_flag.txt', 'w') as f:
    f.write('Oh yeah')