In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import scipy.stats as stats
import seaborn as sns
from statsmodels.stats.multitest import multipletests
import pathlib
import subprocess
import warnings
warnings.filterwarnings('ignore')
pathlib.Path('bed_tmp').mkdir(exist_ok=True)
pybedtools.set_tempdir('bed_tmp')

In [2]:
cluster_type = 'SubType'
clusters = ['PT-L5_Unc5b']
dmr_types = ['Hyper', 'Hypo']

# DMR that overlap (1) with any of these feature cols will be included in analysis
use_cols = [
    'DNA.DNA', 'DNA.MULE-MuDR', 'DNA.MuDR', 'DNA.PiggyBac', 'DNA.TcMar',
    'DNA.TcMar-Mariner', 'DNA.TcMar-Pogo', 'DNA.TcMar-Tc2', 'DNA.TcMar-Tigger',
    'DNA.hAT', 'DNA.hAT-Blackjack', 'DNA.hAT-Charlie', 'DNA.hAT-Tip100'
]

or_cutoff = 1.3
neg_lgp_cutoff = 10

In [3]:
# Parameters
cluster_col = "SubType"
clusters = ["MGE-Sst_Rxra", "CA3_Cadm2", "CA1_Chrm3", "CA3-St18_Tead1", "Unc5c_Unc5c", "Gfra1_Gfra1", "ODC_odc-small", "PC_pc-all", "ODC_odc-large", "ANP_anp-dg", "IT-L5_Etv1", "CA1_Ptprg", "MGE-Sst_Ptpre", "NP-L6_Cntnap4", "CA3-St18_Nuak1", "CGE-Lamp5_Dock5", "CT-L6_Megf9", "IG-CA2_Chrm3", "IG-CA2_Peak1", "DG-po_Calb2", "DG_dg-all", "CGE-Vip_Ntng1", "CA1_Kif26a", "CA3_Efnb2", "CGE-Vip_Ptprm", "CA1_Ak5", "DG-po_Bcl11a", "OPC_opc-large", "ASC_cortex-olf", "MGC_mgc-all", "PT-L5_Tenm2", "ASC_str-hpf", "CGE-Vip_Robo1", "CA1_Lingo2", "ASC_mid", "MGE-Pvalb_Gfra2", "VLMC_Mapk4", "CA3-St18_Epha5", "PAL-Inh_Meis2", "IG-CA2_Xpr1", "EC_Abhd2", "VLMC-Pia_vlmc-pia-all", "MGE-Sst_Unc5b", "MGE-Pvalb_Thsd7a", "CGE-Vip_Grm8", "MGE-Sst_Dock4", "CGE-Lamp5_Grk5", "OLF_Xkr6", "VLMC_Col4a1", "OPC_opc-small", "ANP_anp-olf-cnu", "DG-po_Kctd8", "MSN-D2_Slc24a2", "CGE-Lamp5_Sorcs1", "CT-L6_Il1rap", "L6b_Adcy8", "MGE-Pvalb_Entpd3", "IT-L6_Man1c1", "MGE-Pvalb_Ptprk", "CGE-Vip_Ccser1", "NP-L6_Olfml2b", "CGE-Lamp5_Grid1", "MGE-Pvalb_Sema5a", "MGE-Sst_Kcnip4", "PT-L5_Abca12", "MGE-Sst_Frmd6", "MGE-Pvalb_Cnih3", "MGE-Sst_Ubtd1", "PT-L5_Nectin1", "MGE-Sst_Rerg", "CGE-Vip_Fstl4", "CGE-Vip_Galnt17", "MGE-Sst_Etv1", "IT-L23_Cux1", "IT-L23_Foxp1", "EC_Sema3g", "CGE-Vip_Clstn2", "IT-L4_Shc3", "IT-L5_Cdh8", "IT-L5_Grik3", "PT-L5_Tmtc2", "IT-L23_Tenm2", "NP-L6_Cntnap5a", "CT-L6_Hcrtr2", "PT-L5_Plcb4", "IT-L23_Ptprt", "CGE-Lamp5_Nrxn3", "CT-L6_Map4", "MGE-Sst_Chodl", "NP-L6_Boc", "PT-L5_Kcnh1", "OLF-Exc_Bmpr1b", "OLF_Trpc4", "PT-L5_Astn2", "IT-L6_Fstl4", "CLA_Bcl11a", "NP-L6_Cyp7b1", "CLA_Cdh8", "IT-L6_Cadps2", "PT-L5_Ptprt", "NP-L6_Kcnab1", "IT-L6_Oxr1", "Foxp2_Homer2", "MGE-Pvalb_Cacna1i", "MSN-D1_Khdrbs3", "MSN-D1_Plxnc1", "OLF_Mapk10", "MSN-D1_Hrh1", "Foxp2_Trpc7", "OLF_Pag1", "MSN-D2_Col14a1", "MGE-Sst_Bmper", "OLF-Exc_Pld5", "OLF_Gabbr2", "OLF_Kcnd3", "PAL-Inh_Deptor", "OLF-Exc_Lrrtm3", "OLF-Exc_Cdh9", "OLF-Exc_Unc13c", "PAL-Inh_Meis1", "L6b_Nrp2", "LSX-Inh_Cacna1i", "OLF-Exc_Sgcd", "OLF-Exc_Rmst", "PT-L5_Unc5b", "L6b_Pkhd1", "L6b_Kcnk2", "IT-L4_Astn2", "CLA_Nrp2", "D1L-Fstl4_Sipa1l2", "EP_Tspan5", "PAL-Inh_Rarb", "MSN-D2_Nrp2", "D1L-Fstl4_Trps1", "Foxp2_Dchs2", "OLF-Exc_Cux2", "PAL-Inh_Chat", "D1L-PAL_Flrt2", "EP_Rgs8", "PAL-Inh_Igdcc3", "PAL-Inh_Tmem178", "MSN-D1_Ntn1", "Foxp2_Inpp4b", "MSN-D2_Casz1", "Chd7_Kcnc2", "PAL-Inh_Tcf7l2", "D1L-Fstl4_Grm3", "D1L-Fstl4_Cadm1", "Chd7_Trpc7", "PAL-Inh_Ptprd", "D1L-Fstl4_Crim1", "Chd7_Megf11", "EP_Adcy8", "D1L-PAL_Plcxd3", "PAL-Inh_Onecut2", "LSX-Inh_Foxp2", "LSX-Inh_Enox1", "LSX-Inh_Dock10", "LSX-Inh_Nxph1", "LSX-Inh_Zeb2", "LSX-Inh_Lats2"]
dmr_types = ["Hyper", "Hypo"]
use_cols = ["DNA.DNA", "DNA.MULE-MuDR", "DNA.MuDR", "DNA.PiggyBac", "DNA.TcMar", "DNA.TcMar-Mariner", "DNA.TcMar-Pogo", "DNA.TcMar-Tc2", "DNA.TcMar-Tigger", "DNA.hAT", "DNA.hAT-Blackjack", "DNA.hAT-Charlie", "DNA.hAT-Tip100"]


In [4]:
def get_data_by_cluster(dmr_annot, total_background_motif_hits, bg_bed, 
                        dmr_type, cluster, use_cols=None):
    dmr_bed_df = pd.read_csv(
        f'/home/hanliu/project/mouse_rostral_brain/DMR/SubType/{dmr_type}Bed/{cluster}.{dmr_type}DMR.DMS2.bed',
        header=None,
        sep='\t',
        index_col=-1,
        names=['chrom', 'start', 'end', 'SubDMR'])
    dmr_bed = pybedtools.BedTool().from_dataframe(dmr_bed_df)

    this_dmr_annot = dmr_annot[dmr_bed_df.index, :]
    motif_ids = this_dmr_annot.var_names[this_dmr_annot.var['FeatureType'] ==
                                    'MotifHits']

    if use_cols is not None:
        dmr_judge = (this_dmr_annot[:, use_cols].X.sum(axis=1) > 0).A1

        this_dmr_annot = this_dmr_annot[dmr_judge, :]
        
    # exclude background that overlap with DMR
    bg_no_overlap = bg_bed.intersect(dmr_bed, v=True)
    use_bg = bg_no_overlap.to_dataframe().iloc[:, -1].values
    background_motif_hits = total_background_motif_hits[use_bg, :]

    # make sure col in same order
    background_motif_hits = background_motif_hits[:, motif_ids]
    
    return this_dmr_annot.copy(), background_motif_hits.copy()


def motif_enrichment(dmr_annot, background_motif_hits, cluster, dmr_type):
    
    motif_ids = dmr_annot.var_names[dmr_annot.var['FeatureType'] == 'MotifHits']
    # calculate motif occurence, not considering hits here
    pos = (dmr_annot[:, motif_ids].X > 0).sum(axis=0)
    pos_total = dmr_annot.shape[0]
    
    neg = (background_motif_hits.X > 0).sum(axis=0)
    neg_total = background_motif_hits.shape[0]
    
    # prepare tables
    tables = {}
    for motif, _pos, _neg in zip(motif_ids, pos.A1, neg.A1):
        table = [[_pos, pos_total - _pos], [_neg, neg_total - _neg]]
        tables[motif] = table
        
    results = {}
    with ProcessPoolExecutor(40) as executor:
        fs = {}
        for motif, t in tables.items():
            f = executor.submit(stats.fisher_exact, t, alternative='greater')
            fs[f] = motif
    
        for f in as_completed(fs):
            motif = fs[f]
            odds, p = f.result()
            results[motif] = {'oddsratio': odds, 'p_value': p}
    motif_enrich_df = pd.DataFrame(results).T
    
    _, p, _, _ = multipletests(motif_enrich_df['p_value'], method='fdr_bh')
    motif_enrich_df['adj_p'] = p
    
    motif_enrich_df['-lgp'] = -np.log10(motif_enrich_df['adj_p']).replace(
        -np.inf, -300)
    
    records = {}
    for motif, t in tables.items():
        tp, tn = t[0]
        fp, fn = t[1]
        tp_rate = tp / pos_total
        fp_rate = fp / neg_total
        records[motif] = dict(tp=tp,
                              tn=tn,
                              fp=fp,
                              fn=fn,
                              tp_rate=tp_rate,
                              fp_rate=fp_rate)
    counts = pd.DataFrame(records).T
    motif_enrich_df = pd.concat([motif_enrich_df, counts], axis=1, sort=True)
    
    motif_enrich_df[cluster_type] = cluster
    motif_enrich_df['DMRType'] = dmr_type
    
    # final filter
    filtered_motif_df = motif_enrich_df[(motif_enrich_df['oddsratio'] > or_cutoff)
                                        &
                                        (motif_enrich_df['-lgp'] > neg_lgp_cutoff)]
    print(filtered_motif_df.shape[0])
    
    motif_enrich_df.to_msgpack(f'{cluster}.{dmr_type}.motif_enrichment.msg')

## DMR hits

In [5]:
total_dmr_annot = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubDMRAnnotation/DMRAnnotation.h5ad'
)

## Background Hits

In [6]:
total_background_motif_hits = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/MotifScan/BackgroundAdultTissueDMR.MotifHits.with_region_bed.h5ad'
)
bg_bed = pybedtools.BedTool().from_dataframe(
    total_background_motif_hits.obs.reset_index().iloc[:, [1, 2, 3, 0]])

## Get cluster data

In [7]:
dmr_type = dmr_types[0]
cluster = clusters[0]

for cluster in clusters:
    for dmr_type in dmr_types:
        print(cluster, dmr_type)
        output_path = f'{cluster}.{dmr_type}.motif_enrichment.msg'
        if pathlib.Path(output_path).exists():
            continue
        dmr_data, bg_data = get_data_by_cluster(total_dmr_annot, total_background_motif_hits, bg_bed, 
                                                dmr_type, cluster, use_cols=use_cols)
        motif_enrichment(dmr_data, bg_data, cluster, dmr_type)

MGE-Sst_Rxra Hyper
56
MGE-Sst_Rxra Hypo
29
CA3_Cadm2 Hyper
28
CA3_Cadm2 Hypo
54
CA1_Chrm3 Hyper
36
CA1_Chrm3 Hypo
68
CA3-St18_Tead1 Hyper
20
CA3-St18_Tead1 Hypo
82
Unc5c_Unc5c Hyper
64
Unc5c_Unc5c Hypo
85
Gfra1_Gfra1 Hyper
0
Gfra1_Gfra1 Hypo
82
ODC_odc-small Hyper
71
ODC_odc-small Hypo
28
PC_pc-all Hyper
41
PC_pc-all Hypo
74
ODC_odc-large Hyper
68
ODC_odc-large Hypo
44
ANP_anp-dg Hyper
34
ANP_anp-dg Hypo
108
IT-L5_Etv1 Hyper
104
IT-L5_Etv1 Hypo
46
CA1_Ptprg Hyper
5
CA1_Ptprg Hypo
84
MGE-Sst_Ptpre Hyper
31
MGE-Sst_Ptpre Hypo
25
NP-L6_Cntnap4 Hyper
35
NP-L6_Cntnap4 Hypo
32
CA3-St18_Nuak1 Hyper
13
CA3-St18_Nuak1 Hypo
74
CGE-Lamp5_Dock5 Hyper
49
CGE-Lamp5_Dock5 Hypo
71
CT-L6_Megf9 Hyper
103
CT-L6_Megf9 Hypo
32
IG-CA2_Chrm3 Hyper
47
IG-CA2_Chrm3 Hypo
83
IG-CA2_Peak1 Hyper
42
IG-CA2_Peak1 Hypo
93
DG-po_Calb2 Hyper
4
DG-po_Calb2 Hypo
31
DG_dg-all Hyper
41
DG_dg-all Hypo
110
CGE-Vip_Ntng1 Hyper
22
CGE-Vip_Ntng1 Hypo
50
CA1_Kif26a Hyper
0
CA1_Kif26a Hypo
110
CA3_Efnb2 Hyper
17
CA3_Efnb2 Hypo
11

In [8]:
subprocess.run(['rm', '-f', 'bed_tmp'])

CompletedProcess(args=['rm', '-f', 'bed_tmp'], returncode=1)

In [9]:
with open('final_flag.txt', 'w') as f:
    f.write('Oh yeah')