In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import scipy.stats as stats
import seaborn as sns
import joblib
import pathlib
from statsmodels.stats.multitest import multipletests

## Parameters

In [2]:
# relevant score cutoff
# NOTE(review): rs_cutoff is not referenced by any cell visible in this notebook — confirm it is still needed
rs_cutoff = 0.3
min_dmr_to_test = 1000 # minimum number of DMRs required on either side of a pair (checked in prepare_table)

# motif enrichment
# NOTE(review): or_cutoff and neg_lgp_cutoff are not referenced by any visible cell — confirm where they are applied
or_cutoff = 1.6
neg_lgp_cutoff = 10
# fraction of each motif's maximum score used as that motif's masking cutoff
mask_quantile_to_max = 0.8

## Load Data

### Motif gene

In [3]:
# Load the motif annotation table, using its first column (motif_uid) as the index.
# NOTE(review): absolute, user-specific path — the cell only runs where this file exists.
motif_gene_anno = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/MotifClustering/JASPAR2020_CORE_vertebrates_non-redundant.mouse_genes.with_motif_group.199.csv', 
    index_col=0
)
motif_gene_anno.head()

Unnamed: 0_level_0,motif_name,motif_genes,gene_ids,gene_names,motif_group
motif_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MA0006.1,Ahr::Arnt,"Ahr,Arnt","ENSMUSG00000019256.17,ENSMUSG00000015522.18","Ahr,Arnt",MotifGroup178
MA0854.1,Alx1,Alx1,ENSMUSG00000036602.14,Alx1,MotifGroup3
MA0634.1,ALX3,ALX3,ENSMUSG00000014603.3,Alx3,MotifGroup3
MA0853.1,Alx4,Alx4,ENSMUSG00000040310.12,Alx4,MotifGroup3
MA0007.3,Ar,Ar,ENSMUSG00000046532.8,Ar,MotifGroup32


### Node Data

In [4]:
# Pairwise DMR matrix: later cells iterate adata.obs_names as pair ids, read the two
# cluster names of a pair from adata.obs, and use adata.var_names as DMR ids.
adata = anndata.read_h5ad('PairwiseDMR.h5ad')

In [5]:
# keep only the DMRs (columns of adata.X) that are non-zero in at least one row
# (the .A1 call assumes the column sums come back as a numpy matrix — TODO confirm adata.X is sparse)
use_dmr = adata.var_names[((adata.X != 0).sum(axis=0) != 0).A1]

### DMR Bed and Rate

In [6]:
# Read the 'bed' and 'Rate' tables from the DMR info store, restricted to the DMRs in use_dmr.
# dmr_rate is indexed by DMR and is later selected by cluster name columns in prepare_table.
# NOTE(review): dmr_bed_df is not used by any cell visible in this notebook — confirm it is needed.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5') as hdf:
    dmr_bed_df = hdf['bed'].loc[use_dmr].copy()
    dmr_rate = hdf['Rate'].loc[use_dmr].copy()

### DMR annot

In [7]:
# Load the motif scan result (presumably DMRs as rows and motifs as columns — the repr
# below shows obs columns 'chrom', 'start', 'end').
dmr_annot = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/MotifScan.h5ad'
)
# mask small motif scores: the per-motif cutoff is mask_quantile_to_max times that motif's
# maximum score, taken over every row of the file (i.e. before subsetting to use_dmr below)
motif_cutoff = pd.Series(dmr_annot.X.max(axis=0).todense().A1 * mask_quantile_to_max, index=dmr_annot.var_names)

# restrict the annotation to the DMRs kept in use_dmr
dmr_annot = dmr_annot[use_dmr, :].copy()
dmr_annot

AnnData object with n_obs × n_vars = 2217640 × 719 
    obs: 'chrom', 'start', 'end'

## Refilter scores

In [8]:
# only keep value larger than the cutoff for each motif:
# the boolean mask (score > that motif's cutoff, broadcast across rows) is cast to int and
# multiplied element-wise into X, so every score at or below its cutoff becomes 0.
# NOTE(review): comparing a sparse matrix against a dense row may build a dense
# intermediate of the full matrix shape — confirm memory use is acceptable.
dmr_annot.X = dmr_annot.X.multiply(
    (dmr_annot.X >
     motif_cutoff[dmr_annot.var_names].values[None, :]).astype(int)).tocsr()

## Prepare test input

In [4]:
def prepare_table(pair_id):
    """Build per-motif 2x2 hit-count tables for one cluster pair.

    Reads the notebook-level objects ``adata``, ``dmr_rate``, ``dmr_annot``
    and ``min_dmr_to_test``.

    Parameters
    ----------
    pair_id
        A label of ``adata.obs_names``; the matching ``adata.obs`` row holds
        the two cluster names ``(a, b)`` of the pair.

    Returns
    -------
    dict or pandas.DataFrame
        ``{motif: [[left_hit, left_no_hit], [right_hit, right_no_hit]]}``
        where "left" are the pair's DMRs whose rate is lower in ``a`` than in
        ``b`` and "right" the DMRs whose rate is lower in ``b``; a hit is a
        DMR with a motif score > 0. DMRs with equal rates are on neither side.
        If either side has fewer than ``min_dmr_to_test`` DMRs an empty
        DataFrame is returned instead; callers tell the two cases apart with
        ``isinstance(result, dict)``.
    """
    # get pair dmr
    a, b = adata.obs.loc[pair_id]
    this_dmrs = adata.var_names[adata.var_vector(pair_id).astype(bool)]
    this_dmr_rate = dmr_rate.loc[this_dmrs, [a, b]]
    a_hypo = this_dmr_rate.index[this_dmr_rate[a] < this_dmr_rate[b]]
    b_hypo = this_dmr_rate.index[this_dmr_rate[a] > this_dmr_rate[b]]

    # if DMR is not enough, skip and return empty record;
    # checked before slicing dmr_annot so skipped pairs do no annotation work
    if (a_hypo.size < min_dmr_to_test) or (b_hypo.size < min_dmr_to_test):
        empty_record = pd.DataFrame([],
                                    columns=[
                                        'oddsratio', 'p_value', 'adj_p',
                                        '-lgp', 'left_hit', 'left_no_hit',
                                        'right_hit', 'right_no_hit',
                                        'left_hit_rate', 'right_hit_rate',
                                        'Node'
                                    ])
        return empty_record

    # get dmr motif hits annotation
    left_dmr_annot = dmr_annot[a_hypo, :]
    right_dmr_annot = dmr_annot[b_hypo, :]

    # calculate motif occurence (number of DMRs with a score > 0), not
    # considering how many hits each DMR has; both sides keep the column
    # order of dmr_annot.var_names, so the counts line up with motif_ids
    motif_ids = dmr_annot.var_names
    left = (left_dmr_annot.X > 0).sum(axis=0).A1
    left_total = left_dmr_annot.shape[0]
    right = (right_dmr_annot.X > 0).sum(axis=0).A1
    right_total = right_dmr_annot.shape[0]

    # one [[hit, no_hit], [hit, no_hit]] table per motif
    tables = {
        motif: [[_left, left_total - _left], [_right, right_total - _right]]
        for motif, _left, _right in zip(motif_ids, left, right)
    }
    return tables

In [None]:
# Build the motif count tables for every pair in adata.obs_names, keyed by the pair's
# (a, b) cluster names. prepare_table returns an empty DataFrame instead of a dict
# for pairs with too few DMRs; those placeholders are stored as-is.
table_records = {}
for pair_id in adata.obs_names:
    print(pair_id)
    a, b = adata.obs.loc[pair_id]
    data = prepare_table(pair_id)
    table_records[(a, b)] = data

In [11]:
# cache the count tables on disk; the test section re-loads this file with joblib.load
joblib.dump(table_records, 'PairMotifCountTables.lib')

['PairMotifCountTables.lib']

## Test

In [4]:
def test_one_pair(a, b, tables):
    # do test
    results = {}    
    for motif, t in tables.items():
        odds, p = stats.fisher_exact(t, alternative='two-sided')    
        results[motif] = {'oddsratio': odds, 'p_value': p}
    motif_enrich_df = pd.DataFrame(results).T

    # p value correction
    _, p, _, _ = multipletests(motif_enrich_df['p_value'], method='fdr_bh')
    motif_enrich_df['adj_p'] = p
    motif_enrich_df['-lgp'] = -np.log10(motif_enrich_df['adj_p']).replace(
        -np.inf, -300)

    # assemble final results
    records = {}
    for motif, t in tables.items():
        tp, tn = t[0]
        fp, fn = t[1]
        tp_rate = tp / (tp + tn)
        fp_rate = fp / (fp + fn)
        records[motif] = dict(left_hit=tp,
                              left_no_hit=tn,
                              right_hit=fp,
                              right_no_hit=fn,
                              left_hit_rate=tp_rate,
                              right_hit_rate=fp_rate)
    counts = pd.DataFrame(records).T
    motif_enrich_df = pd.concat([motif_enrich_df, counts], axis=1, sort=True)
    motif_enrich_df['ClusterA'] = a
    motif_enrich_df['ClusterB'] = b
    
    # apply a minimum filter
    motif_enrich_df = motif_enrich_df[motif_enrich_df['-lgp'] > 2]

    return motif_enrich_df

In [5]:
# reload the cached count tables written by joblib.dump
# NOTE: joblib.load unpickles the file — only load files you produced yourself
table_records = joblib.load('PairMotifCountTables.lib')

In [6]:
# directory that receives one result file per pair; exist_ok=True keeps the cell re-runnable
temp_dir = 'TEMP'
pathlib.Path(temp_dir).mkdir(exist_ok=True)

In [7]:
# Run test_one_pair for every pair in parallel worker processes and save one
# msgpack file per pair. Pairs whose output file already exists are skipped,
# so an interrupted run can be resumed by re-running this cell.
with ProcessPoolExecutor(40) as executor:
    futures = {}
    for (a, b), tables in table_records.items():
        # pairs with too few DMRs carry an empty DataFrame instead of a dict
        if not isinstance(tables, dict):
            continue
        output_path = f'{temp_dir}/{a}-{b}.msg'
        if pathlib.Path(output_path).exists():
            continue

        future = executor.submit(test_one_pair, a, b, tables)
        futures[future] = (a, b, output_path)

    for future in as_completed(futures):
        a, b, output_path = futures[future]
        data = future.result()
        # Write under a temporary name and rename once the write finished:
        # a half-written file must never sit at output_path, because the
        # exists() check above would treat it as a completed pair on resume.
        temp_path = f'{output_path}.tmp'
        data.to_msgpack(temp_path)
        pathlib.Path(temp_path).replace(output_path)
        print(a, b, data.shape[0], sep='\t')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  app.launch_new_instance()


OLF-Exc_Cux2	OLF-Exc_Lrrtm3	400


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	OLF-Exc_Unc13c	414


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L23_Cux1	420


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L23_Ptprt	328
OLF-Exc_Sgcd	OLF-Exc_Unc13c	362


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	OLF-Exc_Sgcd	432


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L6_Cadps2	524


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L6_Oxr1	496
OLF-Exc_Cux2	IT-L5_Cdh8	403


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L5_Grik3	407


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L5_Cdh8	396


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L23_Tenm2	354


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L23_Ptprt	444


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L23_Foxp1	430
OLF-Exc_Cux2	IT-L5_Grik3	480


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Sgcd	OLF-Exc_Lrrtm3	327


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L4_Shc3	465


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L4_Astn2	508


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L6_Fstl4	522


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L23_Foxp1	397


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L6_Fstl4	443


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L6_Cadps2	474
OLF-Exc_Cdh9	IT-L23_Tenm2	413


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L5_Etv1	395


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L4_Astn2	442


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L4_Shc3	398


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	EP_Tspan5	509


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	CLA_Bcl11a	519


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L6_Man1c1	391


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L6_Man1c1	488


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	IT-L5_Etv1	427


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	CLA_Cdh8	511


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	EP_Adcy8	397
OLF-Exc_Lrrtm3	OLF-Exc_Unc13c	239


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	EP_Rgs8	410


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	IT-L6_Oxr1	427


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	CLA_Nrp2	467


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cux2	EP_Adcy8	448


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	EP_Tspan5	387


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	CLA_Bcl11a	456


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	CLA_Nrp2	392


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	EP_Rgs8	363


  result = getattr(ufunc, method)(*inputs, **kwargs)


OLF-Exc_Cdh9	CLA_Cdh8	426
OLF-Exc_Sgcd	EP_Adcy8	430
OLF-Exc_Sgcd	EP_Tspan5	433
OLF-Exc_Sgcd	CLA_Bcl11a	469
OLF-Exc_Sgcd	EP_Rgs8	421
OLF-Exc_Sgcd	IT-L6_Cadps2	478
OLF-Exc_Sgcd	IT-L6_Fstl4	464
OLF-Exc_Sgcd	IT-L6_Man1c1	436
OLF-Exc_Sgcd	IT-L6_Oxr1	474
OLF-Exc_Sgcd	IT-L5_Grik3	466
OLF-Exc_Sgcd	CLA_Cdh8	448
OLF-Exc_Sgcd	CLA_Nrp2	405
OLF-Exc_Sgcd	IT-L4_Shc3	453
OLF-Exc_Sgcd	IT-L5_Etv1	454
OLF-Exc_Sgcd	IT-L23_Foxp1	447
OLF-Exc_Sgcd	IT-L23_Cux1	448
OLF-Exc_Sgcd	IT-L23_Tenm2	446
OLF-Exc_Sgcd	IT-L23_Ptprt	446
OLF-Exc_Lrrtm3	EP_Tspan5	405
OLF-Exc_Sgcd	IT-L4_Astn2	464
OLF-Exc_Lrrtm3	IT-L6_Fstl4	433
OLF-Exc_Lrrtm3	IT-L6_Cadps2	461
OLF-Exc_Lrrtm3	CLA_Bcl11a	457
OLF-Exc_Sgcd	IT-L5_Cdh8	464
OLF-Exc_Lrrtm3	IT-L6_Oxr1	409
OLF-Exc_Lrrtm3	EP_Rgs8	391
OLF-Exc_Lrrtm3	EP_Adcy8	402
OLF-Exc_Lrrtm3	CLA_Cdh8	431
OLF-Exc_Lrrtm3	IT-L6_Man1c1	384
OLF-Exc_Lrrtm3	CLA_Nrp2	425
OLF-Exc_Lrrtm3	IT-L23_Cux1	419
EP_Adcy8	EP_Tspan5	384
OLF-Exc_Lrrtm3	IT-L4_Shc3	407
OLF-Exc_Lrrtm3	IT-L5_Grik3	420
OLF-Exc_Lrrtm3	IT-L4_Astn2	4

In [8]:
# Concatenate every per-pair result found in temp_dir (any file name ending in "msg")
# and save the combined table.
# NOTE(review): the msgpack reader/writer triggers the "recommended to use pyarrow"
# warning shown in this cell's output — confirm the pandas version still supports it.
total_enrichment = pd.concat([pd.read_msgpack(p) for p in pathlib.Path(temp_dir).glob('*msg')])
total_enrichment.to_msgpack('PairwiseMotifEnrichment.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.
It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  


In [9]:
# preview the first rows of the combined enrichment table
total_enrichment.head()

Unnamed: 0,oddsratio,p_value,adj_p,-lgp,left_hit,left_no_hit,right_hit,right_no_hit,left_hit_rate,right_hit_rate,ClusterA,ClusterB
MA0002.2,0.847514,2.531116e-16,8.348038e-16,15.078416,5680.0,94320.0,4898.0,68932.0,0.0568,0.066342,Gfra1_Gfra1,CA1_Ak5
MA0003.4,0.847453,0.0001631131,0.0003521871,3.453227,1141.0,98859.0,992.0,72838.0,0.01141,0.013436,Gfra1_Gfra1,CA1_Ak5
MA0017.2,1.38061,5.177632e-12,1.590905e-11,10.798356,1295.0,98705.0,695.0,73135.0,0.01295,0.009414,Gfra1_Gfra1,CA1_Ak5
MA0025.2,2.054668,3.8841530000000005e-128,6.649299e-127,126.177224,3908.0,96092.0,1433.0,72397.0,0.03908,0.019409,Gfra1_Gfra1,CA1_Ak5
MA0027.2,0.717857,6.434664e-54,4.44858e-53,52.351779,4598.0,95402.0,4645.0,69185.0,0.04598,0.062915,Gfra1_Gfra1,CA1_Ak5
