In [15]:
import gzip
import json
import pathlib
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import poisson
from statsmodels.stats.multitest import multipletests

In [None]:
def open_cell_record(path):
    """Open a per-cell record file in binary mode, decompressing gzip if needed.

    The record files are globbed by their ``.gz`` suffix, so they must not be
    read as plain text. The first two bytes are checked against the gzip magic
    number so that a plain JSON file carrying a ``.gz`` name still loads.

    Parameters
    ----------
    path : str or pathlib.Path
        File to open.

    Returns
    -------
    A binary file object; ``json.load`` accepts it and detects the encoding.
    """
    with open(path, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'
    if is_gzip:
        return gzip.open(path, 'rb')
    return open(path, 'rb')


# region_records[chrom][bin_id] = number of record files listing that bin
region_records = {}
n = 0
# sorted() makes the file order (and so the dict insertion order) reproducible
for n, output_path in enumerate(sorted(pathlib.Path('rmdl_temp_dir/').glob('*.gz')), start=1):
    if n % 500 == 0:
        print(n, end=' ')
    with open_cell_record(output_path) as f:
        cell_record = json.load(f)
    # the file is closed before counting; only the parsed record is needed
    for chrom, chrom_bins in cell_record['bins'].items():
        if chrom not in region_records:
            region_records[chrom] = defaultdict(int)
        chrom_counts = region_records[chrom]
        for bin_id in chrom_bins:
            chrom_counts[bin_id] += 1

In [95]:
def calculate_blacklist_region(region_records, alpha=0.01):
    """Flag bins whose counts are significantly above a global Poisson background.

    Every bin's count is compared against one Poisson distribution whose mean
    is the average count over all bins present in ``region_records``. The
    upper-tail values ``poisson.sf(count, mu)`` of all bins are corrected
    together with the Benjamini-Hochberg procedure, and the bins rejected at
    ``alpha`` form the blacklist.

    Parameters
    ----------
    region_records : dict
        Maps chromosome -> mapping of bin id -> count.
    alpha : float, default 0.01
        FDR level passed to ``multipletests``.

    Returns
    -------
    dict
        Maps every chromosome in ``region_records`` to the list of its
        blacklisted bin ids, in the order the bins appear in the input mapping.
        All lists are empty when the input holds no bins or nothing is rejected.
    """
    # calculate region poisson mu
    sum_of_bin = sum(sum(bins.values()) for bins in region_records.values())
    n_bin = sum(len(bins) for bins in region_records.values())
    if n_bin == 0:
        # no bins at all: nothing to test, and mu would be a division by zero
        return {chrom: [] for chrom in region_records}
    mu = sum_of_bin / n_bin

    # p-values are computed once per chromosome and concatenated in iteration
    # order, so the rejection mask can be sliced back by chromosome below
    chrom_bin_ids = {}
    p_chunks = []
    for chrom, bins in region_records.items():
        chrom_bin_ids[chrom] = list(bins.keys())
        counts = np.fromiter(bins.values(), dtype=float, count=len(bins))
        p_chunks.append(poisson.sf(counts, mu))
    total_p = np.concatenate(p_chunks)
    del p_chunks

    # FDR correction over every bin at once
    rejected, *_ = multipletests(total_p, alpha=alpha, method='fdr_bh')
    del total_p

    # calculate region blacklist: use the rejection mask itself rather than a
    # strict "p < largest rejected p" test, which would drop the bins sitting
    # exactly on the cutoff and fail outright when nothing is rejected
    final_blacklist = {}
    offset = 0
    for chrom, bin_ids in chrom_bin_ids.items():
        chrom_mask = rejected[offset:offset + len(bin_ids)]
        final_blacklist[chrom] = [
            bin_id for bin_id, is_hit in zip(bin_ids, chrom_mask) if is_hit
        ]
        offset += len(bin_ids)
    return final_blacklist

In [97]:
# Build the blacklist from the counts in this cell instead of relying on a
# `final_blacklist` left in the kernel by an earlier run; each chromosome's
# bins are stored as a set for fast membership tests.
final_blacklist = {
    chrom: set(bin_ids)
    for chrom, bin_ids in calculate_blacklist_region(region_records).items()
}