In [None]:
import collections

import pandas as pd
from scipy.stats import binom
from statsmodels.sandbox.stats.multicomp import multipletests

from tqdm import tqdm_notebook as tqdm

from utils import load_config
from tad_helper_functions import get_tad_lengths, EmptyTAD, OverlappingTADS

In [None]:
# Load the project configuration once; later cells read
# config['output_dirs']['results'] from it to locate input and output files.
config = load_config()

# Load data

In [None]:
# Read disgenet_enhanced.tsv from the results directory named in the config
# and show its first rows.
results_dir = config['output_dirs']['results']
df = pd.read_table(f'{results_dir}/disgenet_enhanced.tsv')
display(df.head())

# Map each diseaseId to its is_cancer value. Selecting the column before
# converting avoids materialising a dict for every column of the frame just
# to keep one of them. When a diseaseId occurs on several rows, the value of
# the last such row is kept (same as before).
disease_cancer_map = df.set_index('diseaseId')['is_cancer'].to_dict()

In [None]:
# Names of all columns of df whose label contains 'TAD_'; the cells below
# iterate over this list.
TAD_border_types = [col for col in df.columns if 'TAD_' in str(col)]
TAD_border_types

# Get TAD stats

In [None]:
# Genome length used as the total when region lengths are turned into fractions.
# Source: https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh38.p11
hg38_genome_length = 3_099_734_149

# Read the tab-separated TAD table from the results directory and preview it.
results_dir = config['output_dirs']['results']
df_tads = pd.read_csv(f'{results_dir}/tads_hg38.tsv', sep='\t')
df_tads.head()

In [None]:
# Attach to every row the stop/start coordinate and chromosome name of the
# row directly before / after it in df_tads (row order of the table is used
# as-is; the first/last row get a missing value).
neighbour_columns = {
    'prev_tad_stop': ('tad_stop', 1),
    'next_tad_start': ('tad_start', -1),
    'prev_tad_chr': ('chrname', 1),
    'next_tad_chr': ('chrname', -1),
}
for new_col, (src_col, offset) in neighbour_columns.items():
    df_tads[new_col] = df_tads[src_col].shift(offset)

In [None]:
# For every TAD border definition, add up the lengths of the TAD ranges and
# of the two boundary ranges that get_tad_lengths returns per row of df_tads.
# The result maps the definition name (column name without the 'TAD_' prefix)
# to the lengths 'chrom' (whole genome), 'tad', 'boundary' and 'none'.
tad_statistics = {}
for tad_type in tqdm(TAD_border_types):
    type_ = tad_type[4:]  # remove TAD_

    tad_len = 0
    boundary_len = 0
    for row in df_tads.itertuples():
        try:
            b1_range, tad_range, b2_range = get_tad_lengths(row, type_)
        except (EmptyTAD, OverlappingTADS):
            # rows rejected by get_tad_lengths contribute no length
            continue

        tad_len += tad_range.stop - tad_range.start
        boundary_len += (
            (b1_range.stop - b1_range.start)
            + (b2_range.stop - b2_range.start)
        )

    tad_statistics[type_] = {
        'chrom': hg38_genome_length,
        'tad': tad_len,
        'boundary': boundary_len,
        # everything that is neither counted as TAD nor as boundary
        'none': hg38_genome_length - tad_len - boundary_len,
    }

tad_statistics

# Compute enrichments

In [None]:
# Binomial enrichment test per disease and TAD border definition.
#
# Null model: each of the N SNPs of a disease falls into a region class
# ('tad', 'boundary', 'none') with probability
# (total length of that class) / (genome length), taken from tad_statistics.
# The p-value is the upper tail P(X >= k), where k is the observed number of
# SNPs in the class.
#
# Notes on the computation:
# * P(X >= k) is binom.sf(k - 1, N, p). `1 - binom.cdf(k, N, p)` would be
#   P(X > k), which excludes the observed count itself and yields exactly 0
#   whenever all SNPs of a disease fall into one class.
# * SNPs outside TADs and boundaries are counted as the complement
#   N - #tad - #boundary, matching how the 'none' length is defined. Looking
#   up the string 'nan' in the counter does not work because missing entries
#   read by pandas are float NaN values, not the string 'nan'.
# * Disease/definition pairs without any boundary SNP are skipped.
enr_result = []
for disease, group in tqdm(df.groupby('diseaseId'), total=df['diseaseId'].nunique()):
    N = group.shape[0]

    for tad_type in TAD_border_types:
        # observed SNP counts per region class
        counts = collections.Counter(group[tad_type].tolist())
        n_boundary = counts['boundary']
        if n_boundary == 0:
            continue
        n_tad = counts['tad']
        n_none = N - n_tad - n_boundary

        # overall region lengths for this TAD definition
        type_ = tad_type[4:]  # remove TAD_
        lengths = tad_statistics[type_]
        genome_len = lengths['chrom']

        enr_result.append({
            'disease': disease,
            '#snp': N,
            '#border_snp': n_boundary,
            'pval_tad': binom.sf(n_tad - 1, N, lengths['tad'] / genome_len),
            'pval_boundary': binom.sf(n_boundary - 1, N, lengths['boundary'] / genome_len),
            'pval_none': binom.sf(n_none - 1, N, lengths['none'] / genome_len),
            'TAD_type': type_
        })
df_enr = pd.DataFrame(enr_result)

# Multiple-testing correction

In [None]:
def correct_pvalues(pval_list):
    """Return the p-values in ``pval_list`` adjusted with the 'fdr_bh' method.

    Thin wrapper around ``multipletests``: of the four values it returns,
    only the second one (the corrected p-values) is passed on.
    """
    return multipletests(pval_list, method='fdr_bh')[1]

In [None]:
# Correct the p-values separately within every (TAD_type, is_cancer) group.
#
# Only the p-value columns are handed to correct_pvalues. Transforming the
# whole frame would also feed the string column 'disease' and the SNP count
# columns into the correction, which fails on pandas versions that no longer
# drop such columns silently.
#
# NOTE: this cell overwrites the p-values in df_enr. Re-running it without
# re-running the enrichment cell corrects already-corrected values.
pval_columns = ['pval_boundary', 'pval_none', 'pval_tad']

df_enr_tmp = df_enr.copy()
df_enr_tmp['is_cancer'] = df_enr_tmp['disease'].map(disease_cancer_map)
df_enr_corr = (
    df_enr_tmp
    .groupby(['TAD_type', 'is_cancer'])[pval_columns]
    .transform(correct_pvalues)
)

for pval_column in pval_columns:
    df_enr[pval_column] = df_enr_corr[pval_column]

# Save result

In [None]:
# Write df_enr (without the index) to TAD_enrichment.csv in the results
# directory and show its first rows.
results_dir = config['output_dirs']['results']
enrichment_path = f'{results_dir}/TAD_enrichment.csv'
df_enr.to_csv(enrichment_path, index=False)
df_enr.head()