In [None]:
import collections

import numpy as np
import pandas as pd
from scipy.stats import binom, hypergeom
from statsmodels.sandbox.stats.multicomp import multipletests

from tqdm.auto import tqdm

from bioinf_common.tools import multipletests_nan

from tad_helper_functions import get_tad_lengths, EmptyTAD, TADTooSmall, OverlappingTADs

# Parameters

In [None]:
# --- input files provided by the Snakemake rule ---
smk_input = snakemake.input
db_fname, tads_fname, info_fname = (
    smk_input.db_fname,
    smk_input.tads_fname,
    smk_input.info_fname,
)

# --- wildcards of the current job ---
smk_wildcards = snakemake.wildcards
source, filter_type = smk_wildcards.source, smk_wildcards.filter

# --- workflow configuration ---
smk_config = snakemake.config
# mapping of border name -> border range (iterated with .items() further down)
tad_borders = smk_config['tad_borders']

enrichment_params = smk_config['parameters']
allow_snp_multiplicity_in_enrichment = enrichment_params['allow_snp_multiplicity_in_enrichment']
enrichment_distribution = enrichment_params['enrichment_distribution']
enrichment_null_model = enrichment_params['enrichment_null_model']

# --- output file ---
fname_out = snakemake.output.fname

# Load data

In [None]:
# Load the association table; it is filtered and grouped by `diseaseId` below.
df = pd.read_csv(db_fname)
display(df.head())

# Lookup table diseaseId -> is_cancer.
# Select the single column *before* converting to a dict; converting the whole
# frame first would build one dict per column only to discard all but one.
# If a diseaseId occurs on several rows, the last row wins (dict semantics).
disease_cancer_map = df.set_index('diseaseId')['is_cancer'].to_dict()

In [None]:
# Per-source metadata table; its second column is used as the row index so
# that rows can be addressed by `source`.
df_info = pd.read_csv(info_fname, index_col=1)

# Genome assembly the current source is annotated on.
source_info = df_info.loc[source]
genome_assembly = source_info['genome_assembly']
genome_assembly

# Get TAD stats

In [None]:
# Genome length per supported assembly (values taken from the linked NCBI GRC pages).
assembly_lengths = dict(
    hg19=2_991_688_216,  # https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh37.p13
    hg38=3_092_480_053,  # https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh38.p11
)
# An assembly that is not listed above raises a KeyError here.
genome_length = assembly_lengths[genome_assembly]

# TAD table of the current source.
df_tads = pd.read_csv(tads_fname)
df_tads.head()

In [None]:
# Attach to every TAD the stop/start coordinate and chromosome of its
# neighbouring rows (previous row: shift by +1, next row: shift by -1).
# The first/last row has no neighbour and receives a missing value.
neighbour_columns = [
    ('prev_tad_stop', 'tad_stop', 1),
    ('next_tad_start', 'tad_start', -1),
    ('prev_tad_chr', 'chrname', 1),
    ('next_tad_chr', 'chrname', -1),
]
for target_column, source_column, row_offset in neighbour_columns:
    df_tads[target_column] = df_tads[source_column].shift(row_offset)

In [None]:
# For every TAD border definition, accumulate the summed lengths of the TAD
# ranges and of the two border ranges returned by `get_tad_lengths`, and derive
# the remaining ("outside") length relative to the genome length.
tad_statistics = {}

for border_name, border_range in tqdm(tad_borders.items()):
    tad_len = 0
    border_len = 0
    for row in df_tads.itertuples():
        try:
            b1_range, tad_range, b2_range = get_tad_lengths(row, border_range)
        except (EmptyTAD, TADTooSmall, OverlappingTADs):
            # rows for which the helper rejects the TAD do not contribute
            continue

        tad_len += tad_range.stop - tad_range.start
        border_len += (
            (b1_range.stop - b1_range.start)
            + (b2_range.stop - b2_range.start)
        )

    tad_statistics[border_name] = {
        'chrom': genome_length,
        'tad': tad_len,
        'border': border_len,
        # whatever is covered neither by a TAD range nor by a border range
        'outside': genome_length - tad_len - border_len,
    }

tad_statistics

# Compute enrichments

In [None]:
# Name of the column holding the row filter for the requested filter type
# and genome assembly.
filter_column = 'filter_{}_{}'.format(filter_type, genome_assembly)
filter_column

In [None]:
# Keep only the rows flagged by the selected filter column and report the
# table shape before and after filtering.
keep_row = df[filter_column]
df_filter_sub = df[keep_row]
print(df.shape, df_filter_sub.shape)

In [None]:
# Enrichment of each disease's SNPs in the three region classes
# ('tad', 'border', 'outside') for every TAD border definition.
#
# Null models:
#   * 'base_sample': expected fractions come from the region lengths in
#     `tad_statistics` relative to the genome length.
#   * 'snp_sample':  expected fractions come from the region counts of all SNPs
#     in `df_filter_sub`.
# Distributions: 'binom' or 'hypergeom'.
#
# One record per (disease, border definition) is collected into `df_enr`.

# Fail fast on unknown settings; otherwise no p-value would be assigned in the
# loop, leading to a NameError or to silently reusing a previous iteration's values.
valid_null_models = ('base_sample', 'snp_sample')
valid_distributions = ('binom', 'hypergeom')
if enrichment_null_model not in valid_null_models:
    raise ValueError(
        f'Unknown enrichment_null_model {enrichment_null_model!r}, '
        f'expected one of {valid_null_models}'
    )
if enrichment_distribution not in valid_distributions:
    raise ValueError(
        f'Unknown enrichment_distribution {enrichment_distribution!r}, '
        f'expected one of {valid_distributions}'
    )

region_names = ('tad', 'border', 'outside')


def count_snps(snp_ids):
    """Return the number of SNP entries, collapsing repeated ids unless multiplicity is allowed."""
    if allow_snp_multiplicity_in_enrichment:
        return snp_ids.shape[0]
    return snp_ids.drop_duplicates().shape[0]


# Background SNP counts depend only on the border definition, not on the
# disease, so they are computed once here instead of once per disease.
snp_counts_by_border = {}
for border_name in tad_borders.keys():
    region_of_row = df_filter_sub[border_name]
    snp_counts_by_border[border_name] = {
        'total': count_snps(df_filter_sub['snpId']),
        'tad': count_snps(df_filter_sub.loc[region_of_row == 'tad', 'snpId']),
        'border': count_snps(df_filter_sub.loc[region_of_row == 'border', 'snpId']),
        'outside': count_snps(df_filter_sub.loc[region_of_row == 'outside', 'snpId']),
    }

enr_result = []
for disease, group in tqdm(df_filter_sub.groupby('diseaseId'), total=df_filter_sub['diseaseId'].nunique()):
    for border_name in tad_borders.keys():
        # region label of every SNP of this disease (one entry per SNP id
        # unless multiplicity is allowed)
        if allow_snp_multiplicity_in_enrichment:
            tads = group[border_name].tolist()
        else:
            tads = group[['snpId', border_name]].drop_duplicates(subset='snpId')[border_name]

        N = len(tads)
        counts = collections.Counter(tads)

        if counts['border'] == 0:
            # diseases without any border SNP are not tested at all
            pvals = dict.fromkeys(region_names, np.nan)
        else:
            # background size per region and the total it is relative to
            if enrichment_null_model == 'base_sample':
                background = tad_statistics[border_name]
                background_total = background['chrom']
            else:  # 'snp_sample'
                background = snp_counts_by_border[border_name]
                background_total = background['total']

            # Upper-tail probability via the survival function: sf(k) equals
            # 1 - cdf(k) but stays accurate for very small p-values.
            # NOTE(review): this is P(X > k), as in the previous `1 - cdf(k)`
            # formulation. If P(X >= k) is intended, use sf(k - 1) instead.
            pvals = {}
            for region in region_names:
                if enrichment_distribution == 'binom':
                    pvals[region] = binom.sf(
                        counts[region], N, background[region] / background_total
                    )
                else:  # 'hypergeom'
                    pvals[region] = hypergeom.sf(
                        counts[region], background_total, N, background[region]
                    )

        enr_result.append({
            'diseaseId': disease,
            '#snp': N,
            '#border_snp': counts['border'],
            'pval_tad': pvals['tad'],
            'pval_border': pvals['border'],
            'pval_outside': pvals['outside'],
            'TAD_type': border_name
        })
df_enr = pd.DataFrame(enr_result)

In [None]:
# Preview of the uncorrected enrichment results.
df_enr.head()

# Multiple-testing correction

In [None]:
# Annotate every enrichment row with the cancer flag of its disease.
# A diseaseId missing from the lookup raises a KeyError.
cancer_flags = df_enr['diseaseId'].map(disease_cancer_map.__getitem__)
df_enr['is_cancer'] = cancer_flags
df_enr.head()

In [None]:
# Correct the p-values separately within every (TAD_type, is_cancer) group.
# `multipletests_nan` comes from bioinf_common; presumably a multiple-testing
# correction that tolerates NaN p-values -- TODO confirm against its source.
#
# The raw p-values are preserved in `<name>__notcorrected` columns. They are
# stored only once and always used as the input of the correction, so running
# this cell again does not correct already corrected values.
# `is_cancer` is taken from df_enr, where the preceding cell stored it.
pval_columns = ['pval_border', 'pval_outside', 'pval_tad']

for pval_column in pval_columns:
    raw_column = f'{pval_column}__notcorrected'
    if raw_column not in df_enr.columns:
        df_enr[raw_column] = df_enr[pval_column]

# working copy: grouping keys plus the raw p-values under their plain names
df_enr_tmp = df_enr[['TAD_type', 'is_cancer']].copy()
for pval_column in pval_columns:
    df_enr_tmp[pval_column] = df_enr[f'{pval_column}__notcorrected']

df_enr_corr = (
    df_enr_tmp
    .groupby(['TAD_type', 'is_cancer'])[['pval_border', 'pval_tad', 'pval_outside']]
    .transform(multipletests_nan)
)

for pval_column in pval_columns:
    df_enr[pval_column] = df_enr_corr[pval_column]

In [None]:
# Preview after correction: the pval_* columns now hold the corrected values,
# the *__notcorrected columns hold the raw ones.
df_enr.head()

# Save result

In [None]:
# Write the enrichment table to the output file of the Snakemake rule
# (without the row index) and show its first rows.
df_enr.to_csv(fname_out, index=False)
df_enr.head()