In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Select seaborn's 'talk' plotting context for all figures drawn in this notebook.
sns.set_context(context='talk')

# Parameters

In [None]:
# `snakemake` is not defined in this notebook; presumably it is injected by
# Snakemake when the notebook is executed as a workflow step.

# CSV files read in the "Load data" section.
db_fname = snakemake.input.db_fname
enr_fname = snakemake.input.enr_fname

# Mapping whose keys give the facet (column) order of the plots below.
tad_borders = snakemake.config['tad_borders']

# Directory that receives the generated PDF figures. Create it up front so
# that saving a figure cannot fail because the folder is missing; the call is
# a no-op when the directory already exists.
outdir = Path(snakemake.output.outdir)
outdir.mkdir(parents=True, exist_ok=True)

# Load data

In [None]:
# Read the CSV table at `db_fname` and preview its first rows.
df = pd.read_csv(filepath_or_buffer=db_fname)
df.head(n=5)

In [None]:
# Read the CSV table at `enr_fname`.
df_enr = pd.read_csv(filepath_or_buffer=enr_fname)

# mark cancer diseases
# Lookup diseaseId -> is_cancer taken from `df`. If a diseaseId occurs on
# several rows of `df`, the last row wins; ids missing from `df` become NaN
# in the mapped column.
iscancer_map = df.set_index('diseaseId')['is_cancer'].to_dict()
df_enr['is_cancer'] = df_enr['diseaseId'].map(iscancer_map)

# add disease name (currently disabled)
# disname_map = df[['diseaseId', 'diseaseName']].set_index('diseaseId').to_dict()['diseaseName']
# df_enr['disease_name'] = df_enr['diseaseId'].map(disname_map)

df_enr.head(n=5)

# Plots

In [None]:
# Value substituted for p-values that are exactly 0, so that -log10 stays
# finite (log10(0) is -inf).
PVAL_FLOOR = 1e-16

# columns carried over unchanged
id_cols = ['TAD_type', 'is_cancer']
# p-value columns are recognised by their 'pval_' prefix
pval_cols = [col for col in df_enr.columns if col.startswith('pval_')]

# transform p-values: replace exact zeros by the floor, then apply -log10 to
# all p-value columns in one vectorised call. `mask` returns a new frame, so
# `df_enr` itself is left untouched and no explicit copy is needed.
pvals = df_enr[pval_cols]
neg_log_pvals = -np.log10(pvals.mask(pvals == 0, PVAL_FLOOR))

# create dataframe: id columns first, then the -log10(p) columns in their
# original order
df_enr_log = pd.concat([df_enr[id_cols], neg_log_pvals], axis=1)
df_enr_log.head()

In [None]:
# Long format: one row per input row and p-value column; the column name ends
# up in 'variable' and the -log10(p) in 'value'.
melted = pd.melt(df_enr_log, id_vars=['TAD_type', 'is_cancer'])

# Column names ending in '__notcorrected' hold uncorrected p-values; every
# other column is labelled 'corrected'.
is_uncorrected = melted['variable'].str.endswith('__notcorrected')

df_enr_log_long = melted.assign(
    pvalue_type=is_uncorrected.map({True: 'notcorrected', False: 'corrected'}),
    # keep only the part of the column name before the first '__'
    variable=melted['variable'].str.split('__').str[0],
)

df_enr_log_long.head()

## TAD border enrichment

In [None]:
# -log10(p) values above this cap are drawn at the cap, which is also the
# right-hand limit of the x axis.
max_x = 3

# Histogram bin edges every 0.1 over [0, max_x]. N bins need N + 1 edges, so
# one extra sample is requested from linspace.
bins = np.linspace(0, max_x, int(max_x * 10) + 1)

# one facet per key of `tad_borders`, in that order
tad_types = list(tad_borders)

# One figure (and one PDF) per p-value type.
for pvalue_type, group in df_enr_log_long.groupby('pvalue_type'):
    # filter data: keep only the 'pval_border' rows and cap their values
    border_pvals = group[group['variable'] == 'pval_border'].copy()
    border_pvals['value'] = border_pvals['value'].clip(upper=max_x)

    # plot: one panel per TAD type, one histogram per `is_cancer` value
    g = sns.FacetGrid(
        border_pvals, col='TAD_type', col_wrap=3, hue='is_cancer',
        col_order=tad_types,
        height=5, aspect=1.2)

    # NOTE(review): `sns.distplot` is deprecated in newer seaborn releases;
    # `sns.histplot(..., stat='density')` looks like the replacement, confirm
    # against the seaborn version pinned for this workflow before switching.
    g = g.map(sns.distplot, 'value', hist_kws=dict(alpha=.5), bins=bins, kde=False, norm_hist=True)
    # dashed red line at -log10(0.05)
    g = g.map(plt.axvline, x=-np.log10(.05), color='red', linestyle='dashed')

    g.set(xlim=(0, max_x))
    g.add_legend()

    # Work on this grid's own figure instead of pyplot's implicit
    # "current figure".
    g.fig.suptitle(rf'values $>{max_x}$ are set to ${max_x}$ ({pvalue_type})')

    # leave room at the top for the suptitle
    g.fig.subplots_adjust(top=.92)
    g.fig.savefig(outdir / f'tad_border_enrichment_{pvalue_type}.pdf')