In [None]:
import os

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from bioinf_common.plotting import annotated_barplot

In [None]:
# Apply seaborn's 'talk' plotting context to every figure drawn in this notebook.
sns.set_context(context='talk')

# Load data

In [None]:
# Directory that holds the aggregated pipeline results (scanned for the
# per-data-source result files).
target_dir = 'aggregated_results/pipeline_run/results/'

# Table providing the diseaseId -> is_cancer lookup. It lives inside
# `target_dir`, so build the path from it instead of repeating the directory:
# changing `target_dir` then keeps both locations in sync.
snpdb_fname = os.path.join(
    target_dir,
    'snpdb_enhanced__input_files+tad_coordinates:data_tads_hESC_hg19_with_ids.csv;git_branch:master.tsv')

## Enrichment data

In [None]:
# Read every `TAD_enrichment__*` file in `target_dir` and tag its rows with the
# data source parsed from the file name. The part after the last '__' is split
# on ';'; the first piece is expected to look like `<key>:<file name>`, and the
# file name up to its first '.' (minus a leading 'data_') is the data source.
df_list = []
# os.scandir yields entries in arbitrary order; sort by name so that the row
# order of `df` is reproducible across runs and machines.
for entry in sorted(os.scandir(target_dir), key=lambda e: e.name):
    if not entry.name.startswith('TAD_enrichment__'):
        continue

    # Only the first ';'-separated piece is needed; any number of further
    # pieces is tolerated.
    inp = entry.name.split('__')[-1].split(';')[0]
    data_source = inp.split(':')[1].split('.')[0]
    if data_source.startswith('data_'):
        data_source = data_source[len('data_'):]  # cut off common prefix

    tmp = pd.read_csv(entry.path)
    tmp['data_source'] = data_source
    df_list.append(tmp)

    print(data_source, tmp.shape)

if not df_list:
    # pd.concat would fail with an unspecific "No objects to concatenate"
    raise ValueError(
        'No TAD_enrichment__* files found in {!r}'.format(target_dir))

df = pd.concat(df_list)

## Identify cancer diseases

In [None]:
# Load the SNP database table and build a diseaseId -> is_cancer lookup from it.
df_snpdb = pd.read_table(snpdb_fname)
iscancer_map = df_snpdb.set_index('diseaseId')['is_cancer'].to_dict()

# Annotate each enrichment row; diseases absent from the lookup become NaN.
df['is_cancer'] = df['disease'].map(iscancer_map)

## Data overview

In [None]:
# Peek at the first rows of the combined, annotated enrichment table.
df.head(n=5)

# Define signal measure

In [None]:
def signal_measure(df, tad_type='20in', pval_floor=1e-16, alpha=0.05, verbose=True):
    """Ratio of the mean boundary signal of cancer vs. non-cancer diseases.

    Only rows with ``TAD_type == tad_type`` are considered. The signal of a
    row is ``-log10(pval_boundary)``, where p-values of exactly 0 are replaced
    by ``pval_floor`` first (keeps the logarithm finite) and rows with
    ``pval_boundary > alpha`` contribute a signal of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``TAD_type``, ``pval_boundary`` and
        ``is_cancer``. A ``data_source`` column is only read for the
        printout. The frame itself is not modified.
    tad_type : str
        Value of ``TAD_type`` selecting the rows to evaluate.
    pval_floor : float
        Replacement for p-values that are exactly 0.
    alpha : float
        Rows with a p-value above this threshold get a signal of 0.
    verbose : bool
        If true, print data source, both group means and their ratio.

    Returns
    -------
    float
        Mean cancer signal divided by mean non-cancer signal. NaN if a group
        has no rows (or both means are 0), inf if only the non-cancer mean
        is 0.
    """
    # Rows without a cancer annotation (NaN, e.g. a disease missing from the
    # lookup) cannot be used in a boolean mask and cannot be negated with `~`;
    # leave them out instead of failing.
    keep = (df['TAD_type'] == tad_type) & df['is_cancer'].notna()
    subset = df.loc[keep].copy()
    is_cancer = subset['is_cancer'].astype(bool)

    subset.loc[subset['pval_boundary'] == 0, 'pval_boundary'] = pval_floor

    # vectorised; NaN p-values stay NaN and are skipped by mean()
    neglog = -np.log10(subset['pval_boundary'])
    neglog = neglog.mask(subset['pval_boundary'] > alpha, 0)

    cancer_signal = neglog[is_cancer].mean()
    noncancer_signal = neglog[~is_cancer].mean()

    # Divide as numpy floats so a zero/NaN denominator always yields inf/NaN
    # (never ZeroDivisionError), and silence the corresponding warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = np.float64(cancer_signal) / np.float64(noncancer_signal)

    if verbose:
        # the label is purely informational; do not fail on empty input
        has_label = len(df) > 0 and 'data_source' in df
        label = df['data_source'].iloc[0] if has_label else None
        print(label, cancer_signal, noncancer_signal, ratio)
    return ratio

# Apply measure

In [None]:
# Evaluate the signal measure once per data source and collect the results
# as one record per source.
signal_data = [
    {'data_source': data_source, 'signal': signal_measure(group)}
    for data_source, group in df.groupby('data_source')
]
df_signal = pd.DataFrame(signal_data)

In [None]:
# Show the first per-data-source signal values.
df_signal.head(n=5)

# Visualize result

In [None]:
# Bar plot of the cancer / non-cancer signal ratio, one bar per data source.
plt.figure(figsize=(18,12))
g = annotated_barplot(
    x='data_source', y='signal', data=df_signal,
    color=sns.color_palette()[0],
    anno_kws=dict(label_offset=10))

plt.xticks(rotation=90)
plt.ylabel(r'$\frac{\langle-\log_{10}(p_{cancer,boundary_{20in}})\rangle}{\langle-\log_{10}(p_{noncancer,boundary_{20in}})\rangle}$')

plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the notebook also runs on a fresh checkout.
out_fname = 'images/signal_vs_datasource.pdf'
os.makedirs(os.path.dirname(out_fname), exist_ok=True)
plt.savefig(out_fname)