In [None]:
import os

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsorted

from bioinf_common.plotting import annotated_barplot

In [None]:
# Apply seaborn's 'talk' plotting context to the figures created below.
sns.set_context('talk')

# Load data

In [None]:
# Directory that is scanned for 'final__*' result files in the loading cell.
target_dir = 'aggregated_results/pipeline_run/results/'

In [None]:
# Collect the directory entries first: the context manager releases the
# directory handle promptly, and sorting by name makes the row order of `df`
# reproducible (os.scandir yields entries in arbitrary order).
with os.scandir(target_dir) as dir_entries:
    result_entries = sorted(dir_entries, key=lambda dir_entry: dir_entry.name)

df_list = []
for entry in result_entries:
    if not entry.name.startswith('final__'):
        continue

    # The part after the last '__' must hold exactly two ';'-separated fields;
    # only the first one is used.
    inp, _ = entry.name.split('__')[-1].split(';')
    # Strip the trailing '.<suffix>' and parse the 'key:value' pairs.
    # NOTE(review): `inp` cannot contain ';' after the unpacking above, so the
    # inner split(';') yields a single pair - confirm against the naming scheme.
    info = dict(e.split(':') for e in '.'.join(inp.split('.')[:-1]).split(';'))

    data_source = info['input_files+tad_coordinates']
    data_source = data_source[len('data_'):]  # cut off common prefix

    source_df = pd.read_csv(entry.path, low_memory=False)
    source_df['data_source'] = data_source
    df_list.append(source_df)

    print(data_source, source_df.shape)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing matched.
if not df_list:
    raise ValueError(
        "no 'final__*' result files found in {!r}".format(target_dir))

# ignore_index avoids duplicate row labels across the per-file frames.
df = pd.concat(df_list, ignore_index=True)

## Data overview

In [None]:
# Show the first rows of the concatenated table as a quick sanity check.
df.head()

# Define signal measures

In [None]:
def enrichment_quotient(df, p_thres=0.05):
    """Ratio of the mean thresholded -log10 p-value of cancer vs. non-cancer rows.

    Values of ``pval_boundary_neglog`` below ``-log10(p_thres)`` are counted
    as 0 before averaging. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a boolean ``is_cancer`` column and a ``pval_boundary_neglog`` column.
    p_thres : float, optional
        P-value threshold below which a row contributes its signal (default 0.05).

    Returns
    -------
    float
        Mean signal of the cancer rows divided by that of the non-cancer rows.
    """
    cutoff = -np.log10(p_thres)
    neglog = df['pval_boundary_neglog']
    # Zero out everything that does not reach the significance cutoff.
    thresholded = neglog.mask(neglog < cutoff, 0)

    cancer_rows = df['is_cancer']
    mean_cancer = thresholded[cancer_rows].mean()
    mean_noncancer = thresholded[~cancer_rows].mean()

    return mean_cancer / mean_noncancer

In [None]:
def count_quotient(df, p_thres=0.05):
    """Ratio of the significant fraction among cancer rows to that among non-cancer rows.

    A row counts as significant when its ``pval_boundary_neglog`` value is at
    least ``-log10(p_thres)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a boolean ``is_cancer`` column and a ``pval_boundary_neglog`` column.
    p_thres : float, optional
        P-value threshold for significance (default 0.05).

    Returns
    -------
    float
        ``(sig_cancer / all_cancer) / (sig_noncancer / all_noncancer)``.
        If a group is empty or no non-cancer row is significant, the result is
        ``nan`` or ``inf`` rather than a ``ZeroDivisionError``.
    """
    is_cancer = df['is_cancer']
    is_significant = df['pval_boundary_neglog'] >= -np.log10(p_thres)

    # Counts are wrapped in np.float64 so that zero denominators follow
    # IEEE semantics (inf / nan) instead of raising ZeroDivisionError.
    sig_cancer_num = np.float64(df[is_cancer & is_significant].shape[0])
    all_cancer_num = np.float64(df[is_cancer].shape[0])

    sig_noncancer_num = np.float64(df[(~is_cancer) & is_significant].shape[0])
    all_noncancer_num = np.float64(df[~is_cancer].shape[0])

    # The degenerate cases are expected for sparse data; silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        cancer_fraction = sig_cancer_num / all_cancer_num
        noncancer_fraction = sig_noncancer_num / all_noncancer_num
        return float(cancer_fraction / noncancer_fraction)

# Apply measures

In [None]:
# Value of the 'TAD_type' column that is kept for the analysis.
border_type = '20in'
# Name of the column whose p-values are converted to -log10 scale.
pvalue_type = 'pval_boundary'

In [None]:
# pre-transform data
# Exact-zero p-values would give an infinite -log10; they are replaced by this value.
ZERO_PVALUE_REPLACEMENT = 1e-16

# Keep only the selected border type, then reduce to one row per
# (data_source, diseaseId, TAD_type). Filtering first avoids grouping rows that
# are discarded anyway, and the grouped result is a fresh frame, so the column
# assignments below cannot trigger SettingWithCopyWarning.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so a
# result row may combine values from different input rows - confirm intended.
df_trans = (
    df.loc[df['TAD_type'] == border_type]
    .groupby(['data_source', 'diseaseId', 'TAD_type'])
    .first()
    .reset_index()
)

pvalues = df_trans[pvalue_type]
df_trans[pvalue_type] = pvalues.mask(pvalues == 0, ZERO_PVALUE_REPLACEMENT)

# Vectorized transform; the column name is what the signal measures read.
df_trans['pval_boundary_neglog'] = -np.log10(df_trans[pvalue_type])

In [None]:
# Each signal measure paired with the (mathtext) label used for it in the plot legend.
signal_measures = [
    (
        enrichment_quotient,
        r'$\frac{\langle-\log_{10}(p_{cancer,boundary_{20in}})\rangle}{\langle-\log_{10}(p_{noncancer,boundary_{20in}})\rangle}$',
    ),
    (
        count_quotient,
        r'$\frac{|\mathrm{cancer}_{sig}| / |\mathrm{cancer}_{all}|}{|\mathrm{noncancer}_{sig}| / |\mathrm{noncancer}_{all}|}$',
    ),
]

# One record per (data source, measure), in long format.
signal_data = [
    {
        'data_source': source_name,
        'signal': measure(source_group),
        'type': measure_label,
    }
    for source_name, source_group in df_trans.groupby('data_source')
    for measure, measure_label in signal_measures
]
df_signal = pd.DataFrame(signal_data)

In [None]:
# Inspect the first computed signal values.
df_signal.head()

# Visualize result

In [None]:
# savefig does not create missing folders, so make sure the output directory
# exists before plotting (needed on a fresh checkout).
output_path = 'images/signal_vs_datasource.pdf'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

plt.figure(figsize=(18,12))
# Grouped bars: one group per data source (naturally sorted), one bar per signal measure.
# NOTE(review): annotated_barplot is a project helper; anno_kws presumably
# styles the value labels on the bars - confirm against its documentation.
g = annotated_barplot(
    x='data_source', y='signal', data=df_signal, hue='type',
    order=natsorted(df_signal['data_source'].unique()),
    anno_kws=dict(label_offset=10, label_size=13))

plt.xticks(rotation=90)

plt.tight_layout()
plt.savefig(output_path)