In [None]:
import os

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsorted
from tqdm import tqdm_notebook as tqdm

from bioinf_common.plotting import annotated_barplot

In [None]:
# Select seaborn's 'talk' plotting context for the figures in this notebook.
sns.set_context('talk')

# Load data

In [None]:
# Directory that is scanned for `final__*` result files in the next cell.
target_dir = 'aggregated_results/pipeline_run/results/'

In [None]:
# Collect every `final__*` result file from `target_dir` into one table.
# The part of the file name after the last `__` encodes the run parameters as
# `key:value` pairs separated by `;`, followed by a file extension.
result_entries = sorted(
    (entry for entry in os.scandir(target_dir) if entry.name.startswith('final__')),
    key=lambda entry: entry.name)  # scandir order is arbitrary; sort for a reproducible row order
if not result_entries:
    raise FileNotFoundError(f'No "final__*" result files found in "{target_dir}"')

df_list = []
for entry in tqdm(result_entries):
    # drop the trailing file extension, then parse the parameter string
    inp = '.'.join(entry.name.split('__')[-1].split('.')[:-1])
    # split on the first ':' only, so a value may itself contain ':'
    info = dict(e.split(':', 1) for e in inp.split(';'))

    data_source = info['input_files+tad_coordinates']
    if data_source.startswith('data_'):  # cut off common prefix (only if it is really there)
        data_source = data_source[len('data_'):]

    tmp = pd.read_csv(entry.path, low_memory=False)
    tmp['data_source'] = data_source
    df_list.append(tmp)

# ignore_index: every file's row labels restart at 0 and would otherwise be duplicated
df = pd.concat(df_list, ignore_index=True)

In [None]:
# remove common prefix/suffix
# `os.path.commonprefix` compares character by character, so sources ending in e.g.
# `_10` and `_100` would also share the `10`. Trim the prefix back to the last `_`
# so that only whole `_`-separated tokens are removed; this also keeps the last
# token intact when there is just a single data source.
source_prefix = os.path.commonprefix(df['data_source'].unique().tolist())
source_prefix = source_prefix[:source_prefix.rfind('_') + 1]
source_suffix = '.csv'

print(f'Removing: "{source_prefix}"..."{source_suffix}"')

# Strip the suffix only where it is actually present, so re-running this cell
# does not cut further characters off the already shortened names.
stripped = df['data_source'].str[len(source_prefix):]
has_suffix = stripped.str.endswith(source_suffix)
df['data_source'] = stripped.mask(has_suffix, stripped.str[:-len(source_suffix)])

## Data overview

In [None]:
# Preview the first rows of the combined table.
df.head()

# Define signal measures

In [None]:
def enrichment_quotient(df, p_thres=0.05):
    """Ratio of the mean thresholded score of cancer rows to that of non-cancer rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a boolean column `is_cancer` and a numeric column
        `pval_boundary_neglog`. The frame is not modified.
    p_thres : float
        Scores below `-log10(p_thres)` are counted as 0 before averaging.

    Returns
    -------
    float
        mean(cancer scores) / mean(non-cancer scores); nan or inf if one of
        the two groups is empty or the non-cancer mean is 0.
    """
    cutoff = -np.log10(p_thres)
    scores = df['pval_boundary_neglog']
    # scores that do not reach the cutoff contribute nothing to the mean
    thresholded = scores.mask(scores < cutoff, 0)

    cancer_rows = df['is_cancer']
    mean_cancer = thresholded[cancer_rows].mean()
    mean_noncancer = thresholded[~cancer_rows].mean()

    return mean_cancer / mean_noncancer

In [None]:
def count_quotient(df, p_thres=0.05):
    """Ratio of the significant fraction among cancer rows to that among non-cancer rows.

    A row is significant when `pval_boundary_neglog >= -log10(p_thres)`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a boolean column `is_cancer` and a numeric column
        `pval_boundary_neglog`. The frame is not modified.
    p_thres : float
        P-value threshold that defines significance.

    Returns
    -------
    float
        (sig_cancer / all_cancer) / (sig_noncancer / all_noncancer).
        Degenerate groups give nan (empty group, or 0/0) or inf (no significant
        non-cancer row) instead of raising ZeroDivisionError, matching how
        `enrichment_quotient` behaves on such input.
    """
    is_cancer = df['is_cancer']
    is_significant = df['pval_boundary_neglog'] >= -np.log10(p_thres)

    # the mean of a boolean mask is the fraction of True entries (nan for an empty group)
    cancer_fraction = np.float64(is_significant[is_cancer].mean())
    noncancer_fraction = np.float64(is_significant[~is_cancer].mean())

    # numpy floats divide to inf/nan; silence the accompanying runtime warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        return cancer_fraction / noncancer_fraction

# Apply measures

In [None]:
# Value of the `TAD_type` column that selects the rows to analyse.
border_type = '20in'
# Name of the p-value column that is transformed to -log10 below.
pvalue_type = 'pval_boundary'

In [None]:
# pre-transform data
# Keep only the selected boundary definition, then collapse to one row per
# (data source, disease); `first()` takes the first non-null entry of each column.
# Filtering before grouping gives the same rows as grouping first, but avoids
# aggregating TAD types that are discarded anyway.
df_trans = (
    df[df['TAD_type'] == border_type]
    .groupby(['data_source', 'diseaseId', 'TAD_type'])
    .first()
    .reset_index()
)

# A p-value of exactly 0 would give an infinite -log10; replace it by a small floor.
pvalue_floor = 1e-16
df_trans[pvalue_type] = df_trans[pvalue_type].mask(df_trans[pvalue_type] == 0, pvalue_floor)

# The signal measures read this fixed column name.
df_trans['pval_boundary_neglog'] = -np.log10(df_trans[pvalue_type])

In [None]:
# LaTeX labels of the two measures. The boundary type shown in the first label
# follows `border_type`, so the label stays correct when that setting is changed.
enrichment_label = (
    r'$\frac{\langle-\log_{10}(p_{cancer,boundary_{' + border_type + r'}})\rangle}'
    r'{\langle-\log_{10}(p_{noncancer,boundary_{' + border_type + r'}})\rangle}$'
)
count_label = (
    r'$\frac{|\mathrm{cancer}_{sig}| / |\mathrm{cancer}_{all}|}'
    r'{|\mathrm{noncancer}_{sig}| / |\mathrm{noncancer}_{all}|}$'
)

# One record per (data source, measure); built as a list of dicts and turned
# into a frame once at the end.
signal_data = []
for data_source, group in df_trans.groupby('data_source'):
    signal_data.append({
        'data_source': data_source,
        'signal': enrichment_quotient(group),
        'type': enrichment_label,
    })
    signal_data.append({
        'data_source': data_source,
        'signal': count_quotient(group),
        'type': count_label,
    })
df_signal = pd.DataFrame(signal_data)

In [None]:
# Preview the first rows of the signal table.
df_signal.head()

# Visualize results

In [None]:
# Split every data source name at `_`: the last token becomes the new
# `data_source`, the leading tokens (joined with `-`) become `subset`,
# which the plot below uses as its facet row.
source_tokens = df_signal['data_source'].str.split('_')

df_signal_sub = df_signal.assign(
    subset=source_tokens.str[:-1].str.join('-'),
    data_source=source_tokens.str[-1],
)

df_signal_sub.head()

In [None]:
def my_bar(*args, **kwargs):
    """Draw one facet with `annotated_barplot`.

    Forwards all arguments unchanged and adds a natural sort order for the
    x categories, taken from the `data_source` column of `kwargs['data']`,
    plus annotation settings and the current seaborn colour palette.
    """
    annotated_barplot(
        *args, **kwargs,
        order=natsorted(kwargs['data']['data_source'].unique()),
        anno_kws=dict(label_offset=6, label_size=9),
        palette=sns.color_palette())


# One facet row per subset; x axes are independent, y axis is shared.
g = sns.FacetGrid(
    df_signal_sub, row='subset',
    sharex=False, sharey=True,
    height=7, aspect=2)

g.map_dataframe(my_bar, x='data_source', y='signal', hue='type')

for ax in g.axes.ravel():
    # `plt.xlabel` would only label the current axes; since every facet has
    # its own x axis (sharex=False), label each one explicitly.
    ax.set_xlabel('window size')
    ax.legend()

# The output directory may not exist on a fresh checkout.
os.makedirs('images', exist_ok=True)
g.savefig('images/signal_vs_datasource.pdf')

# Generate SNP lists

In [None]:
# Disease ids to export. The first id was written as 'EFO0000611', without the
# underscore used by the other three, and could therefore never match.
requested_diseases = {'EFO_0000611', 'EFO_0001071', 'EFO_0000571', 'EFO_0000708'}
# Show which of the requested ids actually occur in the data.
requested_diseases & set(df['diseaseId'])

In [None]:
# Settings for the export below; same values as assigned earlier in the notebook.
border_type = '20in'
pvalue_type = 'pval_boundary'

In [None]:
# Preview the first rows of `df` again before the export.
df.head()

In [None]:
# Write the boundary SNPs of the requested diseases as BED-like lines:
# one file with all of them and one with only those from significant groups.
fname = 'results/aggregated_snp_lists__{content}.bed'
sig_threshold = 0.05  # p-value cutoff for the "sig" file

# The output directory may not exist on a fresh checkout.
os.makedirs(os.path.dirname(fname), exist_ok=True)

# Restrict to the selected boundary type and the requested diseases up front,
# so the loop only visits groups that can produce output. As a consequence the
# uniqueness assertion below is only evaluated for requested diseases.
sub = df[(df['TAD_type'] == border_type) & df['diseaseId'].isin(requested_diseases)]

with open(fname.format(content='all'), 'w') as fd_all, open(fname.format(content='sig'), 'w') as fd_sig:
    for (disease, data_source), group in tqdm(sub.groupby(['diseaseId', 'data_source'])):
        # every row of a (disease, data source) group must carry the same p-value
        assert group[pvalue_type].unique().size == 1, (
            f'Expected a single {pvalue_type} value for {disease} / {data_source}')
        pval = group[pvalue_type].iloc[0]

        boundary_rows = group[group['TAD_relation'] == 'boundary']
        for row in boundary_rows.itertuples():
            # columns: chromosome, start, end, then `;`-separated key:value annotations
            # NOTE(review): start/end are written as position and position+1 -
            # confirm `position` is 0-based if strict BED coordinates are needed.
            txt = (
                f'chr{row.chromosome}\t'
                f'{row.position}\t'
                f'{row.position+1}\t'
                    f'snpId:{row.snpId};'
                    f'diseaseId:{row.diseaseId};'
                    f'boundary_pvalue:{pval};'
                    f'data_source:{row.data_source};'
                    f'boundary_type:{border_type}'
                '\n')

            fd_all.write(txt)
            if pval < sig_threshold:
                fd_sig.write(txt)