In [None]:
import os
import itertools
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsorted
from tqdm.auto import tqdm

from bioinf_common.plotting import annotated_barplot, add_identity

In [None]:
# Use seaborn's 'talk' plotting context for every figure in this notebook.
sns.set_context('talk')
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Parameters

In [None]:
# `snakemake` is not defined in this notebook; presumably it is injected by
# Snakemake's notebook integration -- TODO confirm.
db_fname = snakemake.input.db_fname

# Directory that receives every figure saved further down.
outdir = Path(snakemake.output.outdir)
# Create it up front so the later `savefig` calls cannot fail on a missing path.
outdir.mkdir(parents=True, exist_ok=True)

# Load data

In [None]:
# Read the input table (CSV) and preview its first rows.
df = pd.read_csv(db_fname)
df.head()

## Data overview

In [None]:
# (number of rows, number of columns) of the loaded table
df.shape

In [None]:
# Per-column summary statistics of the loaded table
df.describe()

# Define signal measures

In [None]:
def enrichment_quotient(df, p_thres=0.05):
    """Ratio of the mean thresholded enrichment of cancer vs. non-cancer rows.

    Entries of ``pval_boundary_neglog`` that are smaller than
    ``-log10(p_thres)`` are counted as 0 before averaging. The input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``is_cancer`` (used as a boolean mask) and
        ``pval_boundary_neglog``.
    p_thres : float, optional
        P-value cut-off; its negative log10 is the threshold applied to
        ``pval_boundary_neglog``.

    Returns
    -------
    float
        ``mean(cancer) / mean(non-cancer)`` of the thresholded values.
    """
    min_neglog = -np.log10(p_thres)
    is_cancer = df['is_cancer']

    # zero out every value that does not reach the cut-off
    # (`mask` returns a new Series, so `df` stays untouched)
    neglog = df['pval_boundary_neglog']
    neglog = neglog.mask(neglog < min_neglog, 0)

    mean_cancer = neglog[is_cancer].mean()
    mean_noncancer = neglog[~is_cancer].mean()
    return mean_cancer / mean_noncancer

In [None]:
def count_quotient(df, p_thres=0.05):
    """Ratio of the significant fraction of cancer vs. non-cancer rows.

    A row counts as significant when its ``pval_boundary_neglog`` is at
    least ``-log10(p_thres)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``is_cancer`` (used as a boolean mask) and
        ``pval_boundary_neglog``.
    p_thres : float, optional
        P-value cut-off defining significance.

    Returns
    -------
    float
        ``(sig_cancer / all_cancer) / (sig_noncancer / all_noncancer)``.
        Degenerate groups give ``inf`` (no significant non-cancer row) or
        ``nan`` (a class is missing entirely) instead of raising
        ``ZeroDivisionError``; this matches how ``enrichment_quotient``
        behaves for such groups.
    """
    is_significant = df['pval_boundary_neglog'] >= -np.log10(p_thres)
    is_cancer = df['is_cancer']

    # The mean of a boolean mask is the fraction of True entries
    # (NaN when the selection is empty).
    cancer_fraction = np.float64(is_significant[is_cancer].mean())
    noncancer_fraction = np.float64(is_significant[~is_cancer].mean())

    # numpy division yields inf/nan for a zero denominator; silence the
    # accompanying RuntimeWarning because that outcome is expected here.
    with np.errstate(divide='ignore', invalid='ignore'):
        return cancer_fraction / noncancer_fraction

# Apply measure

In [None]:
# Value of the `TAD_type` column to analyse
border_type = '20in'
# Name of the p-value column to analyse
pvalue_type = 'pval_boundary'  # alternative (presumably uncorrected p-values): pval_boundary__notcorrected
# Value of the `filter` column to analyse
filter_type = 'nofilter'

In [None]:
# pre-transform data
# Collapse to one row per (source, window size, filter, disease, TAD type);
# `first()` keeps the first non-null value of every remaining column.
df_trans = df.groupby(['tad_source', 'window_size', 'filter', 'diseaseId', 'TAD_type']).first().reset_index()

# Keep only the selected border/filter combination. The explicit copy comes
# AFTER filtering so the assignments below write to an independent frame
# (no SettingWithCopyWarning).
df_trans = df_trans[
    (df_trans['TAD_type'] == border_type) & (df_trans['filter'] == filter_type)
].copy()

# A p-value of exactly 0 would give an infinite -log10; clamp it to a small
# positive number instead.
df_trans.loc[df_trans[pvalue_type] == 0, pvalue_type] = 1e-16

# vectorized -log10 of the selected p-value column
df_trans['pval_boundary_neglog'] = -np.log10(df_trans[pvalue_type])

In [None]:
# Preview the filtered table including the new `pval_boundary_neglog` column
df_trans.head()

In [None]:
# Display labels of the two measures (rendered as LaTeX in the plots).
# NOTE: the first label spells out the "20in" border type literally.
enrichment_label = r'$\frac{\langle-\log_{10}(p_{cancer,boundary_{20in}})\rangle}{\langle-\log_{10}(p_{noncancer,boundary_{20in}})\rangle}$'
count_label = r'$\frac{|\mathrm{cancer}_{sig}| / |\mathrm{cancer}_{all}|}{|\mathrm{noncancer}_{sig}| / |\mathrm{noncancer}_{all}|}$'

# Two records (one per measure) for every (TAD source, window size) group.
signal_data = []
for (tad_source, window_size), group in df_trans.groupby(['tad_source', 'window_size']):
    measures = (
        (enrichment_quotient(group), enrichment_label),
        (count_quotient(group), count_label),
    )
    for signal, label in measures:
        signal_data.append({
            'tad_source': tad_source,
            'window_size': window_size,
            'signal': signal,
            'type': label,
        })
df_signal = pd.DataFrame(signal_data)

In [None]:
# Preview the long-format signal table (one row per source, window size and measure)
df_signal.head()

# Visualize result

In [None]:
def my_bar(*args, **kwargs):
    """Bar plot for `FacetGrid.map_dataframe` with bars anchored at 1.

    The ``y`` column of a copy of ``kwargs['data']`` is reduced by the
    baseline and the plot is drawn with ``bottom=baseline``; the caller's
    frame is left untouched. Presumably `annotated_barplot` forwards
    ``bottom`` to the bar drawing, so bars start at 1 and end at the
    original value -- TODO confirm against `bioinf_common.plotting`.

    Parameters
    ----------
    *args
        Forwarded unchanged to `annotated_barplot`.
    **kwargs
        Must contain ``data`` (DataFrame), ``x`` and ``y`` (column names);
        everything is forwarded to `annotated_barplot`.
    """
    # adjust baseline
    baseline = 1
    shifted = kwargs['data'].copy()
    shifted[kwargs['y']] -= baseline
    kwargs['data'] = shifted

    # plot; the bar order follows the column actually mapped to `x`
    # (previously the 'window_size' column was hard-coded here)
    annotated_barplot(
        *args, **kwargs,
        order=natsorted(shifted[kwargs['x']].unique()),
        anno_kws=dict(label_offset=6, label_size=9),
        palette=sns.color_palette(),
        bottom=baseline)

In [None]:
# One facet per TAD source, in natural sort order; never request more
# columns than there are sources.
g = sns.FacetGrid(
    df_signal,
    col='tad_source', col_wrap=min(2, df_signal['tad_source'].unique().size),
    col_order=natsorted(df_signal['tad_source'].unique()),
    sharex=False, sharey=True,
    height=7, aspect=2)

g.map_dataframe(my_bar, x='window_size', y='signal', hue='type')

# Label through the grid: `plt.xlabel` would only touch the current
# (last drawn) axes.
g.set_xlabels('Window size')
for ax in g.axes.ravel():
    ax.legend()

g.savefig(outdir / 'signal_vs_datasource.pdf')

# SNP counts

## Aggregate counts

In [None]:
# Unique SNP-count rows for the '20in' border type without variant filter,
# plus the fraction of SNPs that lie in a border.
sub = (
    df.loc[
        (df['TAD_type'] == '20in') & (df['filter'] == 'nofilter'),
        ['TAD_type', 'diseaseId', '#border_snp', '#snp', 'tad_source', 'window_size', 'is_cancer', 'filter'],
    ]
    .drop_duplicates()
    .assign(snp_fraction=lambda counts: counts['#border_snp'] / counts['#snp'])
)

sub.head()

## Plot count data

In [None]:
# One facet per TAD source (natural sort order), at most two per row.
g = sns.FacetGrid(
    sub,
    col='tad_source', col_wrap=min(2, sub['tad_source'].unique().size),
    col_order=natsorted(sub['tad_source'].unique()),
    sharex=False, sharey=True,
    height=7, aspect=2)

# `hue` is resolved separately inside every facet. Fixing `hue_order` keeps
# the level -> colour assignment identical across facets (and in line with
# the legend) even if a facet lacks one of the `is_cancer` levels.
g.map_dataframe(
    sns.boxplot,
    x='window_size', y='snp_fraction', hue='is_cancer',
    order=natsorted(sub['window_size'].unique()),
    hue_order=sorted(sub['is_cancer'].unique()))

# g.set(yscale='log')  # optional: log-scaled y axis
g.set_axis_labels('Window size', r'$\frac{|snp_{boundary}|}{|snp_{all}|}$ per disease')
g.add_legend(title='is_cancer')

g.savefig(outdir / 'snp_numbers.pdf')

# Variant type effect

## Prepare data

In [None]:
# One row per (source, window size, filter, disease, TAD type), restricted to
# the '20in' border type; all variant filters are kept.
# NOTE: this re-binds `df_trans` from the "Apply measure" section. The frame
# built here has no `pval_boundary_neglog` column.
df_trans = (
    df
    .groupby(['tad_source', 'window_size', 'filter', 'diseaseId', 'TAD_type'])
    .first()
    .reset_index()
    .loc[lambda collapsed: collapsed['TAD_type'] == '20in']
)

df_trans.head()

In [None]:
# Split the table by the value of the `filter` column.
df_none, df_exonic, df_intronic, df_intergenic = (
    df_trans[df_trans['filter'] == filter_value]
    for filter_value in (
        'nofilter',
        'variant_group=exonic',
        'variant_group=intronic',
        'variant_group=intergenic',
    )
)

In [None]:
# Sanity check: size and contained variant types of every subset.
for name, df_tmp in zip(
    ('none', 'exonic', 'intronic', 'intergenic'),
    (df_none, df_exonic, df_intronic, df_intergenic),
):
    n_entries = df_tmp.shape[0]
    variant_types = df_tmp['variant_type'].unique().tolist()
    print(
        name,
        f'#entries: {n_entries}',
        f'Unique variant types: {variant_types}',
        '',
        sep='\n')

## Aggregate data

In [None]:
def process(x):
    """Return -log10(`pval_boundary`) per disease, indexed by (is_cancer, diseaseId).

    Only the first row of every `diseaseId` is used. Infinite results
    (e.g. from a p-value of 0) are replaced by NaN.
    """
    pvals = (
        x.drop_duplicates('diseaseId')
        .set_index(['is_cancer', 'diseaseId'])['pval_boundary']
    )
    neglog = -np.log10(pvals)
    return neglog.replace([np.inf, -np.inf], np.nan)


# One enrichment column per variant filter; rows are aligned on the shared
# (is_cancer, diseaseId) index, so diseases missing in a subset become NaN.
df_merged = pd.DataFrame({
    f'enrichment_{label}': process(subset)
    for label, subset in (
        ('none', df_none),
        ('exonic', df_exonic),
        ('intronic', df_intronic),
        ('intergenic', df_intergenic),
    )
})

In [None]:
# Preview the merged table (one enrichment column per variant filter)
df_merged.head()

## Visualize

In [None]:
def custom_scatter(x=None, y=None, data=None, color=None):
    """Scatter two enrichment columns against each other (FacetGrid helper).

    Adds dashed red lines at -log10(0.05) on both axes, a dashed grey
    identity line, square axis limits starting at 0 and an annotation of
    the disease with the largest `y` value.

    Parameters
    ----------
    x, y : str
        Names of the columns to plot.
    data : pandas.DataFrame
        Rows of the current facet; after `reset_index()` it must provide a
        `diseaseId` column.
    color
        Forwarded to `sns.scatterplot`.

    Notes
    -----
    The axis limits are taken from the module-level `df_merged`, not from
    `data`, so every facet of a figure gets the same range.
    """
    ax = sns.scatterplot(x=x, y=y, color=color, data=data)

    # significance threshold (p = 0.05) on both axes and the identity line
    sig_level = -np.log10(.05)
    ax.axhline(sig_level, color='red', ls='dashed')
    ax.axvline(sig_level, color='red', ls='dashed')
    add_identity(ax, color='grey', ls='dashed')

    # `np.nanmax` raises on an empty array, so treat "no data" like "all NaN"
    values = df_merged[[x, y]].values
    max_ = np.nanmax(values) if values.size else np.nan

    if not np.isnan(max_):
        max_ *= 1.05  # small margin so the largest point is not on the frame
        ax.set_xlim((0, max_))
        ax.set_ylim((0, max_))
    else:
        print('Warning, `max_` is Nan')

    # annotate diseases
    # Only rows with both coordinates present are drawn, so only those are
    # candidates. This also covers an all-NaN `y` column, for which
    # `idxmax()` cannot return a usable label.
    plotted = data.reset_index().dropna(subset=[x, y])

    if not plotted.empty:
        sel = plotted.loc[plotted[y].idxmax()]

        ax.annotate(
            sel.diseaseId,
            xy=(sel[x], sel[y]), xytext=(50, 0),
            xycoords='data', textcoords='offset points',
            fontsize=10, ha='center', va='center',
            arrowprops=dict(arrowstyle='->')
        )
    else:
        print('Warning, no disease annotation possible')

In [None]:
# `is_cancer` (index level 0) becomes a regular column for faceting;
# `diseaseId` stays in the index.
facet_data = df_merged.reset_index(level=0)

# One figure per unordered pair of enrichment columns.
for x_axis_data_source, y_axis_data_source in itertools.combinations(df_merged.columns, 2):
    g = sns.FacetGrid(
        facet_data,
        col='is_cancer',
        sharex=True,
        sharey=True,
        height=5,
        aspect=1,
    )

    g.map_dataframe(custom_scatter, x=x_axis_data_source, y=y_axis_data_source)
    g.set_axis_labels(x_axis_data_source, y_axis_data_source)

    pdf_name = f'enrichment_variants__{x_axis_data_source}_{y_axis_data_source}.pdf'
    g.savefig(outdir / pdf_name)