In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsorted
from tqdm.auto import tqdm

In [None]:
# Notebook-wide display configuration: never truncate DataFrame columns in
# rich output and use seaborn's 'talk' plotting context.
pd.options.display.max_columns = None
sns.set_context(context='talk')

# Parameters

In [None]:
# `snakemake` is not defined in this notebook; presumably it is injected by
# Snakemake when the notebook is executed as part of a rule.
fname_data = snakemake.input.fname_data
fname_enr = snakemake.input.fname_enr

outdir = Path(snakemake.output.outdir)
# Every figure cell saves directly into `outdir`; create it up front so that
# `savefig` does not fail when the directory does not exist yet.
outdir.mkdir(parents=True, exist_ok=True)

# Load data

## Database data

In [None]:
# Read the CSV table at `fname_data` and preview its first rows.
df_data = pd.read_csv(fname_data, low_memory=True)
df_data.head()

## Enrichment data

In [None]:
# Read the CSV table at `fname_enr` (the enrichment results used by the figures below).
df_enr = pd.read_csv(fname_enr, low_memory=True)

In [None]:
# Upper bound of the transformed p-values. A p-value of exactly 0 has an
# infinite -log10 and is reported as this value instead.
max_enrichment = 16

# Vectorised -log10 transform of `pval_boundary`.
# log10(0) evaluates to -inf (the divide-by-zero warning is silenced on
# purpose); after negation the clip maps it to `max_enrichment`. The clip also
# caps non-zero p-values smaller than 10**-max_enrichment, so an exact zero is
# never ranked as less significant than a tiny non-zero p-value.
# Missing p-values stay NaN.
with np.errstate(divide='ignore'):
    df_enr['pval_boundary_trans'] = (
        -np.log10(df_enr['pval_boundary'])
    ).clip(upper=max_enrichment)

In [None]:
# Preview the first rows of `df_enr`, which now carries `pval_boundary_trans`.
df_enr.head()

# Create figures

## Figure 1: Overview Sketch

In [None]:
# TODO

## Figure 2: p-value Histogram for nice case

In [None]:
def plot_histogram(df, fname, bins=np.linspace(0, 3, 50)):
    """Plot per-group histograms of the transformed p-values and save them.

    One semi-transparent, density-normalised histogram of the
    `pval_boundary_trans` column is drawn for every value of the `is_cancer`
    column. A dashed red line marks -log10(0.05). The figure is written to
    `outdir / fname`, where `outdir` is the notebook-level output directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `is_cancer` and `pval_boundary_trans`.
    fname : str or path-like
        File name of the figure; joined onto the global `outdir`.
    bins : array-like or int, optional
        Bin specification forwarded to `Axes.hist`. When bin edges are given,
        values above the last edge are clipped onto it so they are counted in
        the final bin instead of being dropped. For any other specification
        the clip stays at 3.
    """
    # Values beyond the plotted range are collected in the last bin. With the
    # default edges this is 3.
    upper = np.max(bins) if np.ndim(bins) > 0 else 3

    fig, ax = plt.subplots(figsize=(8, 6))

    for is_cancer, group in df.groupby('is_cancer'):
        values = group['pval_boundary_trans'].clip(upper=upper).dropna()

        # Plain matplotlib equivalent of the deprecated
        # `sns.distplot(..., kde=False, norm_hist=True)`: a density-normalised
        # histogram with alpha 0.4, coloured from the default colour cycle.
        ax.hist(
            values.values,
            bins=bins, density=True, alpha=0.4,
            label=is_cancer
        )

    ax.set_xlabel(r'$-log_{10}(\mathrm{pvalue})$')
    # NOTE: the bars are density-normalised; the axis label is kept as in the
    # original figure.
    ax.set_ylabel('Frequency')

    ax.set_ylim(0, 1)

    # Conventional significance threshold p = 0.05.
    ax.axvline(-np.log10(.05), ls='dashed', color='red')
    ax.legend(loc='best', title='is_cancer')

    fig.tight_layout()
    fig.savefig(outdir / fname)

### Subfigure 2a: nofilter

In [None]:
# Subset of `df_enr`: filter 'nofilter', TAD_type '20in', and a
# `#border_snp` value above 10.
sub_nofilter = df_enr.loc[
    df_enr['filter'].eq('nofilter')
    & df_enr['TAD_type'].eq('20in')
    & df_enr['#border_snp'].gt(10)
]

In [None]:
# Subfigure 2a: histogram for the unfiltered subset.
plot_histogram(df=sub_nofilter, fname='pvalue_histogram_nofilter.pdf')

### Subfigure 2b: intergenic filter

In [None]:
# Subset of `df_enr`: filter 'intergenic', TAD_type '20in', and a
# `#border_snp` value above 10.
sub_intergenic = df_enr.loc[
    df_enr['filter'].eq('intergenic')
    & df_enr['TAD_type'].eq('20in')
    & df_enr['#border_snp'].gt(10)
]

In [None]:
# Subfigure 2b: histogram for the intergenic-filtered subset.
plot_histogram(df=sub_intergenic, fname='pvalue_histogram_intergenic.pdf')

## Figure 3: Zoomed-out multi-dataset overview

In [None]:
# Two-stage aggregation of the enrichment results:
#   1. per disease, within each TAD_type / filter / tad_source / is_cancer
#      combination: are more than half of its rows significant
#      (pval_boundary <= 0.05)?
#   2. per combination: which fraction of its diseases passes that majority
#      test (`sig_frac`)?
# Built-in grouped means are used instead of `groupby.apply` with Python
# lambdas; the resulting table has the same columns and row order.
group_cols = ['TAD_type', 'filter', 'tad_source', 'is_cancer']

df_majority = (
    df_enr
    .assign(is_sig=df_enr['pval_boundary'] <= .05)
    .groupby(group_cols + ['diseaseId'])['is_sig']
    .mean()
    .gt(.5)
    .astype(float)  # True/False -> 1.0/0.0 so the next mean is a fraction
    .groupby(level=group_cols)
    .mean()
    .rename('sig_frac')
    .reset_index()
)
df_majority.head()

In [None]:
# Fix the category orders once for all facets. `map_dataframe` calls
# `sns.barplot` separately on each facet's subset, so without explicit orders
# a facet that lacks one filter or one `is_cancer` level would position and
# colour its bars differently from the other facets.
filter_order = sorted(df_majority['filter'].unique())
hue_order = sorted(df_majority['is_cancer'].unique())

g = sns.FacetGrid(df_majority, row='TAD_type', col='tad_source', height=5, aspect=2)

g.map_dataframe(
    sns.barplot,
    x='filter', y='sig_frac', order=filter_order,
    hue='is_cancer', hue_order=hue_order, palette='tab10'
)

g.set_axis_labels('Filter', 'Disease fraction sig. in $>0.5$ cases')
g.add_legend(title='is_cancer')

# Show the x tick labels on every facet, not only on the bottom row.
for ax in g.axes.flat:
    ax.tick_params(labelbottom=True)

g.savefig(outdir / 'dataset_overview.pdf')

## Figure 4: Comparison between 20in and 40in borders

In [None]:
# TODO

## Figure 5: Disease network (projection of the bipartite disease–SNP graph)

In [None]:
# Keep only the rows of `df_data` whose '20in' column equals 'boundary'.
sub = df_data.loc[df_data['20in'].eq('boundary')]

In [None]:
# Undirected graph with one edge per (diseaseId, snpId) row of `sub`.
graph = nx.from_pandas_edgelist(sub, 'diseaseId', 'snpId')
# `nx.info` was removed in NetworkX 3.0; report the graph size directly.
print('Number of nodes:', graph.number_of_nodes())
print('Number of edges:', graph.number_of_edges())

In [None]:
# Project the bipartite graph onto its disease nodes: two diseases are
# connected when they share at least one neighbour in `graph`.
graph_proj = nx.bipartite.projected_graph(graph, sub['diseaseId'].unique().tolist())
# `nx.info` was removed in NetworkX 3.0; report the graph size directly.
print('Number of nodes:', graph_proj.number_of_nodes())
print('Number of edges:', graph_proj.number_of_edges())

In [None]:
# Node positions from Graphviz' `neato` engine, with overlaps resolved by scaling.
pos = nx.nx_agraph.graphviz_layout(
    graph_proj,
    prog='neato',
    args='-Goverlap=scale',
)

In [None]:
# diseaseId -> is_cancer lookup. The column is selected before `to_dict` so
# that only this one column is converted instead of the whole frame. When a
# diseaseId occurs on several rows the last row wins, as before.
iscancer_map = sub.set_index('diseaseId')['is_cancer'].to_dict()
# One colour per node of the projection: orange when `is_cancer` is truthy,
# blue otherwise.
node_color_list = ['orange' if iscancer_map[n] else 'blue' for n in graph_proj.nodes()]

In [None]:
# Draw the projected graph on a 16x12 inch canvas (nodes first, then
# translucent edges), hide the axes and save the figure.
fig, ax = plt.subplots(figsize=(16, 12))

nx.draw_networkx_nodes(
    graph_proj, pos,
    node_size=20, node_color=node_color_list,
    ax=ax,
)
nx.draw_networkx_edges(graph_proj, pos, alpha=.2, ax=ax)

ax.axis('off')

fig.tight_layout()
fig.savefig(outdir / 'network.pdf')