In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
import collections

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

from utils import load_config

In [None]:
# Apply seaborn's 'talk' plotting context to all figures of this notebook.
sns.set_context('talk')
# Allow up to 99 columns before pandas truncates the display of wide frames.
pd.set_option('display.max_columns', 99)

In [None]:
# Load the project configuration; later cells read the results and images
# output directories from config['output_dirs'].
config = load_config()

# Load data

In [None]:
# SNP table: tab-separated file located in the configured results directory.
results_dir = config['output_dirs']['results']
df = pd.read_csv(f'{results_dir}/snpdb_enhanced.tsv', sep='\t')
df.head()

In [None]:
results_dir = config['output_dirs']['results']
df_enr = pd.read_csv(f'{results_dir}/TAD_enrichment.csv')

# Per-disease lookups taken from the SNP table, keyed by diseaseId.
# Going through plain dicts keeps repeated diseaseId rows harmless:
# the last occurrence of a disease simply wins.
disease_info = df[['diseaseId', 'is_cancer', 'diseaseName']].set_index('diseaseId')
iscancer_map = disease_info['is_cancer'].to_dict()
disname_map = disease_info['diseaseName'].to_dict()

# Annotate every enrichment row with the cancer flag and the disease name;
# diseases that are absent from the SNP table end up as NaN.
df_enr['is_cancer'] = df_enr['disease'].map(iscancer_map)
df_enr['disease_name'] = df_enr['disease'].map(disname_map)

df_enr.sample(5)

# Tables

## Save general table containing all information

In [None]:
# Disease id -> EFO label lookup (first CSV column is the key, 'label' the value).
efo_lbl_map = pd.read_csv(f'{results_dir}/disease_efolabels.csv', index_col=0)['label'].to_dict()

# Inner-join the SNP records with the enrichment results of the same disease.
# df_enr's own 'is_cancer' is dropped beforehand so that the merged frame
# carries the flag from df only. The '#...' count columns are renamed to
# identifier-friendly names.
df_all = (
    df.merge(df_enr.drop(columns='is_cancer'), left_on='diseaseId', right_on='disease')
      .rename(columns={'#border_snp': 'num_bordersnp', '#snp': 'num_snp'})
)

# Plain dict indexing on purpose: a disease id without an EFO label raises KeyError.
df_all['diseaseEFOName'] = df_all['diseaseId'].apply(lambda disease_id: efo_lbl_map[disease_id])

In [None]:
# Peek at a single merged row to check which columns are available.
df_all.head(1)

In [None]:
def _aggregate_row(row):
    """Collect the exported fields of one ``df_all`` row, in output column order."""
    # 'TAD_type' names which of the per-type 'TAD_<type>' columns applies to this row.
    tad_relation = getattr(row, f'TAD_{row.TAD_type}')

    # An OrderedDict makes the column order of the resulting frame explicit.
    return collections.OrderedDict([
        # disease properties
        ('diseaseId', row.diseaseId),
        ('diseaseEFOName', row.diseaseEFOName),
        ('#border_snp', row.num_bordersnp),
        ('#snp', row.num_snp),
        ('is_cancer', row.is_cancer),

        # enrichment information
        ('TAD_type', row.TAD_type),
        ('pval_boundary', row.pval_boundary),

        # SNP properties
        ('snpId', row.snpId),
        ('chromosome', row.chromosome),
        ('position', row.position),
        ('TAD_relation', tad_relation),
        ('variant_type', row.variant_type),
    ])


# One record per merged row; tqdm shows the progress of the row-wise pass.
tmp = []
for row in tqdm(df_all.itertuples(), total=len(df_all)):
    tmp.append(_aggregate_row(row))

# Remove exact duplicate records, then index and sort by (TAD_type, diseaseId).
df_agg = (
    pd.DataFrame(tmp)
    .drop_duplicates()
    .set_index(['TAD_type', 'diseaseId'])
    .sort_index()
)

In [None]:
# Persist the aggregated table as 'final.csv' in the results directory.
df_agg.to_csv(os.path.join(results_dir, 'final.csv'))

# Quick look at the size and the first rows of the table that was just saved.
print(df_agg.shape)
df_agg.head(3)

### Test special cases

## SNP lists

In [None]:
# Boundary definitions available in the SNP table: every column named
# 'TAD_<type>' contributes '<type>'.
# Only the leading prefix is stripped, so each entry round-trips through the
# f'TAD_{type}' column lookups used elsewhere in this notebook, even when a
# type name itself contains an underscore. Columns that merely contain 'TAD_'
# somewhere in the middle of their name are ignored.
TAD_boundary_types = [
    col[len('TAD_'):]
    for col in df.columns
    if isinstance(col, str) and col.startswith('TAD_')
]
TAD_boundary_types

In [None]:
# The EFO disease labels do not depend on the boundary type, so the lookup is
# read once up front instead of once per loop iteration.
efo_lbl_map = pd.read_csv(f'{results_dir}/disease_efolabels.csv', index_col=0).to_dict()['label']

# Columns written to the exported SNP lists.
snplist_columns = [
    'diseaseId', 'diseaseEFOName', 'snpId', 'chromosome', 'position', 'odds_ratio', 'variant_type'
]

for boundary_type in TAD_boundary_types:
    # merge data sources: SNP records joined with the enrichment results of this
    # boundary type. Both frames carry 'is_cancer', so the merge suffixes the
    # column ('is_cancer_x' comes from df, 'is_cancer_y' from df_enr).
    df_enr_tmp = df_enr[df_enr['TAD_type'] == boundary_type]
    tmp = df.merge(df_enr_tmp, left_on='diseaseId', right_on='disease')

    # keep diseases with pval_boundary below 0.05, then only their boundary SNPs
    tmp_s = tmp[tmp['pval_boundary'] < .05]
    tmp_b = tmp_s[tmp_s[f'TAD_{boundary_type}'] == 'boundary'].copy()

    # add EFO disease labels (a disease id without a label raises KeyError)
    tmp_b['diseaseEFOName'] = tmp_b['diseaseId'].apply(lambda x: efo_lbl_map[x])

    # generate tables: one SNP list per disease (each SNP listed once),
    # bucketed by the cancer flag of the disease
    df_lists = {True: [], False: []}
    for (is_cancer, _disease_id), group in tmp_b.groupby(['is_cancer_x', 'diseaseId']):
        # drop_duplicates already returns a new frame, no extra copy needed
        df_lists[is_cancer].append(group[snplist_columns].drop_duplicates(subset='snpId'))

    # save result
    print(boundary_type)

    if len(df_lists[True]) > 0:
        df_cancer = pd.concat(df_lists[True])
        display(df_cancer.head())
        df_cancer.to_csv(f'{results_dir}/snplist_cancer_{boundary_type}.csv', index=False)
    else:
        print(' > No cancer boundary SNPs found')

    if len(df_lists[False]) > 0:
        df_noncancer = pd.concat(df_lists[False])
        display(df_noncancer.head())
        df_noncancer.to_csv(f'{results_dir}/snplist_noncancer_{boundary_type}.csv', index=False)
    else:
        print(' > No non-cancer boundary SNPs found')

## Enriched cancers

In [None]:
# For every TAD type, show the cancers with pval_boundary below 0.05,
# smallest p-value first.
report_columns = ['disease', 'disease_name', 'pval_boundary', '#snp', '#border_snp']

for tad_type, group in df_enr.groupby('TAD_type'):
    is_enriched_cancer = group['is_cancer'] & (group['pval_boundary'] < .05)
    sub = group[is_enriched_cancer].sort_values('pval_boundary')[report_columns]

    display(tad_type, sub.head())

# Plots

In [None]:
# Directory that receives the figure files saved by the plotting cells.
images_dir = config['output_dirs']['images']

In [None]:
# rescale p-values to -log10(p): smaller p-values map to larger values
df_enr_log = df_enr.copy()

# log10(0) is undefined, so boundary p-values of exactly zero are replaced by a
# small positive floor first (1e-16, i.e. 16 on the -log10 scale).
# NOTE(review): only 'pval_boundary' gets this floor; zeros in 'pval_tad' and
# 'pval_none' turn into inf and are blanked to NaN below. Confirm this is intended.
df_enr_log.loc[df_enr_log['pval_boundary'] == 0, 'pval_boundary'] = 1e-16

# One vectorized transform for all p-value columns instead of an elementwise
# apply per column.
pval_columns = ['pval_boundary', 'pval_tad', 'pval_none']
df_enr_log[pval_columns] = -np.log10(df_enr_log[pval_columns])

# Infinite values (from p == 0 in the unfloored columns) are treated as missing.
df_enr_log = df_enr_log.replace([np.inf, -np.inf], np.nan)

## TAD border enrichment

In [None]:
# filter data: cap the -log10 p-values at max_x so that the long tail is
# collected in the right-most bin instead of stretching the x-axis
max_x = 3  # df_enr_log['pval_boundary'].max()
tmp = df_enr_log.copy()
tmp.loc[tmp['pval_boundary'] > max_x, 'pval_boundary'] = max_x

# plot: one panel per TAD type, cancer and non-cancer diseases overlaid by hue
g = sns.FacetGrid(
    tmp, col='TAD_type', col_wrap=3, hue='is_cancer',
    col_order=TAD_boundary_types,
    height=5, aspect=1.2)

# Ten bins per unit of -log10(p). N bins need N + 1 edges, and the bin count
# is coerced to int so that a non-integer max_x (e.g. the data maximum) works.
n_bins = int(np.ceil(max_x * 10))
bins = np.linspace(0, max_x, n_bins + 1)

g = g.map(sns.distplot, 'pval_boundary', hist_kws=dict(alpha=.5), bins=bins, kde=False, norm_hist=True)
# dashed red line marks the p = 0.05 significance threshold on the log scale
g = g.map(plt.axvline, x=-np.log10(.05), color='red', linestyle='dashed')

g.set(xlim=(0, max_x))
g.add_legend()

plt.suptitle(rf'values $>{max_x}$ are set to ${max_x}$')

# leave room above the panels for the figure title
plt.subplots_adjust(top=.92)
plt.savefig(f'{images_dir}/tad_border_enrichment.pdf')