In [None]:
%matplotlib inline

In [None]:
import os
import collections

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

from tqdm import tqdm_notebook as tqdm

from utils import load_config
from tad_helper_functions import get_tad_lengths, EmptyTAD, OverlappingTADS

In [None]:
sns.set_context('talk')

In [None]:
config = load_config()

In [None]:
# output locations taken from the loaded configuration:
# `results_dir` is read from (CSV/TSV tables), `images_dir` is written to (PDF figures)
results_dir = config['output_dirs']['results']
images_dir = config['output_dirs']['images']

# Old enrichments vs new enrichments

## Read data

In [None]:
# load the enrichment tables of the old and the new analysis version
df_enr_old = pd.read_csv('data/TAD_enrichment__old_version_adjusted.csv')
df_enr_new = pd.read_csv(f'{results_dir}/TAD_enrichment.csv')

# only diseases which occur in both tables can be compared
diseases_new = set(df_enr_new['disease'].tolist())
diseases_old = set(df_enr_old['disease'].tolist())
shared_diseases = list(diseases_new & diseases_old)

print('#diseases (old):', df_enr_old['disease'].unique().size)
print('#diseases (new):', df_enr_new['disease'].unique().size)
print('#diseases (shared):', len(shared_diseases))

df_enr_old = df_enr_old[df_enr_old['disease'].isin(shared_diseases)]
df_enr_new = df_enr_new[df_enr_new['disease'].isin(shared_diseases)]
assert (
    df_enr_old['disease'].unique().size
    == df_enr_new['disease'].unique().size
    == len(shared_diseases)
)

# attach the cancer classification of every disease and index both tables by
# (disease, TAD_type); the merge is an inner join, so diseases without an
# entry in the classification table are dropped
df_iscancer = pd.read_csv(f'{results_dir}/disease_cancer_classification.csv')

df_enr_new = (
    df_enr_new
    .merge(df_iscancer, left_on='disease', right_on='diseaseId')
    .drop(columns='diseaseId')
    .set_index(['disease', 'TAD_type'])
)
df_enr_old = (
    df_enr_old
    .merge(df_iscancer, left_on='disease', right_on='diseaseId')
    .drop(columns='diseaseId')
    .set_index(['disease', 'TAD_type'])
)

In [None]:
df_enr_new.sample(5)

## Plot result

In [None]:
pval_cols = [c for c in df_enr_new.columns if c.startswith('pval')]
def mylog(values):
    """Element-wise log10 which yields NaN for non-positive (and NaN) entries.

    Implemented with a masked ufunc call instead of `np.vectorize`, because a
    vectorized lambda without `otypes` raises a ValueError for empty input.

    Parameters
    ----------
    values : array-like of numbers (scalar, list, ndarray or Series)

    Returns
    -------
    numpy.ndarray of floats with the same shape as `values`
    """
    arr = np.asarray(values, dtype=float)
    result = np.full(arr.shape, np.nan)
    with np.errstate(invalid='ignore'):  # NaN comparisons may warn on old numpy
        positive = arr > 0
    # entries outside the mask keep their NaN initialisation
    np.log10(arr, out=result, where=positive)
    return result


def plot(df_old, df_new, s_genediff, ax, marker='o', colorbar=True):
    """Scatter old against new p-values on `ax` (both as -log10, log-log axes).

    Parameters
    ----------
    df_old, df_new : array-like
        Old and new p-values; combined position by position.
    s_genediff : array-like
        Absolute count difference per point; its log10 defines the color.
    ax : matplotlib Axes to draw on.
    marker : marker style of the scatter points.
    colorbar : whether this call adds a colorbar.

    Besides the points, dashed red lines mark p = 0.05 on both axes and a
    dashed grey diagonal marks identical old and new values.

    NOTE(review): no explicit color limits are passed, so when several calls
    share one axes the colors are presumably scaled per call and a single
    colorbar only describes the call which drew it — confirm.
    """
    lbl = '-log(p_value)'
    df_tmp = pd.DataFrame({
        f'{lbl} (old)': -mylog(df_old),
        f'{lbl} (new)': -mylog(df_new),
        'log(abs(#snp diff))': mylog(s_genediff)
    })
    # nothing to scatter for an empty selection (e.g. no cancer diseases)
    if not df_tmp.empty:
        df_tmp.plot(
            x=f'{lbl} (old)', y=f'{lbl} (new)', c='log(abs(#snp diff))',
            colormap='viridis', kind='scatter', ax=ax, loglog=True,
            colorbar=colorbar, marker=marker, legend=False)
    ax.set_aspect('equal')

    # significance threshold p = 0.05 on both axes
    ax.axvline(x=-np.log10(.05), color='red', linestyle='dashed', linewidth=.5)
    ax.axhline(y=-np.log10(.05), color='red', linestyle='dashed', linewidth=.5)

    def diagonal_ends():
        # smallest lower and largest upper limit over both axes
        lower = min(ax.get_xlim()[0], ax.get_ylim()[0])
        upper = max(ax.get_xlim()[1], ax.get_ylim()[1])
        return [lower, upper]

    # hacky way to draw a diagonal line: span the current limits and update
    # the line whenever the limits change
    ends = diagonal_ends()
    diag_line, = ax.plot(ends, ends, color='grey', linestyle='dashed', linewidth=.5)

    def on_change(axes):
        ends = diagonal_ends()
        diag_line.set_data(ends, ends)

    ax.callbacks.connect('xlim_changed', on_change)
    ax.callbacks.connect('ylim_changed', on_change)

    
TAD_type_list = df_enr_new.index.get_level_values('TAD_type').unique().tolist()

# panel grid: two rows; the column count is rounded up so that an odd number
# of TAD types does not lose its last panel (and a single type still works)
n_rows = 2
n_cols = max(1, (len(TAD_type_list) + 1) // n_rows)
panel_size = 6  # edge length of one panel in inches

# one figure per p-value column, one panel per TAD type
for pcol in pval_cols:
    fig, ax_arr = plt.subplots(
        n_rows, n_cols, squeeze=False,
        figsize=(n_cols * panel_size, n_rows * panel_size))
    panels = list(ax_arr.flatten())

    groups_new = list(df_enr_new.groupby('TAD_type'))
    for (tad_type, group_new), ax in zip(groups_new, panels):
        # matching rows of the old table, in the order of the new table
        group_old = df_enr_old.loc[group_new.index]

        ax.set_title(tad_type)

        # cancer and non-cancer diseases get different markers;
        # only the first call adds the colorbar
        selections = [
            (group_old.is_cancer, group_new.is_cancer, '^', True),
            (~group_old.is_cancer, ~group_new.is_cancer, 's', False),
        ]
        for mask_old, mask_new, marker, colorbar in selections:
            plot(
                group_old.loc[mask_old, pcol], group_new.loc[mask_new, pcol],
                abs(group_old.loc[mask_old, 'gene_num'] - group_new.loc[mask_new, '#snp']),
                ax, marker=marker, colorbar=colorbar)

        # plot legend
        legend_patches = [
            mlines.Line2D([], [], color='black', marker='^', markersize=15, linewidth=0, label='cancer'),
            mlines.Line2D([], [], color='black', marker='s', markersize=15, linewidth=0, label='non-cancer')
        ]
        ax.legend(handles=legend_patches, loc='best')

    # panels without a TAD type (odd number of types) stay empty: hide them
    for ax in panels[len(groups_new):]:
        ax.set_visible(False)

    fig.suptitle(pcol)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave room for the suptitle
    fig.savefig(f'{images_dir}/enrichment_comparison_oldnew_{pcol}.pdf')

## Consider specific examples

### Largest p-value difference

In [None]:
# boundary p-values of all cancer diseases for the '20in' TAD type
pval_new = df_enr_new[df_enr_new.is_cancer].xs('20in', level='TAD_type')['pval_boundary']
pval_old = df_enr_old[df_enr_old.is_cancer].xs('20in', level='TAD_type')['pval_boundary']

# disease with the largest absolute p-value change between both versions
max_efo = (pval_new - pval_old).abs().idxmax()
print('EFO with largest p-value change:', max_efo)

print('> Old:\n', df_enr_old.loc[(max_efo, '20in')])
print('> New:\n', df_enr_new.loc[(max_efo, '20in')])

### P-value difference with same number of SNPs

In [None]:
# rows whose count is identical in the old ('gene_num') and new ('#snp') table
same_count = (df_enr_new['#snp'] - df_enr_old['gene_num']) == 0

# Both conditions are combined into ONE mask. Chaining two boolean selections
# would apply a mask built on the full table to an already filtered table,
# which pandas can only resolve by re-aligning the mask (with a warning).
idx_rel = (
    df_enr_new[same_count & df_enr_new.is_cancer]
    .xs('20in', level='TAD_type')
    .index
)

In [None]:
# boundary p-values ('20in' TAD type) restricted to the diseases in `idx_rel`
pval_new_rel = df_enr_new.xs('20in', level='TAD_type').loc[idx_rel, 'pval_boundary']
pval_old_rel = df_enr_old.xs('20in', level='TAD_type').loc[idx_rel, 'pval_boundary']

# disease with the largest absolute p-value change among them
max_efo = (pval_new_rel - pval_old_rel).abs().idxmax()

print('EFO with largest p-value change:', max_efo)

print('> Old:\n', df_enr_old.loc[(max_efo, '20in')])
print('> New:\n', df_enr_new.loc[(max_efo, '20in')])

# TAD statistics

## Prepare data

In [None]:
df_tads = pd.read_table(f'{results_dir}/tads_hg38.tsv')

# derived columns: length of each TAD plus the coordinates and chromosome of
# the previous/next row (NaN for the first/last row)
# assumes rows are ordered by chromosome and position — TODO confirm
df_tads = df_tads.assign(
    tad_len=df_tads['tad_stop'] - df_tads['tad_start'],
    prev_tad_stop=df_tads['tad_stop'].shift(1),
    next_tad_start=df_tads['tad_start'].shift(-1),
    prev_tad_chr=df_tads['chrname'].shift(1),
    next_tad_chr=df_tads['chrname'].shift(-1),
)

df_tads.sample(5)

## TAD (boundary) lengths

In [None]:
# border type identifiers; each one is passed to `get_tad_lengths` for every TAD
TAD_border_types = ['20in', '40in', '20out', '40out', '20inout', '40inout']

In [None]:
# one record per (border type, TAD) with the lengths of the TAD body and of
# both borders; TADs rejected by `get_tad_lengths` are only counted
tad_data = []
for type_ in TAD_border_types:
    tad_errors = collections.Counter()
    for row in df_tads.itertuples():
        try:
            b1_range, tad_range, b2_range = get_tad_lengths(row, type_)
        except (EmptyTAD, OverlappingTADS) as ex:
            tad_errors[type(ex).__name__] += 1
        else:
            tad_data.append(dict(
                tad_type=type_,
                tad_length=tad_range.stop - tad_range.start,
                b1_length=b1_range.stop - b1_range.start,
                b2_length=b2_range.stop - b2_range.start,
            ))

    print(f'[{type_}] Errors counts:', dict(tad_errors))

df_tad_len = pd.DataFrame(tad_data)
df_tad_len.sample(5)

In [None]:
# long format: one row per (tad_type, kind of length, value)
length_columns = ['tad_length', 'b1_length', 'b2_length']
df_tad_len_long = df_tad_len.melt(id_vars=['tad_type'], value_vars=length_columns)
df_tad_len_long.sample(5)

In [None]:
# histogram of the border lengths (b1 and b2 pooled), one panel per TAD type,
# two panels per row; rows holding the length of the TAD body are excluded
g = sns.FacetGrid(
    col='tad_type', col_wrap=2,
    sharex=False, sharey=False,
    data=df_tad_len_long[df_tad_len_long['variable']!='tad_length'],
    aspect=2.3)

# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11
g.map(sns.distplot, 'value', kde=False)
plt.suptitle('TAD boundary lengths')

# show plain tick values instead of an offset notation
for ax in g.axes.flatten():
    ax.ticklabel_format(useOffset=False)
    
# order matters: tighten the layout first, then shrink the top
# (presumably to make room for the suptitle)
plt.tight_layout()
plt.subplots_adjust(top=0.9)

plt.savefig(f'{images_dir}/tadboundary_lengths.pdf')

## Inter-TAD distances

### Compute distances

In [None]:
def func2prev(row):
    """Gap between the start of a TAD and the stop of the previous row's TAD.

    Returns NaN if the previous row lies on a different chromosome (or does
    not exist); the value is negative if both TADs overlap.
    """
    if row.chrname != row.prev_tad_chr:
        return np.nan
    return row.tad_start - row.prev_tad_stop
def func2next(row):
    """Gap between the stop of a TAD and the start of the next row's TAD.

    Returns NaN if the next row lies on a different chromosome (or does not
    exist); the value is negative if both TADs overlap.
    """
    if row.chrname != row.next_tad_chr:
        return np.nan
    return row.next_tad_start - row.tad_stop

# row-wise gaps to the neighbouring TADs; NaN where the neighbouring row
# belongs to a different chromosome
df_tads['dist_to_prev_tad'] = df_tads.apply(func2prev, axis=1)
df_tads['dist_to_next_tad'] = df_tads.apply(func2next, axis=1)
df_tads.sample(5)

### Investigate peculiar distances

In [None]:
# negative gaps: the TAD starts before the previous one has ended
df_tads.loc[df_tads['dist_to_prev_tad'] < 0, 'dist_to_prev_tad']

In [None]:
# NOTE(review): hard-coded row labels, presumably taken from the negative
# distances listed above — re-check whenever the TAD table changes
df_tads.loc[191:193]

### Histograms

In [None]:
# pool the gaps to the previous and to the next TAD
# NOTE(review): the gap between two consecutive TADs appears in both columns,
# so it presumably enters the histogram twice — confirm this is intended
all_dists = pd.concat([df_tads['dist_to_prev_tad'], df_tads['dist_to_next_tad']]).dropna()

# drop overlaps (negative gaps) and gaps of 200 kb or more
in_range = (all_dists >= 0) & (all_dists < 200_000)
inter_dists = all_dists[in_range]

ax = sns.distplot(inter_dists, kde=False)

ax.set_yscale('log')
ax.set_xlabel('Inter-TAD distance')
ax.set_ylabel('Count')

ax.figure.tight_layout()
ax.figure.savefig(f'{images_dir}/intertad_distances.pdf')

# GWAS history

## Acquire data

In [None]:
# directory which holds the local mirror of the GWAS Catalog releases
gwashist_dir = './cache/gwas_history/'

# The download only runs if the directory does not exist yet.
# NOTE(review): an interrupted download leaves the directory behind, so a
# partial mirror would be treated as cached — delete the directory to retry.
if not os.path.isdir(gwashist_dir):
    # download all versions: recursive mirror (-m) below the releases folder
    # (--no-parent), keeping only the annotated association tables (-A),
    # stored below `gwashist_dir` (-P)
    !wget \
        -m \
        --no-parent \
        -A "gwas-catalog-associations_ontology-annotated.tsv" \
        -P $gwashist_dir \
        ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/
else:
    print('Cached', gwashist_dir)

In [None]:
# parse data
def _scan_sorted(path, keep):
    """Return the entries of directory `path` accepted by `keep`, sorted by name."""
    # the context manager closes the directory handle once the listing is read
    with os.scandir(path) as entries:
        return sorted((e for e in entries if keep(e)), key=lambda e: e.name)


def _is_dir(entry):
    """True for directory entries."""
    return entry.is_dir()


def _is_tsv(entry):
    """True for entries whose name ends with '.tsv'."""
    return entry.name.endswith('.tsv')


# parse data
# expected layout of the mirror: <releases>/<year>/<month>/<day>/<table>.tsv
data = []
for year in _scan_sorted(f'{gwashist_dir}/ftp.ebi.ac.uk/pub/databases/gwas/releases/', _is_dir):
    for month in _scan_sorted(year.path, _is_dir):
        for day in _scan_sorted(month.path, _is_dir):
            for entry in _scan_sorted(day.path, _is_tsv):
                timestamp = f'{year.name}.{month.name}.{day.name}'
                data.append((timestamp, entry.path, pd.read_table(entry.path, low_memory=False)))

# one row per release; the 'dataframe' column holds the parsed table itself
df_gwashist = pd.DataFrame(data, columns=['timestamp', 'path', 'dataframe'])
df_gwashist['timestamp'] = pd.to_datetime(df_gwashist['timestamp'])

# Directory listings come in arbitrary order, but positional access
# (first/last release) and line plots over time need a chronological index.
df_gwashist = df_gwashist.set_index('timestamp').sort_index()

In [None]:
df_gwashist.sample(5)

In [None]:
# widen the display (affects all following cells) and show three random rows
# of the table stored in the last row of `df_gwashist`
pd.set_option('display.max_columns', 99)
df_gwashist.iloc[-1].dataframe.sample(3)

## Filter for diseases that appear in both old and new data

In [None]:
def filter_common_efo_terms(df, efo_terms=None):
    """Keep the rows of a GWAS table whose EFO term is one of `efo_terms`.

    Rows without 'MAPPED_TRAIT_URI' are dropped. If a row lists several
    comma-separated URIs, only the alphabetically first one is considered;
    its last path component is compared against `efo_terms`.

    Parameters
    ----------
    df : pandas.DataFrame with a 'MAPPED_TRAIT_URI' column.
    efo_terms : collection of EFO identifiers, optional
        Defaults to the notebook-level `shared_diseases`.

    Returns
    -------
    pandas.DataFrame with the matching rows of `df`.
    """
    if efo_terms is None:
        efo_terms = shared_diseases

    df_mapped = df.dropna(subset=['MAPPED_TRAIT_URI'])
    if df_mapped.empty:
        # nothing to match; also avoids the `.str` accessor on a column
        # which holds no strings at all
        return df_mapped

    first_efo = (
        df_mapped['MAPPED_TRAIT_URI']
        .str.split(' *, *')
        .apply(lambda uris: sorted(uris)[0].split('/')[-1])
    )
    return df_mapped.loc[first_efo.isin(efo_terms)]

In [None]:
# filtered copy of every release table (see `filter_common_efo_terms`)
df_gwashist['dataframe_sub'] = df_gwashist['dataframe'].apply(filter_common_efo_terms)

In [None]:
# sanity check on the first row: table shape before and after filtering
print(df_gwashist.iloc[0].dataframe.shape)
print(df_gwashist.iloc[0].dataframe_sub.shape)

## Plot size development

In [None]:
# per release: number of rows and number of distinct SNP ids of the filtered
# table (use the 'dataframe' column instead for the unfiltered tables)
filtered_tables = df_gwashist['dataframe_sub']

df_gwashist['size'] = filtered_tables.map(len)
df_gwashist['snp_num'] = filtered_tables.map(
    lambda table: table['SNP_ID_CURRENT'].unique().size)

In [None]:
# development of the catalog size over the release dates
fig, ax = plt.subplots(figsize=(8, 6))

df_gwashist['size'].plot(ax=ax, label='#total entries (rows)')
df_gwashist['snp_num'].plot(ax=ax, label='#unique SNPs')
ax.set_xlabel('Release date')
ax.set_ylabel('GWAS-Catalog size [#entries]')

ax.legend(loc='best')

fig.tight_layout()
fig.savefig(f'{images_dir}/gwas_history.pdf')