In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

from tqdm import tqdm_notebook as tqdm

In [None]:
# Use seaborn's 'talk' plotting context for every figure created below.
sns.set_context(context='talk')

## GWAS history

### Acquire data

In [None]:
gwashist_dir = './cache/gwas_history/'

if not os.path.isdir(gwashist_dir):
    # download all versions
    !wget -m -A "gwas-catalog-associations.tsv" ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases -P $gwashist_dir
else:
    print('Cached', gwashist_dir)

In [None]:
# parse data
def _sorted_entries(path):
    """Return the directory entries of `path` sorted by name.

    `os.scandir` yields entries in arbitrary order; sorting makes the walk
    reproducible, and the `with` block closes the underlying iterator.
    """
    with os.scandir(path) as entries:
        return sorted(entries, key=lambda e: e.name)


# Root of the mirrored release tree, laid out as <year>/<month>/<day>/<file>.tsv
releases_root = os.path.join(gwashist_dir, 'ftp.ebi.ac.uk/pub/databases/gwas/releases')

# One record per release file: (date string, file path, parsed table).
data = []
for year in _sorted_entries(releases_root):
    if not year.is_dir():
        continue
    for month in _sorted_entries(year.path):
        if not month.is_dir():
            continue
        for day in _sorted_entries(month.path):
            if not day.is_dir():
                continue
            for entry in _sorted_entries(day.path):
                if not entry.name.endswith('.tsv'):
                    continue

                timestamp = f'{year.name}.{month.name}.{day.name}'
                data.append((timestamp, entry.path, pd.read_table(entry.path, low_memory=False)))

# Index by release date and sort, so downstream cells see the releases in
# chronological order regardless of how the filesystem listed them.
df_gwashist = (
    pd.DataFrame(data, columns=['timestamp', 'path', 'dataframe'])
    .assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))
    .set_index('timestamp')
    .sort_index()
)

In [None]:
# Peek at five randomly chosen releases.
df_gwashist.sample(n=5)

### Plot catalog size over time

In [None]:
# Number of rows (catalog entries) in each release table.
df_gwashist['size'] = df_gwashist['dataframe'].map(len)

In [None]:
# Line plot of the catalog size per release, written to a PDF.
fig, ax = plt.subplots()

df_gwashist['size'].plot(ax=ax)
ax.set_xlabel('Release date')
ax.set_ylabel('GWAS-Catalog size [#entries]')

fig.tight_layout()
fig.savefig('images/gwas_history.pdf')

## Old enrichments vs new enrichments

### Read data

In [None]:
def _annotate_cancer(df_enr, df_class):
    """Join the classification columns onto the enrichment rows.

    Inner-merges `df_enr['disease']` with `df_class['diseaseId']`, drops the
    redundant key column and indexes the result by disease.  Rows whose
    disease has no entry in `df_class` are removed by the inner merge.
    """
    return (
        df_enr
        .merge(df_class, left_on='disease', right_on='diseaseId')
        .drop(columns='diseaseId')
        .set_index('disease')
    )


# read data
df_enr_old = pd.read_csv('results/TAD_enrichment__old_version.csv')
df_enr_new = pd.read_csv('results/TAD_enrichment.csv')
df_iscancer = pd.read_csv('results/disease_cancer_classification.csv')

# subset to shared diseases
# Sorted, because a bare set has no stable order and this list is later used
# with `reindex`, i.e. it determines the row order of the compared frames.
shared_diseases = sorted(
    set(df_enr_new['disease'].dropna()) & set(df_enr_old['disease'].dropna()))

print('#diseases (old):', df_enr_old['disease'].unique().size)
print('#diseases (new):', df_enr_new['disease'].unique().size)
print('#diseases (shared):', len(shared_diseases))

# The inner merge below drops every disease without a classification.  Remove
# those from `shared_diseases` as well; otherwise a later `reindex` on this
# list re-creates them as all-NaN rows and boolean masking on `is_cancer` fails.
classified = set(df_iscancer['diseaseId'])
n_shared = len(shared_diseases)
shared_diseases = [d for d in shared_diseases if d in classified]
if len(shared_diseases) < n_shared:
    print('#diseases (shared, skipped: no cancer classification):', n_shared - len(shared_diseases))

df_enr_old = df_enr_old[df_enr_old['disease'].isin(shared_diseases)]
df_enr_new = df_enr_new[df_enr_new['disease'].isin(shared_diseases)]

# annotate diseases
df_enr_old = _annotate_cancer(df_enr_old, df_iscancer)
df_enr_new = _annotate_cancer(df_enr_new, df_iscancer)

In [None]:
# Peek at five randomly chosen rows of the annotated new enrichments.
df_enr_new.sample(n=5)

### Plot result

In [None]:
# Every column holding p-values; one comparison figure is drawn per column.
pval_cols = [c for c in df_enr_new.columns if c.startswith('pval')]

# Element-wise log10 that returns NaN for anything that is not > 0
# (zero, negative values and NaN) instead of -inf or a runtime warning.
# `otypes` is set explicitly: without it np.vectorize cannot infer the output
# type from size-0 input and raises ValueError, e.g. for an empty group.
mylog = np.vectorize(lambda x: np.log10(x) if x > 0 else np.nan, otypes=[float])

def plot(df_old, df_new, s_genediff, ax, marker='o', colorbar=True):
    """Scatter old against new p-values on `ax`.

    Both p-value vectors are transformed to -log10(p) and drawn on log-scaled
    axes limited to [0.1, 10]; each point is coloured by log10 of
    `s_genediff`.  Dashed red lines mark p = 0.05 on both axes.

    Parameters
    ----------
    df_old, df_new : array-like
        p-values of the old and the new analysis.  They are paired by
        position and must have the same length as `s_genediff`.
    s_genediff : array-like
        Per-point value that is log10-transformed and mapped to colour.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    marker : str
        Marker style handed to the scatter plot.
    colorbar : bool
        Whether to attach a colorbar for this scatter.

    Nothing is returned; `ax` is modified in place.  Empty input skips the
    scatter and only draws the threshold lines.
    """
    # Guard before any transformation, so empty groups neither reach the log
    # helper nor pandas' plotting code, both of which may reject empty data.
    if len(df_old) > 0:
        lbl = '-log(p_value)'
        # NOTE(review): values <= 0 become NaN in `mylog`, so points whose
        # `s_genediff` is 0 get a NaN colour - confirm how they should render.
        df_tmp = pd.DataFrame({
            f'{lbl} (old)': -mylog(df_old),
            f'{lbl} (new)': -mylog(df_new),
            'log(#snp diff)': mylog(s_genediff)
        })
        df_tmp.plot(
            x=f'{lbl} (old)', y=f'{lbl} (new)', c='log(#snp diff)',
            colormap='viridis', kind='scatter', ax=ax, loglog=True,
            colorbar=colorbar, marker=marker, legend=False,
            xlim=(1e-1, 1e1), ylim=(1e-1, 1e1))
    ax.set_aspect('equal')

    # Significance threshold p = 0.05 on both axes, drawn with one shared style
    # (the vertical line used to be thinner than the horizontal one).
    threshold = -np.log10(.05)
    line_style = dict(color='red', linestyle='dashed', linewidth=.5)
    ax.axvline(x=threshold, **line_style)
    ax.axhline(y=threshold, **line_style)

# Legend handles are the same for every panel, so build them once.
legend_patches = [
    mlines.Line2D([], [], color='black', marker='^', markersize=15, linewidth=0, label='cancer'),
    mlines.Line2D([], [], color='black', marker='s', markersize=15, linewidth=0, label='non-cancer')
]

# One panel per TAD type, arranged in two rows.  The column count is rounded
# up: with floor division an odd number of TAD types silently lost its last
# panel, and a single TAD type produced a grid with zero columns.
tad_groups = list(df_enr_new.groupby('TAD_type'))
n_rows = 2
n_cols = max(1, (len(tad_groups) + n_rows - 1) // n_rows)
panel_size = 6  # edge length of one panel in inches

for pcol in pval_cols:
    fig, ax_arr = plt.subplots(
        n_rows, n_cols,
        figsize=(n_cols * panel_size, n_rows * panel_size),
        squeeze=False)
    axes = ax_arr.flatten()

    for (tad_type, group_new), ax in zip(tad_groups, axes):
        group_old = df_enr_old[df_enr_old['TAD_type'] == tad_type].reindex(shared_diseases)
        group_new = group_new.reindex(shared_diseases)

        # `reindex` inserts all-NaN rows for diseases that have no entry for
        # this TAD type in one of the frames; keep only diseases present in
        # both, otherwise the boolean masks below contain NaN and fail.
        in_both = group_old['is_cancer'].notna() & group_new['is_cancer'].notna()
        group_old, group_new = group_old[in_both], group_new[in_both]
        is_cancer = group_new['is_cancer'].astype(bool)

        ax.set_title(tad_type)

        # NOTE(review): the old results expose the count as 'gene_num', the new
        # ones as '#snp' - confirm that these two columns are comparable.
        count_diff = (group_old['gene_num'] - group_new['#snp']).abs()

        # cancer first (with colorbar), then non-cancer on the same axes
        for mask, marker, with_colorbar in ((is_cancer, '^', True), (~is_cancer, 's', False)):
            plot(
                group_old.loc[mask, pcol], group_new.loc[mask, pcol],
                count_diff[mask],
                ax, marker=marker, colorbar=with_colorbar)

        # plot legend
        ax.legend(handles=legend_patches, loc='best')

    # hide grid cells that did not receive a TAD type
    for unused_ax in axes[len(tad_groups):]:
        unused_ax.set_visible(False)

    fig.suptitle(pcol)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.savefig(f'images/enrichment_comparison_oldnew_{pcol}.pdf')