In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
# Apply seaborn's 'talk' plotting context to every figure created below.
sns.set_context(context='talk')

## GWAS history

### Acquire data

In [None]:
gwashist_dir = './cache/gwas_history/'

if not os.path.isdir(gwashist_dir):
    # download all versions
    !wget -m -A "gwas-catalog-associations.tsv" ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases -P $gwashist_dir
else:
    print('Cached', gwashist_dir)

In [None]:
# parse data
# Walk the mirrored release tree (<releases>/<year>/<month>/<day>/<file>.tsv) and
# collect one record per release: (release date string, file path, parsed table).
# Anything that is not a directory at the year/month/day levels is skipped.
data = []
for year in os.scandir(f'{gwashist_dir}/ftp.ebi.ac.uk/pub/databases/gwas/releases/'):
    if not year.is_dir():
        continue
    for month in os.scandir(year.path):
        if not month.is_dir():
            continue
        for day in os.scandir(month.path):
            if not day.is_dir():
                continue
            for entry in os.scandir(day.path):
                if not entry.name.endswith('.tsv'):
                    continue

                timestamp = f'{year.name}.{month.name}.{day.name}'
                data.append((timestamp, entry.path, pd.read_table(entry.path, low_memory=False)))

df_gwashist = pd.DataFrame(data, columns=['timestamp', 'path', 'dataframe'])
df_gwashist['timestamp'] = pd.to_datetime(df_gwashist['timestamp'])
# os.scandir yields entries in arbitrary order, so sort by release date to get a
# chronological, reproducible history (and a monotonic index for date slicing).
df_gwashist = df_gwashist.set_index('timestamp').sort_index()

In [None]:
# Peek at a few random releases. The sample size is capped because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (e.g. when fewer than five releases have been cached).
df_gwashist.sample(min(5, len(df_gwashist)))

### Plot size development

In [None]:
# Number of rows (catalog entries) in each release's table.
df_gwashist['size'] = df_gwashist['dataframe'].apply(len)

In [None]:
# Catalog size per release, drawn on an explicit figure/axes pair and saved as PDF.
fig_hist, ax_hist = plt.subplots()

df_gwashist['size'].plot(ax=ax_hist)
ax_hist.set_xlabel('Release date')
ax_hist.set_ylabel('GWAS-Catalog size [#entries]')

fig_hist.tight_layout()
fig_hist.savefig('images/gwas_history.pdf')

## Old enrichments vs new enrichments

### Read data

In [None]:
# Load the enrichment results of the previous and the current analysis run.
df_enr_old = pd.read_csv('results/TAD_enrichment__old_version.csv')
df_enr_new = pd.read_csv('results/TAD_enrichment.csv')

# Report how many distinct diseases each result set covers.
n_diseases_old = len(df_enr_old['disease'].unique())
n_diseases_new = len(df_enr_new['disease'].unique())
print('#diseases (old):', n_diseases_old)
print('#diseases (new):', n_diseases_new)

In [None]:
# Diseases that occur in both result sets.
diseases_new = set(df_enr_new['disease'].tolist())
diseases_old = set(df_enr_old['disease'].tolist())
shared_diseases = list(diseases_new & diseases_old)

# Keep only the shared diseases and move 'disease' into the index.
# NOTE: this overwrites both frames, so re-running the cell without re-reading
# the CSVs fails because the 'disease' column no longer exists.
keep_old = df_enr_old['disease'].isin(shared_diseases)
df_enr_old = df_enr_old.loc[keep_old].set_index('disease')
keep_new = df_enr_new['disease'].isin(shared_diseases)
df_enr_new = df_enr_new.loc[keep_new].set_index('disease')

### Plot result

In [None]:
### Plot result

In [None]:
# Every column of the new results whose name starts with 'pval'.
pval_cols = [col_name for col_name in df_enr_new.columns if col_name.startswith('pval')]
def mylog(x):
    """Element-wise base-10 logarithm that yields NaN for non-positive input.

    Parameters
    ----------
    x : scalar or array-like of numbers
        Values to transform (lists, ndarrays and pandas Series all work).

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``x``; entries that are zero, negative
        or NaN in the input are NaN in the output. Empty input gives an
        empty array.
    """
    values = np.asarray(x, dtype=float)
    result = np.full(values.shape, np.nan)
    # Evaluate log10 only where it is defined; every other slot keeps its NaN fill.
    np.log10(values, out=result, where=values > 0)
    return result

# Compare old and new enrichment p-values: one figure per p-value column, one
# scatter panel per TAD type (x = old, y = new, both as -log10 p via mylog),
# coloured by the log10 of the absolute difference in SNP counts.
tad_groups = list(df_enr_new.groupby('TAD_type'))
# Two rows of panels; round the column count up so an odd number of TAD types
# (or a single one) still gets a panel each instead of being silently dropped.
n_cols = max(1, (len(tad_groups) + 1) // 2)
lbl = '-log(p_value)'

for pcol in pval_cols:
    # squeeze=False keeps ax_arr two-dimensional for every grid size
    fig, ax_arr = plt.subplots(2, n_cols, figsize=(20, 8), squeeze=False)
    axes = ax_arr.flatten()

    for (tad_type, group_new), ax in zip(tad_groups, axes):
        # Align both versions on the same disease order so rows pair up.
        group_old = df_enr_old[df_enr_old['TAD_type'] == tad_type].reindex(shared_diseases)
        group_new = group_new.reindex(shared_diseases)

        ax.set_title(tad_type)
        # NOTE(review): the old results are read from 'gene_num' and the new ones
        # from '#snp' -- presumably the column was renamed between versions; confirm.
        snp_diff = (group_old['gene_num'] - group_new['#snp']).abs()
        df_tmp = pd.DataFrame({
            f'{lbl} (old)': -mylog(group_old[pcol]),
            f'{lbl} (new)': -mylog(group_new[pcol]),
            'log(#snp diff)': mylog(snp_diff)
        })
        df_tmp.plot(
            x=f'{lbl} (old)', y=f'{lbl} (new)', c='log(#snp diff)',
            colormap='viridis', kind='scatter', ax=ax, loglog=True)
        ax.set_aspect('equal')

    # Hide panels that have no TAD type assigned (odd number of types).
    for unused_ax in axes[len(tad_groups):]:
        unused_ax.set_visible(False)

    fig.suptitle(pcol)
    # Leave room at the top for the figure-level title.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.savefig(f'images/enrichment_comparison_oldnew_{pcol}.pdf')