In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
# Select seaborn's 'talk' plotting context for the figures created below.
sns.set_context('talk')

## GWAS history

### Acquire data

In [None]:
gwashist_dir = './cache/gwas_history/'

if not os.path.isdir(gwashist_dir):
    # download all versions
    !wget -m -A "gwas-catalog-associations.tsv" ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases -P $gwashist_dir
else:
    print('Cached', gwashist_dir)

In [None]:
# Parse every cached release into one table with one row per release file.
# Expected mirror layout: <releases>/<year>/<month>/<day>/<name>.tsv
releases_root = os.path.join(
    gwashist_dir, 'ftp.ebi.ac.uk', 'pub', 'databases', 'gwas', 'releases')


def _sorted_entries(path, keep):
    """Return the directory entries of `path` accepted by `keep`, sorted by name.

    `os.scandir` yields entries in arbitrary order; sorting makes the result
    independent of the file system.
    """
    with os.scandir(path) as entries:
        return sorted((e for e in entries if keep(e)), key=lambda e: e.name)


def _is_dir(entry):
    """Tell whether a directory entry is itself a directory."""
    return entry.is_dir()


def _is_tsv(entry):
    """Tell whether a directory entry's name ends with '.tsv'."""
    return entry.name.endswith('.tsv')


data = []
for year in _sorted_entries(releases_root, _is_dir):
    for month in _sorted_entries(year.path, _is_dir):
        for day in _sorted_entries(month.path, _is_dir):
            for entry in _sorted_entries(day.path, _is_tsv):
                # the release date is encoded in the directory names
                timestamp = f'{year.name}.{month.name}.{day.name}'
                table = pd.read_table(entry.path, low_memory=False)
                data.append((timestamp, entry.path, table))

df_gwashist = (
    pd.DataFrame(data, columns=['timestamp', 'path', 'dataframe'])
    # explicit format: the strings are built above as '<year>.<month>.<day>'
    .assign(timestamp=lambda df: pd.to_datetime(df['timestamp'], format='%Y.%m.%d'))
    .set_index('timestamp')
    # guarantee chronological order for everything that consumes this table
    .sort_index()
)

In [None]:
# Spot-check five randomly drawn rows of the release table.
df_gwashist.sample(5)

### Plot catalog size over time

In [None]:
# Number of rows (associations) in each release table.
df_gwashist['size'] = df_gwashist['dataframe'].map(len)

In [None]:
# Plot the number of catalog entries per release and save the figure as PDF.
fig, ax = plt.subplots()

df_gwashist['size'].plot(ax=ax)
ax.set_xlabel('Release date')
ax.set_ylabel('GWAS-Catalog size [#entries]')

fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists
os.makedirs('images', exist_ok=True)
fig.savefig('images/gwas_history.pdf')