In [None]:
%matplotlib inline

In [None]:
import os
import sys
import gzip
import json
import urllib
import tempfile
import collections

import numpy as np
import pandas as pd
import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup

from tqdm import tqdm_notebook as tqdm

from utils import load_config, split_df_row
from tad_helper_functions import parse_tad_annotations

In [None]:
# The notebook-widget progress bar is already bound to `tqdm` in the import
# cell, so the plain tqdm class is imported under its own alias here.
# `tqdm_orig.pandas()` registers tqdm's pandas integration, which supplies the
# `progress_apply` method used in the TAD annotation step below.
from tqdm import tqdm as tqdm_orig
tqdm_orig.pandas()

# Show up to 99 columns when displaying wide data frames.
pd.set_option('display.max_columns', 99)

In [None]:
# Use seaborn's 'talk' plotting context for every figure in this notebook.
sns.set_context('talk')

In [None]:
# Project configuration, loaded through the helper imported from `utils`.
config = load_config()

# Directory holding cached intermediates (ontology edge lists, SNP annotations).
cache_dir = config['output_dirs']['cache']

# Load data

In [None]:
# Read only the needed columns of the raw DisGeNET table and tag every
# disease identifier with the fixed id type 'UMLS_CUI'.
df_disgenet = (
    pd.read_table(
        config['input_files']['raw_disgenet'],
        usecols=['snpId', 'diseaseId', 'diseaseName', 'source'],
    )
    .assign(diseaseIdType='UMLS_CUI')
)

In [None]:
df_disgenet.head()

In [None]:
# Output directory for result tables, and the TAD table (file name suggests
# hg38 coordinates) that is read in the TAD section further below.
results_dir = config['output_dirs']['results']
tad_data_fname = f'{results_dir}/tads_hg38.tsv'

# Integrate latest GWAS-catalog version

Column definitions: https://www.ebi.ac.uk/gwas/docs/methods

In [None]:
# Read the raw GWAS catalog, keep only rows that carry both a SNP id and a
# mapped trait, and rename those columns to the schema used in this notebook.
df_gwascat = (
    pd.read_table(config['input_files']['raw_gwascatalog'], low_memory=False)
    .dropna(subset=['SNP_ID_CURRENT', 'MAPPED_TRAIT_URI', 'MAPPED_TRAIT'])
    .rename(columns={
        'SNP_ID_CURRENT': 'snpId',
        'MAPPED_TRAIT_URI': 'diseaseId',
        'MAPPED_TRAIT': 'diseaseName',
    })
)

# Prefix the catalog's SNP ids with 'rs' and record where the rows come from.
df_gwascat['snpId'] = df_gwascat['snpId'].apply(lambda snp: f'rs{snp}')
df_gwascat['source'] = 'GWASCUSTOM'

# A row may list several comma-separated trait URIs; `split_df_row` is
# presumably expanding them to one row per URI (verify against `utils`).
# The id is the last path segment of the URI, its type the text before the
# first underscore.
df_gwascat = split_df_row(df_gwascat, 'diseaseId', ',')
df_gwascat['diseaseId'] = df_gwascat['diseaseId'].apply(lambda uri: uri.split('/')[-1])
df_gwascat['diseaseIdType'] = df_gwascat['diseaseId'].apply(lambda term: term.split('_')[0])

# convert BETA to odds ratio: values below 1 are exponentiated, all other
# values (including NaN) are kept unchanged
df_gwascat['odds_ratio'] = df_gwascat['OR or BETA'].apply(
    lambda effect: np.exp(effect) if effect < 1 else effect)

# Optional filter: keep only associations above the configured odds ratio.
or_threshold = config['filters']['OR_threshold']
if or_threshold is not None:
    df_gwascat = df_gwascat[df_gwascat['odds_ratio'] > or_threshold]

df_gwascat.sample(5)

In [None]:
# Only the GWAS catalog feeds the combined table; DisGeNET is currently left
# out of the concatenation (see the commented-out list entry).
df = pd.concat([df_gwascat])  #df_disgenet, 
df.sample(5)

## Infer associated gene(s)

Possible columns:
* REPORTED GENE(S): gene reported by author
* MAPPED_GENE: gene(s) mapped to the strongest SNP (if the SNP is intergenic, the upstream and downstream genes are used)
* SNP_GENE_IDS: Entrez Gene ID

In [None]:
df[['REPORTED GENE(S)', 'MAPPED_GENE', 'SNP_GENE_IDS']].sample(5)

In [None]:
# Of the candidate columns listed above, SNP_GENE_IDS is used as the
# gene association.
df['associated_genes'] = df['SNP_GENE_IDS']

# Parse Ontology OWLs

In [None]:
def parse_owl_file(soup, relevant_terms):
    """ Extract labels and requested cross-reference terms from an OWL file

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed OWL/XML document.
    relevant_terms : iterable of str
        Cross-reference prefixes to collect (e.g. ``'UMLS_CUI'``). A
        ``oboInOwl:hasDbXref`` entry is collected when its text starts with
        ``'<term>:'``.

    Returns
    -------
    dict
        Maps each class identifier (last path segment of ``rdf:about``) to
        ``{'label': <str or None>, 'terms': {<term>: [<id>, ...]}}``.

    Raises
    ------
    AssertionError
        If the same class identifier occurs more than once.
    """
    node_owl_data = {}
    for entry in tqdm(soup.find_all('Class')):
        # Classes without `rdf:about` carry no identifier to key on; skip
        # them (same guard as in the EFO label extraction).
        if not entry.has_attr('rdf:about'):
            continue
        doid = entry['rdf:about'].split('/')[-1]

        # get label (None if the class has no rdfs:label)
        lbl_tag = entry.find('rdfs:label')
        lbl = lbl_tag.get_text() if lbl_tag is not None else None

        # get requested terms
        term_map = {term: [] for term in relevant_terms}
        for xref in entry.find_all('oboInOwl:hasDbXref'):
            txt = xref.get_text()
            for term in relevant_terms:
                if txt.startswith(f'{term}:'):
                    # split only at the first colon, so ids that themselves
                    # contain ':' are kept whole
                    idx = txt.split(':', 1)[1]
                    term_map[term].append(idx)

        assert doid not in node_owl_data, doid
        node_owl_data[doid] = {
            'label': lbl,
            'terms': term_map
        }

    return node_owl_data

## DOID

In [None]:
# Parse the DOID OWL file as XML. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('data/doid.owl', encoding='utf-8') as fd:
    soup_doid = BeautifulSoup(fd, 'xml')

In [None]:
# Collect the label and the UMLS_CUI cross-references of every DOID class.
node_owl_data_doid = parse_owl_file(soup_doid, ['UMLS_CUI'])

## Save EFO disease labels

In [None]:
# Parse the EFO OWL file as XML. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('data/efo.owl', encoding='utf-8') as fd:
    soup = BeautifulSoup(fd, 'xml')

In [None]:
# Map every identified EFO class (last path segment of `rdf:about`) to its
# rdfs:label; classes without `rdf:about` are ignored.
efo_lbl_map = {}
for entry in tqdm(soup.find_all('Class')):
    if entry.has_attr('rdf:about'):
        efo = entry['rdf:about'].split('/')[-1]
        lbl = entry.find('rdfs:label').get_text()

        # every identifier may occur only once
        assert efo not in efo_lbl_map
        efo_lbl_map[efo] = lbl

In [None]:
results_dir = config['output_dirs']['results']

# Two-column table (EFO id, label) built from the mapping and written to the
# results directory as CSV without the index.
df_efolabels = pd.DataFrame(list(efo_lbl_map.items()), columns=['EFO', 'label'])
df_efolabels.to_csv(f'{results_dir}/disease_efolabels.csv', index=False)

print(df_efolabels.shape)
df_efolabels.sample(5)

# Disease ontology as tree

## Load data

In [None]:
def load_ontology_network(type_):
    """ Load an ontology ('efo' or 'doid') as a directed graph

    On first use, `data/<type_>.owl` is converted with onto2nx and stored as
    an edge list in the cache directory; later calls read that cached file.
    The edge list is read into a DiGraph whose edges are then reversed, and
    the graph's name is set to `type_`.
    """
    assert type_ in ('efo', 'doid'), f'Invalid type: {type_}'

    fname = f'{cache_dir}/{type_}_graph.edgelist.gz'
    if os.path.exists(fname):
        print('Cached', fname)
    else:
        # imported lazily: only needed when the cache has to be (re)built
        import onto2nx  # https://github.com/cthoyt/onto2nx
        parsed_ontology = onto2nx.parse_owl_rdf(f'data/{type_}.owl')
        nx.write_edgelist(parsed_ontology, fname)

    cached_graph = nx.read_edgelist(fname, create_using=nx.DiGraph())
    graph = cached_graph.reverse()
    graph.name = type_
    return graph

In [None]:
doid_graph = load_ontology_network('doid')
# `nx.info` is not available in NetworkX >= 3.0, so the key figures are
# printed directly; this works with every NetworkX version.
print(f'Name: {doid_graph.name}')
print(f'Number of nodes: {doid_graph.number_of_nodes()}')
print(f'Number of edges: {doid_graph.number_of_edges()}')

In [None]:
efo_graph = load_ontology_network('efo')
# `nx.info` is not available in NetworkX >= 3.0, so the key figures are
# printed directly; this works with every NetworkX version.
print(f'Name: {efo_graph.name}')
print(f'Number of nodes: {efo_graph.number_of_nodes()}')
print(f'Number of edges: {efo_graph.number_of_edges()}')

## Map UMLS_CUI to DOID node

In [None]:
# DOID class id -> list of UMLS CUIs taken from the parsed OWL data.
doid_umls_map = {}
for node, data in tqdm(node_owl_data_doid.items()):
    collected_terms = data['terms']
    # the OWL data must have been parsed for exactly this one term type
    assert set(collected_terms.keys()) == {'UMLS_CUI'}
    doid_umls_map[node] = collected_terms['UMLS_CUI']

## Find cancer subtree

In [None]:
# Root terms of the subtrees of interest.
efo_disease_root = 'EFO_0000408'  # disease subtree (vs traits, ...)
efo_cancer_root = 'EFO_0000311'   # cancer subtree
doid_cancer_root = 'DOID_162'     # cancer subtree

# Disease identifiers that actually occur in the SNP table.
given_diseases = set(df['diseaseId'].unique())

# gather all possible disease-nodes
all_nodes_umls = [umls for doid in doid_graph.nodes() for umls in doid_umls_map[doid]]
all_nodes_efo = [*nx.descendants(efo_graph, efo_disease_root), efo_disease_root]

all_nodes = given_diseases & set(all_nodes_efo).union(all_nodes_umls)

# extract all cancer diseases
cancer_nodes_doid = [*nx.descendants(doid_graph, doid_cancer_root), doid_cancer_root]
cancer_nodes_umls = [umls for doid in cancer_nodes_doid for umls in doid_umls_map[doid]]

cancer_nodes_efo = [*nx.descendants(efo_graph, efo_cancer_root), efo_cancer_root]

cancer_nodes = given_diseases & set(cancer_nodes_efo).union(cancer_nodes_umls)

# every cancer node must also be a known disease node
assert cancer_nodes.issubset(all_nodes)
print(f'#various nodes: {len(cancer_nodes)}/{len(all_nodes)}/{len(given_diseases)}')

In [None]:
# do cancer-classification: one row per disease that is a known disease node,
# flagged by membership in the cancer node set
data_cancer = [
    (disease, disease in cancer_nodes)
    for disease in tqdm(df['diseaseId'].unique())
    if disease in all_nodes
]

df_iscancer = pd.DataFrame(data_cancer, columns=['diseaseId', 'is_cancer'])
df_iscancer.sample(5)

In [None]:
# Write the per-disease cancer flag to the results directory (CSV, no index).
results_dir = config['output_dirs']['results']
df_iscancer.to_csv(f'{results_dir}/disease_cancer_classification.csv', index=False)

# SNP annotations

## Retrieve VEP annotations

In [None]:
def request_annotations(snps):
    """ Request VEP annotations for a batch of variant ids from Ensembl REST

    Parameters
    ----------
    snps : list of str
        Variant identifiers, sent as the ``ids`` field of the JSON body.

    Returns
    -------
    The decoded JSON response, or ``None`` if the server answered with an
    error status (``Response.ok`` is false).
    """
    # Use the HTTPS endpoint instead of sending the request in plaintext.
    _url = 'https://rest.ensembl.org/vep/human/id'
    headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}

    r = requests.post(_url, headers=headers, data=json.dumps({'ids': snps}))
    return r.json() if r.ok else None

In [None]:
# warning: in case of update, cache-file must be deleted manually
fname = f'{cache_dir}/snp_annotations.json'

if os.path.exists(f'{fname}.gz'):
    print('Cached', f'{fname}.gz')
    with gzip.open(f'{fname}.gz') as fd:
        snp_anno_data = json.load(fd)
else:
    # setup
    snp_anno_data = []

    batch_size = 200
    snp_list = df['snpId'].unique().tolist()

    # request annotations batch by batch; the context manager closes the
    # progress bar even if a request fails
    with tqdm(total=len(snp_list)) as pbar:
        for start in range(0, len(snp_list), batch_size):
            cur_snps = snp_list[start:start + batch_size]

            res = request_annotations(cur_snps)
            assert res is not None, f'VEP request failed for batch starting at index {start}'
            snp_anno_data.extend(res)

            # advance by the real batch length (the last batch may be shorter)
            pbar.update(len(cur_snps))

    # cache results: write the gzipped JSON with the standard library (no
    # external `gzip` binary needed) into a temporary file first, so that an
    # interrupted write never leaves a truncated cache file behind
    tmp_fname = f'{fname}.gz.tmp'
    with gzip.open(tmp_fname, 'wt', encoding='utf-8') as fd:
        json.dump(snp_anno_data, fd)
    os.replace(tmp_fname, f'{fname}.gz')

In [None]:
# Keep four fields of every VEP record: id, most severe consequence,
# sequence region name and start coordinate.
snp_anno_extract = [
    (anno['id'], anno['most_severe_consequence'], anno['seq_region_name'], anno['start'])
    for anno in snp_anno_data
]

# One row per SNP: for repeated ids only the first record is kept.
df_anno = pd.DataFrame(
    snp_anno_extract,
    columns=['snpId', 'variant_type', 'chromosome', 'position'],
).drop_duplicates('snpId')
df_anno.sample(5)

In [None]:
# Coverage check: how many SNPs of the database received an annotation.
print('#SNPs in database:', df.snpId.unique().size)
print('#annotated SNPs:', df_anno.snpId.size)
# intersect the database ids with the *annotated* ids (previously the
# database ids were intersected with themselves)
print('#intersection:', len(set(df.snpId.tolist()) & set(df_anno.snpId.tolist())))

# Infer TAD relations

## Load SNP positions

In [None]:
# Independent copy of the positional columns; the TAD columns are added to
# this frame below, leaving `df_anno` untouched.
df_snppos = df_anno[['snpId', 'chromosome', 'position']].copy()
df_snppos.sample(5)

## Load TAD data

In [None]:
# Raw TAD table, loaded here for inspection.
df_tads = pd.read_table(tad_data_fname)

In [None]:
df_tads.head()

## Assign TAD annotations to SNP positions

In [None]:
def access_range_dict(row, dict_):
    """ Look up the annotation for a row's chromosome and position

    `dict_` maps a chromosome to a position-indexable object. Returns
    'undef' when the row's chromosome is missing from `dict_` (or mapped to
    None); otherwise returns that object indexed with the row's position.
    """
    chrom_ranges = dict_.get(row['chromosome'])
    return 'undef' if chrom_ranges is None else chrom_ranges[row['position']]

In [None]:
# Annotation variants understood by `parse_tad_annotations`; each one yields
# a `TAD_<variant>` column, added in this order.
tad_border_types = ['20in', '40in', '20out', '40out', '20inout', '40inout']

# One pass per variant instead of six copy-pasted stanzas: parse the
# annotation and look up every SNP position in it.
tad_annotations = {}
for border_type in tad_border_types:
    tad_anno = parse_tad_annotations(border_type, fname=tad_data_fname)
    tad_annotations[border_type] = tad_anno

    df_snppos[f'TAD_{border_type}'] = df_snppos.progress_apply(
        lambda row: access_range_dict(row, tad_anno), axis=1)

In [None]:
# Keep the SNP id and the TAD columns; the coordinates are dropped here.
df_snptads = df_snppos.drop(columns=['chromosome', 'position'])
df_snptads.sample(5)

# Merge into DisGeNET

In [None]:
# individual databases
# (shapes of the two source tables; note that only the GWAS catalog is part
# of `df`, the table that is aggregated below)
print('DisGeNET only:', df_disgenet.shape)
print('Most recent GWAS-catalog: ', df_gwascat.shape)

In [None]:
# initial aggregation
# (work on a copy so that `df` itself stays unchanged by the merges below)
df_final = df.copy()
df_final.shape

In [None]:
# cancer-classification
# NOTE: `merge` defaults to an inner join, so rows whose diseaseId is absent
# from `df_iscancer` are dropped here.
df_final = df_final.merge(df_iscancer, on='diseaseId')
df_final.shape

In [None]:
# TAD localization
# NOTE: `merge` defaults to an inner join, so SNPs without an entry in
# `df_snptads` are dropped here.
df_final = df_final.merge(df_snptads, on='snpId')
df_final.shape

In [None]:
# SNP annotation
# The join key is named explicitly (as in the merges above) instead of
# relying on whichever column names the two frames happen to share.
df_final = df_final.merge(df_anno, how='left', on='snpId')
df_final.shape

# Subset SNP-sets

In [None]:
# Count every SNP once per variant type: de-duplicate the (snpId,
# variant_type) pairs first, then tabulate the variant types.
unique_snp_variants = df_final.drop_duplicates(subset=['snpId', 'variant_type'])
variant_type_counts = (
    unique_snp_variants['variant_type']
    .value_counts()
    .rename('count')
    .reset_index()
    .rename(columns={'index': 'variant_type'})
)
variant_type_counts

In [None]:
# TODO: parse ontology (http://www.sequenceontology.org/browser/obob.cgi) properly
# Hand-curated groups of variant types, matched against the `variant_type`
# column by the filter below.
exon_type = [
    'missense_variant', 'non_coding_transcript_exon_variant',
    '3_prime_UTR_variant', 'synonymous_variant', '5_prime_UTR_variant'
]
intron_type = ['intron_variant']
intergenic_variant = ['intergenic_variant']

In [None]:
# Apply the configured variant-type filter; any other setting leaves the
# table unfiltered.
variant_filter = config['filters']['variant_type']

if variant_filter == 'nonexonic':
    # remove exonic SNPs
    is_exonic = df_final['variant_type'].isin(exon_type)
    df_final = df_final[~is_exonic]
elif variant_filter == 'intergenic':
    # keep only intergenic SNPs
    is_intergenic = df_final['variant_type'].isin(intergenic_variant)
    df_final = df_final[is_intergenic]

df_final.shape

# Save result

In [None]:
# Write the enriched SNP table as a tab-separated file (no index column).
results_dir = config['output_dirs']['results']
df_final.to_csv(f'{results_dir}/snpdb_enhanced.tsv', sep='\t', index=False)
df_final.sample(5)

# Plot database statistics

In [None]:
# Directory into which the figures below are saved.
images_dir = config['output_dirs']['images']

## Number of entries per disease

In [None]:
# Number of rows per disease, sorted by count and joined with the cancer
# flag so the two groups can be compared in the plot below.
disease_counts = (df_final['diseaseId']
                  .value_counts()
                  .rename('count')
                  .reset_index()
                  .rename(columns={'index': 'diseaseId'})
                  .sort_values('count')
                  .merge(df_iscancer, on='diseaseId'))

disease_counts.sample(5)

In [None]:
# Distribution of per-disease row counts, split by cancer flag.
sns.boxplot(x='is_cancer', y='count', data=disease_counts)

plt.title('#rows associated with single diseases')
# log scale on the count axis
plt.yscale('log')

plt.tight_layout()
plt.savefig(f'{images_dir}/disease_count_distribution.pdf')

## Odds ratio distribution

In [None]:
df_final['odds_ratio'].describe()

In [None]:
# Only values strictly below the 75% quantile are plotted, i.e. the upper
# quarter of the odds ratios is excluded from this figure.
odds_ratio = df_final['odds_ratio'].dropna()
sns.boxplot(odds_ratio[odds_ratio < odds_ratio.quantile(.75)], orient='v')

plt.title('Odds ratios (< 75% quantile) for all diseases')

plt.tight_layout()
plt.savefig(f'{images_dir}/oddsratio_distribution.pdf')

## Variant types

In [None]:
# Horizontal bar chart of the variant type counts. `variant_type_counts` was
# computed before the variant-type filter was applied, so the bars show the
# unfiltered table.
plt.figure(figsize=(16,8))
sns.barplot(
    x='count', y='variant_type',
    data=variant_type_counts, orient='h')

plt.title('#variant_type in database')

plt.tight_layout()
plt.savefig(f'{images_dir}/variant_type_counts.pdf')