In [None]:
import os
import sys
import gzip
import json
import time
import urllib
import tempfile
import collections

import numpy as np
import pandas as pd
import networkx as nx

import requests
from bs4 import BeautifulSoup

from tqdm import tqdm_notebook as tqdm

from utils import load_config
from tad_helper_functions import *

In [None]:
# Register tqdm's `progress_apply` on pandas objects; it is used for the
# row-wise TAD lookups further down.
from tqdm import tqdm as tqdm_orig
tqdm_orig.pandas()

In [None]:
# Project configuration; its 'input_files' section supplies the data paths read below.
config = load_config()

## Load data

In [None]:
# Read only the SNP/disease columns that are needed downstream and expose the
# disease identifier under the name `UMLS_CUI` (the join key used later on).
disgenet_columns = ['snpId','diseaseId','diseaseName','source']
df = pd.read_table(
    config['input_files']['raw_disgenet'], usecols=disgenet_columns
).rename(columns={'diseaseId': 'UMLS_CUI'})

In [None]:
# Preview the loaded SNP-disease associations.
df.head()

## Decide whether to use hg19 or hg38

In [None]:
# Selects the genome build: it decides which TAD file is read, how
# `snp_position_convert` is defined, and the suffix of the output file name.
# Restart kernel and rerun all cells after changing this cell
USE_HG38 = True

In [None]:
if not USE_HG38:
    genome_version = 'hg19'
    tad_data_fname = config['input_files']['tad_coordinates_hg19']

    # Build a lookup: SNP id -> (chromosome name minus its first three characters, 'end' column)
    df_snp_pos_map = pd.read_csv('results/snp_positions_hg19.csv')
    df_snp_pos_map['chrom'] = df_snp_pos_map['chrom'].map(lambda name: name[3:])
    df_snp_pos_map['pos'] = list(zip(df_snp_pos_map['chrom'], df_snp_pos_map['end']))
    snp_pos_hg19 = df_snp_pos_map.set_index('SNPS')['pos'].to_dict()

    def snp_position_convert(row):
        """Return the hg19 position stored for `row.snpId`, or NaN if the SNP is not in the lookup.

        Asserts that the chromosome in the lookup equals `row.chromosome`.
        """
        hg19_entry = snp_pos_hg19.get(row.snpId)
        if hg19_entry is None:
            return np.nan

        chrom, pos_hg19 = hg19_entry
        assert row.chromosome == chrom
        return pos_hg19
else:
    genome_version = 'hg38'
    tad_data_fname = 'results/tads_hESC_hg38.tsv'

    def snp_position_convert(row):
        """Return `row.position` unchanged (positions are treated as hg38 already)."""
        return row.position

## Parse Disease-Ontology OWL

In [None]:
# Parse the Disease Ontology OWL file with BeautifulSoup's XML parser.
with open('data/doid.owl') as fd:
    soup = BeautifulSoup(fd, 'xml')

In [None]:
# Collect, for every ontology class, its label and its UMLS_CUI / EFO cross-references.
node_owl_data = {}

for entry in tqdm(soup.find_all('Class')):
    # the identifier is the last path segment of the class URI
    doid = entry['rdf:about'].rsplit('/', 1)[-1]

    lbl = entry.find('rdfs:label').get_text()

    # sort the cross-references into one bucket per recognised prefix
    xrefs_by_prefix = {'UMLS_CUI:': [], 'EFO:': []}
    for xref in entry.find_all('oboInOwl:hasDbXref'):
        txt = xref.get_text()
        for prefix, bucket in xrefs_by_prefix.items():
            if txt.startswith(prefix):
                # keep only the part after the last colon
                bucket.append(txt.rsplit(':', 1)[-1])
                break

    # every class identifier is expected to occur only once
    assert doid not in node_owl_data
    node_owl_data[doid] = {
        'label': lbl,
        'UMLS_CUI': xrefs_by_prefix['UMLS_CUI:'],
        'EFO': xrefs_by_prefix['EFO:'],
    }

## Integrate latest GWAS-catalog version

In [None]:
def _format_rsid(raw_id):
    """Prefix a GWAS-catalog SNP id with 'rs'.

    Integer-valued floats (which pandas produces when a numeric column contains
    missing values) are rendered without the trailing '.0', so that e.g. 123.0
    becomes 'rs123' instead of 'rs123.0'. All other values are formatted as-is.
    """
    if isinstance(raw_id, float) and raw_id.is_integer():
        raw_id = int(raw_id)
    return f'rs{raw_id}'


# Load the GWAS catalog and bring it into the column layout of `df`
# (snpId, diseaseName, source); the EFO column is mapped to UMLS_CUI afterwards.
df_gwascat = pd.read_table('data/gwas_catalog_v1.0.1-associations_e91_r2018-03-13.tsv', low_memory=False)

# dropna()/rename() return new frames, so nothing is modified in place on a column slice
df_gwascat = (
    df_gwascat[['SNP_ID_CURRENT', 'MAPPED_TRAIT_URI', 'MAPPED_TRAIT']]
    .dropna()
    .rename(columns={
        'SNP_ID_CURRENT': 'snpId', 'MAPPED_TRAIT_URI': 'EFO', 'MAPPED_TRAIT': 'diseaseName'
    })
)

df_gwascat['snpId'] = df_gwascat['snpId'].apply(_format_rsid)
df_gwascat['source'] = 'GWASCUSTOM'

In [None]:
# map EFO to UMLS_CUI
efo2cui_map = {}
for entry in node_owl_data.values():
    for efo in entry['EFO']:
        efo_key = f'EFO_{efo}'
        # The duplicate check has to look at the key that is actually stored
        # (the 'EFO_'-prefixed one); testing the bare id could never fail.
        assert efo_key not in efo2cui_map, efo
        for cui in entry['UMLS_CUI']:
            # if an entry lists several CUIs, the last one is kept
            efo2cui_map[efo_key] = cui

# The EFO column may hold several comma-separated URIs. x[25:] drops the first
# 25 characters of each URI (presumably the 'http://www.ebi.ac.uk/efo/' prefix,
# which is 25 characters long — confirm against the catalog file).
# Unmapped URIs become the string 'nan'; the lexicographically smallest
# candidate of each row is kept.
df_gwascat['UMLS_CUI'] = df_gwascat['EFO'].str.split(' *, *').apply(
    lambda xs: sorted([efo2cui_map.get(x[25:], str(np.nan)) for x in xs])[0])

# keep mapped rows only; drop() returns a new frame instead of mutating a filtered slice
df_gwascat = df_gwascat[df_gwascat['UMLS_CUI'] != 'nan'].drop('EFO', axis=1)

In [None]:
# Spot-check a few GWAS-catalog rows after the EFO -> UMLS_CUI mapping.
df_gwascat.sample(5)

In [None]:
# Append the GWAS-catalog associations to the DisGeNET associations.
# NOTE(review): this rebinds `df`, so running the cell twice appends the
# catalog rows twice; the original index values are kept (not reset).
df = pd.concat([df, df_gwascat])

## Disease ontology

### Load data

In [None]:
fname = 'cache/doid_graph.edgelist.gz'

# Convert the OWL file into an edge-list cache only when the cache is missing.
if os.path.exists(fname):
    print('Cached', fname)
else:
    import onto2nx
    nx.write_edgelist(onto2nx.parse_owl_rdf('data/doid.owl'), fname)

# Read the cached edges as a directed graph and flip the direction of every edge.
doid_graph = nx.read_edgelist(
    fname, create_using=nx.DiGraph()
).reverse()
print(nx.info(doid_graph))

In [None]:
# enhance graph with associations: attach the label / UMLS_CUI / EFO
# dictionaries collected from the OWL file to the graph nodes
nx.set_node_attributes(doid_graph, node_owl_data)

In [None]:
# check out exemplary node (cancer)
# `doid_data` maps node id -> attribute dict and is reused by the following cells.
doid_data = dict(doid_graph.nodes(data=True))

doid_data['DOID_162']

In [None]:
# One row per (ontology node, UMLS_CUI) pair, carrying the node label along.
data_cui = [
    (node, data['label'], term)
    for node, data in tqdm(doid_data.items())
    for term in data['UMLS_CUI']
]

df_cui = pd.DataFrame(data_cui, columns=['DOID','DO_label','UMLS_CUI'])
df_cui.head()

### Find cancer subtree

In [None]:
# Flag every ontology node as cancer / non-cancer, based on the subtree rooted at DOID_162.
cancer_root = 'DOID_162'

# nx.descendants() does not contain the start node itself, so the root has to be
# added explicitly before computing the complement. Otherwise it would be listed
# once as cancer and once as non-cancer.
cancer_nodes = nx.descendants(doid_graph, cancer_root)
cancer_subtree = cancer_nodes | {cancer_root}

# root first, then its descendants, then all remaining nodes; every node appears exactly once
data_cancer = [(cancer_root, True)]
data_cancer.extend((n, True) for n in cancer_nodes)
data_cancer.extend((n, False) for n in (doid_graph.nodes() - cancer_subtree))

df_iscancer = pd.DataFrame(data_cancer, columns=['DOID','is_cancer'])
assert df_iscancer['DOID'].is_unique, 'each DOID must carry exactly one cancer flag'
df_iscancer.head()

### Merge data sources

In [None]:
# Sanity-check the node counts of both tables before merging them.
print('Nodes in doid.owl:', len(doid_data))
print('Nodes with UMLS_CUI:', df_cui.DOID.unique().size)
print('(Non)cancer nodes (should be all):', df_iscancer.DOID.unique().size)

In [None]:
# Attach the cancer flag to every (DOID, UMLS_CUI) row. merge() defaults to an
# inner join, so only DOIDs present in both tables are kept.
df_onto = df_cui.merge(df_iscancer, on='DOID')

print(df_onto.shape)
df_onto.head()

In [None]:
# save disease cancer-classification
# (columns: term = UMLS_CUI, type = 'cancer' or 'disease', label = ontology label)
tmp = df_onto[['UMLS_CUI','is_cancer','DO_label']].rename(
    columns={'UMLS_CUI': 'term', 'is_cancer': 'type', 'DO_label': 'label'})
tmp['type'] = ['cancer' if flag else 'disease' for flag in tmp['type']]
tmp.to_csv('results/disease_terms.csv', index=False)

## SNP annotations

### Retrieve VEP annotations

In [None]:
def request_annotations(snps, max_retries=3):
    """Fetch VEP annotations for a batch of SNP ids from the Ensembl REST API.

    Parameters
    ----------
    snps : list of str
        Variant identifiers, sent as the 'ids' field of the JSON request body.
    max_retries : int, optional
        How many additional attempts are made when the server answers with
        HTTP 429 (rate limit exceeded). Other failures are not retried.

    Returns
    -------
    The decoded JSON response, or None if the request did not succeed.
    """
    _url = 'http://rest.ensembl.org/vep/human/id'
    headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
    payload = json.dumps({'ids': snps})

    for attempt in range(max_retries + 1):
        r = requests.post(_url, headers=headers, data=payload)
        if r.ok:
            return r.json()

        # only a rate-limit response is worth another attempt
        if r.status_code != 429 or attempt == max_retries:
            return None

        # wait as long as the server asks for; fall back to one second
        try:
            delay = float(r.headers.get('Retry-After', 1))
        except ValueError:
            delay = 1.0
        time.sleep(delay)

    return None

In [None]:
# warning: in case of update, cache-file must be deleted manually
fname = 'cache/snp_annotations.json'
fname_gz = f'{fname}.gz'

if os.path.exists(fname_gz):
    print('Cached', fname_gz)
    with gzip.open(fname_gz) as fd:
        snp_anno_data = json.load(fd)
else:
    # setup
    snp_anno_data = []

    batch_size = 200
    snp_list = df['snpId'].unique().tolist()

    # request annotations, one batch of at most `batch_size` ids per call
    for start in tqdm(range(0, len(snp_list), batch_size)):
        cur_snps = snp_list[start:start + batch_size]

        res = request_annotations(cur_snps)
        assert res is not None, f'VEP request failed for SNPs {start}..{start + len(cur_snps) - 1}'
        snp_anno_data.extend(res)

    # cache results: write the gzip file directly from Python instead of
    # shelling out to an external `gzip` binary
    with gzip.open(fname_gz, 'wt', encoding='utf-8') as fd:
        json.dump(snp_anno_data, fd)

In [None]:
# Keep id, most severe consequence, sequence region name and start of every VEP record.
anno_columns = ['snpId', 'variant_type', 'chromosome', 'position']
snp_anno_extract = [
    (rec['id'], rec['most_severe_consequence'], rec['seq_region_name'], rec['start'])
    for rec in snp_anno_data
]

# one row per SNP id: only the first record of each id is kept
df_anno = pd.DataFrame(snp_anno_extract, columns=anno_columns).drop_duplicates('snpId')
df_anno.sample(5)

## Infer TAD relations

### Load SNP positions

In [None]:
# Positions used for the TAD lookup are taken as-is from the VEP-derived `df_anno`.
# NOTE(review): `snp_position_convert` (defined in the genome-version cell) is not
# applied in any visible cell — confirm that no position conversion is intended
# when USE_HG38 is False.
df_snppos = df_anno[['snpId', 'chromosome', 'position']].copy()
df_snppos.sample(5)

### Load TAD data

In [None]:
# Load the TAD table that was selected together with the genome version.
df_tads = pd.read_table(tad_data_fname)

In [None]:
# Preview the TAD table.
df_tads.head()

### Look up TAD annotations for each SNP

In [None]:
def access_range_dict(row, dict_):
    """Look up the entry stored for a row's position on the row's chromosome.

    `dict_` maps a chromosome to a per-chromosome lookup that is indexed by
    position. Returns 'undef' when `dict_` has no (non-None) lookup for
    `row['chromosome']`; otherwise returns that lookup's entry for
    `row['position']`.
    """
    chrom_ranges = dict_.get(row['chromosome'])
    return 'undef' if chrom_ranges is None else chrom_ranges[row['position']]

In [None]:
def _annotate_tad_variant(variant):
    """Parse the TAD annotations for `variant` and store the per-SNP lookup result
    in the `TAD_<variant>` column of `df_snppos`.

    Returns the parsed annotation object so it stays available under its own name.
    """
    tad_anno = parse_tad_annotations(variant, fname=tad_data_fname)
    df_snppos[f'TAD_{variant}'] = df_snppos.progress_apply(
        lambda x: access_range_dict(x, tad_anno), axis=1)
    return tad_anno


# same procedure for every annotation variant; the order fixes the column order
tad_anno_20in = _annotate_tad_variant('20in')
tad_anno_40in = _annotate_tad_variant('40in')
tad_anno_20out = _annotate_tad_variant('20out')
tad_anno_40out = _annotate_tad_variant('40out')
tad_anno_20inout = _annotate_tad_variant('20inout')
tad_anno_40inout = _annotate_tad_variant('40inout')

In [None]:
# Keep only the SNP id and the TAD_* columns; chromosome and position are
# merged back in later from `df_anno`.
df_snptads = df_snppos.drop(['chromosome', 'position'], axis=1)
df_snptads.sample(5)

## Merge into DisGeNET

In [None]:
# Start from a copy of the combined association table; the shape is shown
# after every step to track how many rows each merge keeps.
df_final = df.copy()
df_final.shape

In [None]:
# Inner join (merge default): keeps only associations whose UMLS_CUI occurs in the ontology table.
# NOTE(review): rebinds `df_final`; re-running this cell alone merges a second time.
df_final = df_final.merge(df_onto, on='UMLS_CUI')
df_final.shape

In [None]:
# Inner join (merge default): keeps only SNPs that have a row in `df_snptads`.
df_final = df_final.merge(df_snptads, on='snpId')
df_final.shape

In [None]:
# Add variant type, chromosome and position. The join key is spelled out so the
# merge does not silently switch to other shared columns if the inputs change;
# `snpId` is the only column the two frames have in common here.
df_final = df_final.merge(df_anno, how='left', on='snpId')
df_final.shape

In [None]:
# Replace the source disease name with the ontology label and drop the DOID
# column, then write the table for the selected genome version.
df_final_sub = (
    df_final
    .drop(['diseaseName', 'DOID'], axis=1)
    .rename(columns={'DO_label': 'diseaseName'})
)

output_fname = f'results/disgenet_enhanced_{genome_version}.tsv'
df_final_sub.to_csv(output_fname, sep='\t', index=False)
df_final_sub.head()