In [None]:
import os
import sys
import urllib
import tempfile

import pandas as pd
import networkx as nx

from bs4 import BeautifulSoup

from tqdm import tqdm_notebook as tqdm

## Load data

In [None]:
# Read only the columns used downstream from the curated variant-disease file,
# and expose the disease identifier under the name used for the later join
# ('UMLS_CUI').
df = pd.read_table(
    'data/curated_variant_disease_associations.tsv.gz',
    usecols=['snpId','diseaseId','diseaseName','source'],
).rename(columns={'diseaseId': 'UMLS_CUI'})

In [None]:
# Preview the first rows of the table loaded from the curated associations file.
df.head()

## Disease ontology

### Load data

In [None]:
# One-off generation of the cached edge list, kept for provenance
# (onto2nx is not imported in this notebook):
#g = onto2nx.parse_owl_rdf('data/doid.owl')
#nx.write_edgelist(g, 'results/doid_graph.edgelist.gz')

# Read the cached edges as a directed graph, then flip every edge direction.
# NOTE(review): presumably the stored edges point child -> parent, so that the
# reversed graph points parent -> child -- confirm against the edge list.
doid_graph = nx.read_edgelist(
    'results/doid_graph.edgelist.gz', create_using=nx.DiGraph()).reverse()

# nx.info() was removed in NetworkX 3.0; build the summary from graph methods
# that are available in every NetworkX release instead.
print('{} with {} nodes and {} edges'.format(
    type(doid_graph).__name__,
    doid_graph.number_of_nodes(),
    doid_graph.number_of_edges()))

In [None]:
# Parse the ontology OWL file into a BeautifulSoup XML tree.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default text encoding.
# NOTE(review): assumes data/doid.owl is UTF-8 (the XML default) -- confirm.
with open('data/doid.owl', encoding='utf-8') as fd:
    soup = BeautifulSoup(fd, 'xml')

In [None]:
# For every identifiable Class element in the OWL tree, record its label and
# its UMLS CUI cross-references, keyed by the last path segment of the
# element's rdf:about value. The result is attached to the graph nodes.
node_owl_data = {}

for entry in tqdm(soup.find_all('Class')):
    # find_all() matches Class elements at any depth. Elements without an
    # rdf:about attribute (e.g. anonymous class expressions) have no
    # identifier to key on, so skip them instead of raising KeyError.
    about = entry.get('rdf:about')
    if about is None:
        continue
    doid = about.split('/')[-1]

    # get label
    lbl = entry.find('rdfs:label').get_text()

    # get UMLS_CUI terms: keep only the part after the last ':' of every
    # cross-reference whose text starts with the 'UMLS_CUI:' prefix
    xref_texts = (xref.get_text() for xref in entry.find_all('oboInOwl:hasDbXref'))
    terms = [txt.split(':')[-1] for txt in xref_texts if txt.startswith('UMLS_CUI:')]

    # every identifier must be described exactly once
    assert doid not in node_owl_data, 'duplicate class entry: {}'.format(doid)
    node_owl_data[doid] = {
        'label': lbl,
        'UMLS_CUI': terms
    }

# store the collected 'label' / 'UMLS_CUI' values as node attributes
nx.set_node_attributes(doid_graph, node_owl_data)

In [None]:
# check out exemplary node (cancer)
# Snapshot the attribute dict of every graph node, keyed by node id.
doid_data = {node_id: attrs for node_id, attrs in doid_graph.nodes(data=True)}

# Inspect one example entry (the node the author identifies as cancer).
doid_data['DOID_162']

In [None]:
# Flatten the node attributes into one (node id, label, CUI) row per
# cross-reference; nodes with an empty 'UMLS_CUI' list contribute no rows.
data_cui = [
    (doid, attrs['label'], cui)
    for doid, attrs in tqdm(doid_data.items())
    for cui in attrs['UMLS_CUI']
]

df_cui = pd.DataFrame(data_cui, columns=['DOID','DO_label','UMLS_CUI'])
df_cui.head()

### Find cancer subtree

In [None]:
# Flag every graph node as belonging to the cancer subtree or not.
cancer_root = 'DOID_162'

# nx.descendants() returns the nodes reachable from the source but not the
# source itself, so the root is added explicitly; otherwise the root node
# would end up flagged is_cancer=False.
cancer_nodes = nx.descendants(doid_graph, cancer_root) | {cancer_root}

# Cancer nodes first, then every remaining node of the graph.
data_cancer = [(n, True) for n in cancer_nodes]
data_cancer += [(n, False) for n in (doid_graph.nodes() - cancer_nodes)]

df_iscancer = pd.DataFrame(data_cancer, columns=['DOID','is_cancer'])
df_iscancer.head()

### Merge data sources

In [None]:
# Compare the number of distinct node ids in each table before merging.
n_graph_nodes = len(doid_data)
n_nodes_with_cui = len(df_cui['DOID'].unique())
n_flagged_nodes = len(df_iscancer['DOID'].unique())

print('Nodes in doid.owl:', n_graph_nodes)
print('Nodes with UMLS_CUI:', n_nodes_with_cui)
print('(Non)cancer nodes (should be all):', n_flagged_nodes)

In [None]:
# Attach the is_cancer flag to every (DOID, CUI) row; with the default join
# type only DOIDs present in both tables are kept.
df_onto = pd.merge(df_cui, df_iscancer, on='DOID')

print(df_onto.shape)
df_onto.head()

## Load SNP positions

In [None]:
# Read the SNP id / chromosome / position columns and keep each distinct
# combination once.
df_snppos = pd.read_table(
    'data/all_variant_disease_pmid_associations.tsv.gz',
    usecols=['snpId','chromosome','position'],
).drop_duplicates()

In [None]:
# Preview the first rows of the de-duplicated SNP position table.
df_snppos.head()

## Merge into DisGeNET

In [None]:
# Start the enriched table from an independent copy of the associations table,
# and show the starting shape for comparison with the merges that follow.
df_final = df.copy(deep=True)
df.shape

In [None]:
# Add the ontology columns by joining on the shared 'UMLS_CUI' column.
# df_final is rebound from itself, so re-running this cell merges df_onto
# a second time; run it exactly once after df_final has been (re)created.
df_final = pd.merge(df_final, df_onto, on='UMLS_CUI')
df_final.shape

In [None]:
# Add chromosome and position by joining on the shared 'snpId' column.
# df_final is rebound from itself, so re-running this cell merges df_snppos
# a second time; run it exactly once per fresh df_final.
df_final = pd.merge(df_final, df_snppos, on='snpId')
df_final.shape

In [None]:
# Write the enriched table as a tab-separated file without the index column,
# then preview the first rows of what was written.
output_path = 'results/disgenet_enhanced.tsv'
df_final.to_csv(output_path, sep='\t', index=False)
df_final.head()