# Data
In this notebook I'll be downloading the data and putting it into a PyTorch Geometric Data object.

In [1]:
import os
import requests
import gzip
import numpy as np
from collections import defaultdict
import pickle as pkl
import re
import string
import time

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

import pandas as pd
import networkx as nx

import pronto

from Bio import Entrez
from pyumls import api

## Download network data

In [2]:
# Remote sources, in the order the rest of the notebook indexes them:
#   0: disease-gene associations, 1: gene-gene network (HumanNet),
#   2: disease-disease associations, 3: Disease Ontology (OBO format)
urls = [
    "http://snap.stanford.edu/biodata/datasets/10012/files/DG-AssocMiner_miner-disease-gene.tsv.gz",
    "https://www.inetbio.org/humannet/networks/HumanNet-XN.tsv",
    "http://snap.stanford.edu/biodata/datasets/10006/files/DD-Miner_miner-disease-disease.tsv.gz",
    "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/master/src/ontology/doid.obo",
]

# local path of each download: the last path segment of the URL, placed under data/
fnames = ["data/" + source_url.rsplit("/", 1)[-1] for source_url in urls]

In [3]:
# Download every source file that is not already on disk.
# Each file is checked individually (rather than only checking whether the
# data/ folder exists) so that a pre-existing or partially populated folder
# still gets its missing files.
os.makedirs("data", exist_ok=True)

for url, fname in zip(urls, fnames):
    if os.path.isfile(fname):
        continue  # already downloaded
    r = requests.get(url)
    r.raise_for_status()  # fail loudly instead of saving an HTTP error page as data
    # the file is only created once the request has succeeded
    with open(fname, "wb") as f:
        f.write(r.content)

## Load into dataframes

In [4]:
# Parse the gzipped disease-gene association table into a dataframe with the
# columns disease_id, disease_name and gene_id (one row per association).
edges = defaultdict(list)
with gzip.open(fnames[0], "rb") as f:
    for line_no, raw_line in enumerate(f):
        if line_no == 0:
            continue  # the first line is the header
        fields = raw_line.decode("utf-8").strip().split("\t")
        dis_id, dis_name, gene_id = fields
        edges["disease_id"].append(dis_id)
        edges["disease_name"].append(dis_name.strip('"'))  # names may be wrapped in double quotes
        edges["gene_id"].append(int(gene_id))

disease_gene_edge_df = pd.DataFrame(edges)

In [5]:
# Read the gene-gene association file, keeping only the rows where both genes
# already appear in the disease-gene network.
# The known gene ids are collected into a set once: testing membership against
# the dataframe column inside the loop rescans the whole column for every line.
known_gene_ids = set(disease_gene_edge_df["gene_id"])

edges = defaultdict(list)
with open(fnames[1], "r") as f:
    for idx, line in enumerate(f):
        if idx == 0:
            continue  # the first line is skipped as a header
        g1_id, g2_id, weight = line.strip().split("\t")
        g1_id = int(g1_id)
        g2_id = int(g2_id)
        weight = float(weight)
        if g1_id in known_gene_ids and g2_id in known_gene_ids:
            edges["gene1_id"].append(g1_id)
            edges["gene2_id"].append(g2_id)
            edges["log_likelihood_score"].append(weight)

# naming the columns explicitly keeps them present even when no row passes the filter
gene_gene_edge_df = pd.DataFrame(dict(edges), columns=["gene1_id", "gene2_id", "log_likelihood_score"])

In [6]:
# Disease Ontology: used below to translate DOIDs into their UMLS CUI
# cross-references, so diseases labeled with different ID schemes can be matched.
doid_path = "data/doid.obo"
doid = pronto.Ontology(doid_path)

In [7]:
# Read the disease-disease association file. Diseases are listed by DOID; each
# DOID is translated to its UMLS CUI cross-references, and a link is kept only
# when both CUIs are present in the disease-gene associations.
def _umls_cuis(doid_id):
    """Return the UMLS CUIs cross-referenced by the ontology term `doid_id`."""
    return [xref.id.replace("UMLS_CUI:", "") for xref in doid[doid_id].xrefs if "UMLS_CUI" in xref.id]


# Collected into a set once: testing membership against the dataframe column
# inside the loop rescans the whole column for every cross-reference.
known_disease_ids = set(disease_gene_edge_df["disease_id"])

edges = defaultdict(list)
with gzip.open(fnames[2], "rb") as f:
    for idx, line in enumerate(f):
        if idx == 0:
            headers = line.decode("utf-8").strip().split("\t")  # header row; not used below
            continue
        doid1, doid2 = line.decode("utf-8").strip().split("\t")  # the disease ids (in DOID format)

        # CUI cross references of each DOID that also occur in the disease-gene associations
        doid1_xrefs = [cui for cui in _umls_cuis(doid1) if cui in known_disease_ids]
        doid2_xrefs = [cui for cui in _umls_cuis(doid2) if cui in known_disease_ids]

        # one edge per (CUI of disease 1, CUI of disease 2) combination
        for xref1 in doid1_xrefs:
            for xref2 in doid2_xrefs:
                edges["disease1_id"].append(xref1)
                edges["disease2_id"].append(xref2)

# naming the columns explicitly keeps them present even when no link passes the filter
disease_disease_edge_df = pd.DataFrame(dict(edges), columns=["disease1_id", "disease2_id"])

## Encode node ids
Below I add an integer label for each gene and each disease. These integer labels are the node ids in the graph representation. The gene node ids start where the disease node ids leave off, so the two sets of node ids never overlap.

In [8]:
# Build the mapping from a (disease or gene) identifier to an integer node id.
# Disease ids come first, then gene ids; dict.fromkeys drops repeats while
# keeping first-seen order, so node ids are assigned in that order.
combined_ids = np.append(
    disease_gene_edge_df["disease_id"].values,
    disease_gene_edge_df["gene_id"].values,
)
diseases_and_genes = list(dict.fromkeys(combined_ids))
node_id_mapping = {id_: node_id for node_id, id_ in enumerate(diseases_and_genes)}

In [9]:
# Attach the node id of each disease and each gene to the disease-gene association df.
for id_col, node_col in (("disease_id", "disease_node_id"), ("gene_id", "gene_node_id")):
    disease_gene_edge_df[node_col] = [node_id_mapping[key] for key in disease_gene_edge_df[id_col]]

In [10]:
# Attach the node id of both endpoints to the gene-gene associations.
for id_col, node_col in (("gene1_id", "gene1_node_id"), ("gene2_id", "gene2_node_id")):
    gene_gene_edge_df[node_col] = [node_id_mapping[key] for key in gene_gene_edge_df[id_col]]

In [11]:
# Attach the node id of both endpoints to the disease-disease associations.
for id_col, node_col in (("disease1_id", "disease1_node_id"), ("disease2_id", "disease2_node_id")):
    disease_disease_edge_df[node_col] = [node_id_mapping[key] for key in disease_disease_edge_df[id_col]]

In [12]:
# One row per unique disease / gene. Columns are selected by name rather than
# by position, so the selection does not silently pick the wrong columns if the
# column order of disease_gene_edge_df ever changes.
diseases = disease_gene_edge_df[["disease_id", "disease_name", "disease_node_id"]].drop_duplicates() # df with just the diseases
genes = disease_gene_edge_df[["gene_id", "gene_node_id"]].drop_duplicates() # df with just the genes

diseases.to_pickle("data/diseases_df.pkl")
genes.to_pickle("data/genes_df.pkl")

## Get gene and disease features

**Gene features**

In [13]:
# The Entrez module of Biopython is used below to fetch gene data from NCBI.
# Set your email address here before requesting data from the API:
# Entrez.email =

In [14]:
# Borrowed from the BioPython docs: https://biopython.org/wiki/Annotate_Entrez_Gene_IDs.
def retrieve_annotation(id_list):
    """
    Annotates Entrez Gene IDs using Bio.Entrez, in particular epost (to
    submit the data to NCBI) and esummary to retrieve the information.

    Parameters
    ----------
    id_list : iterable
        Entrez Gene IDs; each one is converted to a string before posting.

    Returns
    -------
    The parsed esummary result, or None if reading the epost response
    raised a RuntimeError (the error is printed in that case).
    """

    id_list = list(map(str, id_list))

    request = Entrez.epost("gene", id=",".join(id_list))
    try:
        result = Entrez.read(request)
    except RuntimeError as e:
        print("An error occurred while retrieving the annotations.")
        print("The error returned was %s" % e)
        return None
    finally:
        # release the network handle whether or not parsing succeeded
        request.close()

    # the posted ids are referenced through the history server (WebEnv + QueryKey)
    data = Entrez.esummary(db="gene", webenv=result["WebEnv"], query_key=result["QueryKey"])
    try:
        annotations = Entrez.read(data)
    finally:
        data.close()

    print("Retrieved %d annotations for %d genes" % (len(annotations), len(id_list)))

    return annotations

In [15]:
id_list = genes["gene_id"].to_list()

# Gene summaries are fetched from NCBI once and cached in gene_summaries.pkl;
# later runs load the cache instead of calling the API again.
if not os.path.isfile("gene_summaries.pkl"):
    annotations = retrieve_annotation(id_list)
    if annotations is None:
        # retrieve_annotation prints the Entrez error and returns None on failure
        raise RuntimeError("Could not retrieve gene annotations from Entrez; see the error printed above.")
    annotation_list = annotations["DocumentSummarySet"]["DocumentSummary"]
    gene_summaries = [str(doc["Summary"]) for doc in annotation_list]
    with open("gene_summaries.pkl", "wb") as f:
        pkl.dump(gene_summaries, f)
else:
    with open("gene_summaries.pkl", "rb") as f:
        gene_summaries = pkl.load(f)

In [16]:
def clean_text(text):
    """
    Normalize a free-text description before vectorizing it.

    Steps, in order: lowercase; drop anything inside < > or [ ]; drop ASCII
    punctuation; drop curly quotes and ellipsis characters; drop numbers that
    stand alone between whitespace.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = re.sub(r'<.*?>', '', text) # remove text within < >
    text = re.sub(r'\[.*?\]', '', text) # remove text within [ ]
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub(r'[‘’“”…]', '', text) # remove curly quotes and ellipses
    # Remove standalone numbers. The trailing whitespace is only looked at, not
    # consumed, so the neighbouring words stay separated ("has 3 domains" ->
    # "has domains", not "hasdomains") and consecutive numbers are all removed.
    text = re.sub(r'\s\d+(?=\s)', '', text)
    return text

In [17]:
# store the raw summaries on the gene table, then replace them with their cleaned form
genes["summary"] = gene_summaries
cleaned_summaries = genes["summary"].map(clean_text)
genes["summary"] = cleaned_summaries

In [18]:
# stopwords from https://cs.stanford.edu/people/sonal/gupta14jamia_supl.pdf, plus some custom words
medical_stopwords = "gene, genes, protein, refseq, provided, contains, encoded, encode, encoding, splicing, type, expression, located, superfamily, target, known, described, identified, including, thought, syndrome, family, associated, region, domain, alternative, alternatively, factor, transcription, cause, belongs, belong, activity, encodes, variants, transcript, cell, proteins, multiple, member, involved, different, role, results, cells, function,  disease, diseases, disorder, symptom, symptoms, drug, drugs, problems, problem,prob, probs, med, meds,pill,  pills,  medicine,  medicines,  medication,  medications,  treatment,  treatments,  caps,  capsules,  capsule, tablet,  tablets,  tabs,  doctor,  dr,  dr.,  doc,  physician,  physicians,  test,  tests,  testing,  specialist, specialists, side-effect, side-effects, pharmaceutical, pharmaceuticals, pharma, diagnosis, diagnose, diagnosed, exam, challenge,  device,  condition,  conditions,  suffer,  suffering  ,suffered,  feel,  feeling,  prescription,  prescribe, prescribed, over-the-counter, otc"
medical_stopwords = list(map(str.strip, medical_stopwords.split(",")))

# The vectorizer is given a list: newer scikit-learn releases validate the
# stop_words argument and do not accept a frozenset. Sorting keeps it deterministic.
stop_words = sorted(ENGLISH_STOP_WORDS.union(medical_stopwords))

# TF-IDF features (unigrams only) from the cleaned gene summaries
cv = TfidfVectorizer(stop_words=stop_words, min_df=2, max_df=0.8, ngram_range=(1,1))
data_cv = cv.fit_transform(genes["summary"])
# get_feature_names() was removed from newer scikit-learn; prefer its replacement when available
feature_names = cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out") else cv.get_feature_names()
dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)



In [19]:
# Append the TF-IDF columns to the gene table. The index is reset first so the
# gene rows line up positionally with the rows of dtm.
genes_reindexed = genes.reset_index(drop=True)
genes = pd.concat([genes_reindexed, dtm], axis=1)

**Disease features**

In [20]:
# Disease definitions, curated beforehand with the UMLS UTS api (not shown
# in this notebook; see https://uts.nlm.nih.gov/home.html).
# NOTE: pickle can execute code on load, so only load a file you trust.
with open("disease_definitions.pkl", "rb") as definitions_file:
    disease_definitions = pkl.load(definitions_file)

In [21]:
# store the raw definitions on the disease table, then replace them with their cleaned form
diseases["definition"] = disease_definitions
cleaned_definitions = diseases["definition"].map(clean_text)
diseases["definition"] = cleaned_definitions

In [22]:
# stopwords from https://cs.stanford.edu/people/sonal/gupta14jamia_supl.pdf, plus some custom words
medical_stopwords = "gene, genes, protein, refseq, provided, characterized, contains, encoded, encode, encoding, splicing, type, expression, located, superfamily, target, known, described, identified, including, thought, syndrome, family, associated, region, domain, alternative, alternatively, factor, transcription, cause, belongs, belong, activity, encodes, variants, transcript, cell, proteins, multiple, member, involved, different, role, results, cells, function,  disease, diseases, disorder, symptom, symptoms, drug, drugs, problems, problem,prob, probs, med, meds,pill,  pills,  medicine,  medicines,  medication,  medications,  treatment,  treatments,  caps,  capsules,  capsule, tablet,  tablets,  tabs,  doctor,  dr,  dr.,  doc,  physician,  physicians,  test,  tests,  testing,  specialist, specialists, side-effect, side-effects, pharmaceutical, pharmaceuticals, pharma, diagnosis, diagnose, diagnosed, exam, challenge,  device,  condition,  conditions,  suffer,  suffering  ,suffered,  feel,  feeling,  prescription,  prescribe, prescribed, over-the-counter, otc"
medical_stopwords = list(map(str.strip, medical_stopwords.split(",")))

# The vectorizer is given a list: newer scikit-learn releases validate the
# stop_words argument and do not accept a frozenset. Sorting keeps it deterministic.
stop_words = sorted(ENGLISH_STOP_WORDS.union(medical_stopwords))

# TF-IDF features (unigrams only) from the cleaned disease definitions
cv = TfidfVectorizer(stop_words=stop_words, min_df=2, max_df=0.8, ngram_range=(1,1))
data_cv = cv.fit_transform(diseases["definition"])
# get_feature_names() was removed from newer scikit-learn; prefer its replacement when available
feature_names = cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out") else cv.get_feature_names()
dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)



In [23]:
# Append the TF-IDF columns to the disease table. The index is reset first so
# the disease rows line up positionally with the rows of dtm.
diseases_reindexed = diseases.reset_index(drop=True)
diseases = pd.concat([diseases_reindexed, dtm], axis=1)

## Convert to NetworkX graphs
This is a much more natural way to represent this data.

In the cell below I create the building blocks for the graphs: the different edge sets (disease -> disease, disease -> gene, and gene -> gene), as well as node sets (disease, gene, and disease+gene).

In [24]:
# Edge lists: one (node_id, node_id) tuple per association row.
disease_gene_edges = list(zip(disease_gene_edge_df["disease_node_id"].values,
                              disease_gene_edge_df["gene_node_id"].values))

gene_gene_edges = list(zip(gene_gene_edge_df["gene1_node_id"].values,
                           gene_gene_edge_df["gene2_node_id"].values))

disease_disease_edges = list(zip(disease_disease_edge_df["disease1_node_id"].values,
                                 disease_disease_edge_df["disease2_node_id"].values))

# Node lists: (node_id, {"x": features}) tuples, with features taken from the dataframes.
# The first two feature entries mark the node type ([1, 0] = disease, [0, 1] = gene);
# the rest are the TF-IDF columns.
# Column layout relied on here:
#   diseases: disease_id, disease_name, disease_node_id, definition, TF-IDF...
#   genes:    gene_id, gene_node_id, summary, TF-IDF...
# .iloc makes the positional access explicit: the rows are labelled by column
# name, and plain integer keys such as row[2] rely on a positional fallback
# that pandas has deprecated.
disease_nodes = [(row.iloc[2], {"x": [1., 0.] + row.iloc[4:].to_list()}) for _, row in diseases.iterrows()]
gene_nodes = [(row.iloc[1], {"x": [0., 1.] + row.iloc[3:].to_list()}) for _, row in genes.iterrows()]
all_nodes = disease_nodes + gene_nodes # all nodes

### Graph creation
Here I use the pieces above to build several different graphs in NetworkX. One graph is created for each edge set, as well as a graph with all edge sets.

In [25]:
# disease-gene edge set graph. No node features are added bc they aren't needed in this graph
G_disease_gene = nx.Graph()
G_disease_gene.add_edges_from(disease_gene_edges)

# make sure the graph was created as intended. Expect undirected graph with 7,813 nodes.
print(G_disease_gene.is_directed(), G_disease_gene.number_of_nodes(), G_disease_gene.number_of_edges())

# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# writes a file that plain pickle.load can read back.
with open("data/disease_gene_graph.pkl", "wb") as f:
    pkl.dump(G_disease_gene, f, pkl.HIGHEST_PROTOCOL)

False 7813 21357


In [26]:
# disease-disease edge set graph
G_disease = nx.Graph()
G_disease.add_edges_from(disease_disease_edges)
# adding the node list afterwards attaches the features and also brings in
# diseases that have no disease-disease edge
G_disease.add_nodes_from(disease_nodes)

# make sure the graph was created as intended. Expect undirected graph with 519 nodes.
print(G_disease.is_directed(), G_disease.number_of_nodes(), G_disease.number_of_edges())

# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# writes a file that plain pickle.load can read back.
with open("data/disease_graph.pkl", "wb") as f:
    pkl.dump(G_disease, f, pkl.HIGHEST_PROTOCOL)

False 519 82


In [27]:
# gene-gene edge set graph
G_gene = nx.Graph()
G_gene.add_edges_from(gene_gene_edges)
# adding the node list afterwards attaches the features and also brings in
# genes that have no gene-gene edge
G_gene.add_nodes_from(gene_nodes)

# make sure the graph was created as intended. Expect undirected graph with 7,294 nodes.
print(G_gene.is_directed(), G_gene.number_of_nodes(), G_gene.number_of_edges())

# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# writes a file that plain pickle.load can read back.
with open("data/gene_graph.pkl", "wb") as f:
    pkl.dump(G_gene, f, pkl.HIGHEST_PROTOCOL)

False 7294 131509


In [28]:
# graph with all edge sets. This graph will only be used for visualization,
#    so only the identity of each node (gene or disease) will be added to features.
# .iloc makes the positional access explicit: the rows are labelled by column
# name, and plain integer keys such as row[2] rely on a positional fallback
# that pandas has deprecated.
disease_nodes = [(row.iloc[2], {"x": [1., 0.]}) for _, row in diseases.iterrows()]
gene_nodes = [(row.iloc[1], {"x": [0., 1.]}) for _, row in genes.iterrows()]
all_nodes = disease_nodes + gene_nodes # all nodes

G_all = nx.Graph()
G_all.add_edges_from(gene_gene_edges)
G_all.add_edges_from(disease_disease_edges)
G_all.add_edges_from(disease_gene_edges)

G_all.add_nodes_from(all_nodes)

# make sure the graph was created as intended. Expect undirected graph with 7,813 nodes
#    (519 diseases + 7,294 genes).
print(G_all.is_directed(), G_all.number_of_nodes(), G_all.number_of_edges())

# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly
# writes a file that plain pickle.load can read back.
with open("data/vis_graph.pkl", "wb") as f:
    pkl.dump(G_all, f, pkl.HIGHEST_PROTOCOL)

False 7813 152948
