In [1]:
import pandas as pd
import networkx as nx

import xml.etree.ElementTree as ET

from itertools import combinations

In [2]:
def get_node_attributes_from_entry(entry): 
    
    node_attributes = {
        "type": entry.get("type"), # i.e. 'gene'
        "id": int(entry.get("id")) # i.e. 84
    }
    
    # If compound node (protein complexes)
    if node_attributes["type"] == "group": 
        node_attributes["components"] = [int(x.get("id")) for x in entry.findall("component")]
        return node_attributes 
    else: 
        node_attributes["aliases"] = entry[0].get("name") # i.e. 'ALDOA, ALDA, GSD12, HEL-S-87p...'
        node_attributes["hsa_tags"] = entry.get("name") # i.e. 'hsa:226 hsa:229 hsa:230'
        node_attributes["first_name"] = node_attributes["aliases"].split(", ")[0].split("...")[0] # i.e. 'ALDOA'
    
    return node_attributes


def get_edge_attributes_from_entry(entry):
    
    edge_attributes = {
        "node1": int(entry.get("entry1")),
        "node2": int(entry.get("entry2")),
        "type": entry.get("type")
    }
    
    for attribute in entry: 
        
        interaction = attribute.get("name")
        
        if interaction in ["activation", "expression", "indirect effect"]: 
            edge_attributes["sign"] = 1
        elif interaction in ["inhibition", "repression"]: 
            edge_attributes["sign"] = -1
        elif interaction in ["binding/association"]: 
            edge_attributes["sign"] = 0
            
        elif interaction == "phosphorylation": 
            edge_attributes["phosphorylation"] = 1
        elif interaction == "dephosphorylation": 
            edge_attributes["phosphorylation"] = -1
            
        elif interaction == "glycosylation":
            edge_attributes["glycosylation"] = 1
            
        elif interaction == "ubiquitination": 
            edge_attributes["ubiquitination"] = 1
            
        elif interaction == "methylation": 
            edge_attributes["methylation"] = 1
        
        # Force edge through compound node. Unfortunately, these interactions are poorly annotated in KGML.
        # These should be of type "ECrel", but may be mistakenly annotated as "PPrel" in KGMLs.
        elif interaction == "compound": 
            edge1 = {"node1": edge_attributes["node1"], "node2": int(attribute.get("value")), "type": "ECrel"}
            edge2 = {"node2": edge_attributes["node2"], "node2": int(attribute.get("value")), "type": "ECrel"}
        
        else: 
            print(interaction)
            
        # Fill in sign for implicit interactions
        if len(set(edge_attributes.keys()) & set(["phosphorylation", "glycosylation", "ubiquitination", "methylation"])) > 0:
            edge_attributes["sign"] = 1
            
    return edge_attributes


def replace_group_edges(edge_attributes_df, group_obj):
    
    group_id = group_obj["id"] 
    group_members = group_obj["components"]
    n_members = len(group_members)
    initial_length = len(edge_attributes_df)
    
    # Split into two dataframes based on `node1` column
    edges_without_df = edge_attributes_df[edge_attributes_df["node1"] != group_id]
    edges_with_df = edge_attributes_df[edge_attributes_df["node1"] == group_id]
    # Duplicate rows where `node1` contains the `group_id`
    expanded_edges_df = pd.concat([edges_with_df]*n_members).sort_index() 
    # Replace `node` column with repeating list of `group_members`
    expanded_edges_df["node1"] = group_members*len(edges_with_df)
    # Concatenate the new dataframes and reset index
    edge_attributes_df = pd.concat([expanded_edges_df, edges_without_df]).reset_index(drop=True)
    
    # Do the same procedure with `node2` column
    edges_without_df = edge_attributes_df[edge_attributes_df["node2"] != group_id]
    edges_with_df = edge_attributes_df[edge_attributes_df["node2"] == group_id]
    expanded_edges_df = pd.concat([edges_with_df]*n_members).sort_index() 
    expanded_edges_df["node2"] = group_members*len(edges_with_df)
    edge_attributes_df = pd.concat([expanded_edges_df, edges_without_df]).reset_index(drop=True)
    
    # Add edges between group members. `sign` will implicity be assigned 0 to indicate no directionality. 
    group_rows = [{"node1": a, "node2": b, "type":"complex"} for a,b in combinations(group_members, 2)]
    edge_attributes_df = edge_attributes_df.append(group_rows, ignore_index=True).fillna(0)
    
#     print("{} members in group {}. {} edges added.".format(n_members, group_id, len(edge_attributes_df)-initial_length))
        
    return edge_attributes_df

In [3]:
def get_KGML_as_networkx(path):
    
    # Open XML as elementtree object
    tree = ET.parse(path)
    root = tree.getroot()
    
    print(root.attrib)

    # Define node and edge elements
    node_elements  = [elem for elem in root.findall("entry") if elem.get("type") != "group"] # alternatively `root.findall("./entry[@type='group']")`
    group_elements = [elem for elem in root.findall("entry") if elem.get("type") == "group"]
    edge_elements  = root.findall("relation")
    
    # Get node, group and edge attributes
    node_attributes = [get_node_attributes_from_entry(element) for element in node_elements]
    node_attributes_df = pd.DataFrame(node_attributes).set_index("id")

    group_attributes = [get_node_attributes_from_entry(element) for element in group_elements]
    
    edge_attributes_df = pd.DataFrame([get_edge_attributes_from_entry(x) for x in edge_elements if x.get("type") in ["PPrel", "CRel"]])
    
    # Replace group nodes with individual components and update edge_attributes
    for group_attribute in group_attributes: 
        edge_attributes_df = replace_group_edges(edge_attributes_df, group_attribute)
        
    # Create directed networkx graph
    G_dir = nx.DiGraph()
    G_dir.add_nodes_from([(key, dic) for key, dic in node_attributes_df.to_dict("index").items()])
    G_dir.add_edges_from([(row["node1"], row["node2"], row[edge_attributes_df.columns[2:]].to_dict()) for _,row in edge_attributes_df.iterrows()])
    nx.relabel_nodes(G_dir, {node_id:G_dir.node[node_id]["first_name"] for node_id in G_dir.nodes()}, copy=False)

    # Create undirected networkx graph. Useful for finding interactions with ambiguous direction
    G_und = nx.Graph()
    G_und.add_nodes_from([(key, dic) for key, dic in node_attributes_df.to_dict("index").items()])
    G_und.add_edges_from([(row["node1"], row["node2"], row[edge_attributes_df.columns[2:]].to_dict()) for _,row in edge_attributes_df.iterrows()])
    nx.relabel_nodes(G_und, {node_id:G_und.node[node_id]["first_name"] for node_id in G_und.nodes()}, copy=False)
    
    out = {"directed": G_dir, "undirected": G_und}
    out = {**out, **root.attrib}
    
    return out

In [4]:
from glob import glob
from os.path import basename

paths = glob("../data/human_KGML/*.xml")

# graphs = {basename(path).split(".")[0]: get_KGML_as_networkx(path) for path in paths}
graphs = {}
for path in paths: 
    try: 
        tag = basename(path).split(".")[0]
        graphs[tag] = get_KGML_as_networkx(path)
    except: 
        pass

{'name': 'path:hsa04371', 'org': 'hsa', 'number': '04371', 'title': 'Apelin signaling pathway', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04371.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04371'}
{'name': 'path:hsa04211', 'org': 'hsa', 'number': '04211', 'title': 'Longevity regulating pathway', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04211.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04211'}
{'name': 'path:hsa05133', 'org': 'hsa', 'number': '05133', 'title': 'Pertussis', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05133.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05133'}
{'name': 'path:hsa05132', 'org': 'hsa', 'number': '05132', 'title': 'Salmonella infection', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05132.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05132'}
{'name': 'path:hsa00062', 'org': 'hsa', 'number': '00062', 'title': 'Fatty acid elongation', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa0

{'name': 'path:hsa04217', 'org': 'hsa', 'number': '04217', 'title': 'Necroptosis', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04217.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04217'}
{'name': 'path:hsa04216', 'org': 'hsa', 'number': '04216', 'title': 'Ferroptosis', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04216.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04216'}
{'name': 'path:hsa05134', 'org': 'hsa', 'number': '05134', 'title': 'Legionellosis', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05134.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05134'}
dissociation
{'name': 'path:hsa05120', 'org': 'hsa', 'number': '05120', 'title': 'Epithelial cell signaling in Helicobacter pylori infection', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05120.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05120'}
dissociation
{'name': 'path:hsa04014', 'org': 'hsa', 'number': '04014', 'title': 'Ras signaling pathway', 'image': 'ht

{'name': 'path:hsa04924', 'org': 'hsa', 'number': '04924', 'title': 'Renin secretion', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04924.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04924'}
{'name': 'path:hsa04930', 'org': 'hsa', 'number': '04930', 'title': 'Type II diabetes mellitus', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04930.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04930'}
{'name': 'path:hsa04918', 'org': 'hsa', 'number': '04918', 'title': 'Thyroid hormone synthesis', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04918.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04918'}
{'name': 'path:hsa03050', 'org': 'hsa', 'number': '03050', 'title': 'Proteasome', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa03050.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa03050'}
{'name': 'path:hsa00565', 'org': 'hsa', 'number': '00565', 'title': 'Ether lipid metabolism', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa00565.

{'name': 'path:hsa00600', 'org': 'hsa', 'number': '00600', 'title': 'Sphingolipid metabolism', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa00600.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa00600'}
{'name': 'path:hsa04658', 'org': 'hsa', 'number': '04658', 'title': 'Th1 and Th2 cell differentiation', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04658.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04658'}
dissociation
{'name': 'path:hsa05220', 'org': 'hsa', 'number': '05220', 'title': 'Chronic myeloid leukemia', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05220.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05220'}
dissociation
dissociation
missing interaction
missing interaction
missing interaction
missing interaction
missing interaction
missing interaction
missing interaction
missing interaction
{'name': 'path:hsa04670', 'org': 'hsa', 'number': '04670', 'title': 'Leukocyte transendothelial migration', 'image': 'http://www.kegg.jp/keg

{'name': 'path:hsa00010', 'org': 'hsa', 'number': '00010', 'title': 'Glycolysis / Gluconeogenesis', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa00010.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa00010'}
{'name': 'path:hsa05168', 'org': 'hsa', 'number': '05168', 'title': 'Herpes simplex infection', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05168.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05168'}
dissociation
dissociation
{'name': 'path:hsa05140', 'org': 'hsa', 'number': '05140', 'title': 'Leishmaniasis', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05140.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05140'}
dissociation
dissociation
{'name': 'path:hsa04921', 'org': 'hsa', 'number': '04921', 'title': 'Oxytocin signaling pathway', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04921.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04921'}
{'name': 'path:hsa05418', 'org': 'hsa', 'number': '05418', 'title': 'Fluid shear 

{'name': 'path:hsa00630', 'org': 'hsa', 'number': '00630', 'title': 'Glyoxylate and dicarboxylate metabolism', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa00630.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa00630'}
{'name': 'path:hsa05012', 'org': 'hsa', 'number': '05012', 'title': "Parkinson's disease", 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05012.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05012'}
missing interaction
missing interaction
missing interaction
missing interaction
{'name': 'path:hsa00340', 'org': 'hsa', 'number': '00340', 'title': 'Histidine metabolism', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa00340.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa00340'}
{'name': 'path:hsa04640', 'org': 'hsa', 'number': '04640', 'title': 'Hematopoietic cell lineage', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04640.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04640'}
{'name': 'path:hsa04668', 'org': 'hsa', 

{'name': 'path:hsa00350', 'org': 'hsa', 'number': '00350', 'title': 'Tyrosine metabolism', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa00350.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa00350'}
{'name': 'path:hsa04650', 'org': 'hsa', 'number': '04650', 'title': 'Natural killer cell mediated cytotoxicity', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04650.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04650'}
{'name': 'path:hsa05200', 'org': 'hsa', 'number': '05200', 'title': 'Pathways in cancer', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05200.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05200'}
{'name': 'path:hsa05214', 'org': 'hsa', 'number': '05214', 'title': 'Glioma', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05214.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05214'}
missing interaction
missing interaction
dissociation
dissociation
missing interaction
missing interaction
missing interaction
missing intera

{'name': 'path:hsa03013', 'org': 'hsa', 'number': '03013', 'title': 'RNA transport', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa03013.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa03013'}
{'name': 'path:hsa04973', 'org': 'hsa', 'number': '04973', 'title': 'Carbohydrate digestion and absorption', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04973.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04973'}
{'name': 'path:hsa00730', 'org': 'hsa', 'number': '00730', 'title': 'Thiamine metabolism', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa00730.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa00730'}
{'name': 'path:hsa04218', 'org': 'hsa', 'number': '04218', 'title': 'Cellular senescence', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04218.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04218'}
{'name': 'path:hsa04966', 'org': 'hsa', 'number': '04966', 'title': 'Collecting duct acid secretion', 'image': 'http://www.kegg.jp/kegg/

{'name': 'path:hsa00534', 'org': 'hsa', 'number': '00534', 'title': 'Glycosaminoglycan biosynthesis - heparan sulfate / heparin', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa00534.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa00534'}
{'name': 'path:hsa04961', 'org': 'hsa', 'number': '04961', 'title': 'Endocrine and other factor-regulated calcium reabsorption', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04961.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04961'}
{'name': 'path:hsa04975', 'org': 'hsa', 'number': '04975', 'title': 'Fat digestion and absorption', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa04975.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa04975'}
{'name': 'path:hsa05100', 'org': 'hsa', 'number': '05100', 'title': 'Bacterial invasion of epithelial cells', 'image': 'http://www.kegg.jp/kegg/pathway/hsa/hsa05100.png', 'link': 'http://www.kegg.jp/kegg-bin/show_pathway?hsa05100'}
{'name': 'path:hsa04550', 'org': 'hsa', 'nu

In [5]:
import pickle

pickle.dump(graphs, open("KGML_networkx_graphs.v0.1.pickle", "wb"))