In [61]:
import pandas as pd
import networkx as nx

import xml.etree.ElementTree as ET

from itertools import combinations

In [2]:
pathway = "hsa04810"

# Parse the pathway's XML file and keep a handle on the document root.
tree = ET.parse("../data/{}.xml".format(pathway))
root = tree.getroot()

# Split the top-level <entry> elements into groups and everything else;
# the top-level <relation> elements are the edges.
group_elements = root.findall("./entry[@type='group']")
node_elements  = [entry for entry in root.findall("entry") if entry.get("type") != "group"]
edge_elements  = root.findall("relation")

In [3]:
# Show the raw XML attributes of every group entry.
[group.attrib for group in group_elements]

[{'id': '84', 'name': 'undefined', 'type': 'group'},
 {'id': '85', 'name': 'undefined', 'type': 'group'},
 {'id': '86', 'name': 'undefined', 'type': 'group'}]

In [86]:
def get_node_attributes_from_entry(entry):
    """Extract node attributes from a KGML ``<entry>`` element.

    Parameters
    ----------
    entry : xml.etree.ElementTree.Element
        The ``<entry>`` element. Its ``id`` attribute must be an integer string.

    Returns
    -------
    dict
        Always contains ``type`` (the entry's ``type`` attribute) and ``id``
        (int). Entries of type ``group`` additionally get ``components``, the
        integer ids of their ``<component>`` children. All other entries get
        ``aliases`` (the ``name`` of the ``<graphics>`` child), ``hsa_tags``
        (the entry's own ``name`` attribute) and ``first_name`` (the first
        alias, with any trailing ``...`` removed). ``aliases`` and
        ``first_name`` are ``None`` when the entry has no ``<graphics>`` child
        or that child has no ``name``.
    """
    node_attributes = {
        "type": entry.get("type"),  # i.e. 'gene'
        "id": int(entry.get("id")),  # i.e. 84
    }

    # Compound node (protein complex): only the member ids are recorded.
    if node_attributes["type"] == "group":
        node_attributes["components"] = [
            int(component.get("id")) for component in entry.findall("component")
        ]
        return node_attributes

    # Look the label up by tag instead of by position so an entry without
    # children (or with a differently ordered first child) does not crash.
    graphics = entry.find("graphics")
    aliases = graphics.get("name") if graphics is not None else None

    node_attributes["aliases"] = aliases  # i.e. 'ALDOA, ALDA, GSD12, HEL-S-87p...'
    node_attributes["hsa_tags"] = entry.get("name")  # i.e. 'hsa:226 hsa:229 hsa:230'
    node_attributes["first_name"] = (
        aliases.split(", ")[0].split("...")[0] if aliases is not None else None
    )  # i.e. 'ALDOA'

    return node_attributes


def get_edge_attributes_from_entry(entry):
    """Extract edge attributes from a KGML ``<relation>`` element.

    Parameters
    ----------
    entry : xml.etree.ElementTree.Element
        The ``<relation>`` element. ``entry1`` and ``entry2`` must be integer
        strings; each child element contributes one interaction via its
        ``name`` attribute.

    Returns
    -------
    dict
        Always contains ``node1``, ``node2`` (ints) and ``type``. Depending on
        the child interactions it may also contain:

        * ``sign``: 1 (activation, expression, indirect effect),
          -1 (inhibition, repression) or 0 (binding/association);
        * ``phosphorylation``: 1 (phosphorylation) or -1 (dephosphorylation);
        * ``glycosylation``, ``ubiquitination``, ``methylation``: 1.

        When any modification key is present, ``sign`` is set to 1 unless an
        inhibitory sign (-1) was annotated explicitly.

    Notes
    -----
    Interaction names that are not recognised are printed. ``compound``
    interactions are recognised but not represented in the result.
    """
    edge_attributes = {
        "node1": int(entry.get("entry1")),
        "node2": int(entry.get("entry2")),
        "type": entry.get("type"),
    }

    sign_by_interaction = {
        "activation": 1,
        "expression": 1,
        "indirect effect": 1,
        "inhibition": -1,
        "repression": -1,
        "binding/association": 0,
    }
    # interaction name -> (attribute key, value)
    modification_by_interaction = {
        "phosphorylation": ("phosphorylation", 1),
        "dephosphorylation": ("phosphorylation", -1),
        "glycosylation": ("glycosylation", 1),
        "ubiquitination": ("ubiquitination", 1),
        "methylation": ("methylation", 1),
    }
    modification_keys = {key for key, _ in modification_by_interaction.values()}

    for attribute in entry:
        interaction = attribute.get("name")

        if interaction in sign_by_interaction:
            edge_attributes["sign"] = sign_by_interaction[interaction]
        elif interaction in modification_by_interaction:
            key, value = modification_by_interaction[interaction]
            edge_attributes[key] = value
        elif interaction == "compound":
            # Routing the edge through the intermediate compound node is not
            # implemented: this function returns a single edge. Unfortunately,
            # these interactions are poorly annotated in KGML.
            pass
        else:
            print(interaction)

    # Fill in the sign for implicit interactions. This runs once, after all
    # children were read, and never overrides an explicit inhibitory sign;
    # doing it inside the loop used to turn "inhibition + phosphorylation"
    # into a positive edge.
    if modification_keys & set(edge_attributes) and edge_attributes.get("sign") != -1:
        edge_attributes["sign"] = 1

    return edge_attributes


def _expand_group_column(edges_df, column, group_id, group_members):
    """Replace `group_id` in `column` by one copy of the row per group member."""
    references_group = edges_df[column] == group_id
    # A fresh, unique index guarantees that `sort_index` below keeps the copies
    # of each row adjacent, whatever index the caller's frame had.
    edges_with_df = edges_df[references_group].reset_index(drop=True)
    edges_without_df = edges_df[~references_group]

    if edges_with_df.empty:
        return edges_without_df.reset_index(drop=True)

    # Duplicate every row once per member, then cycle the member ids over them.
    expanded_edges_df = pd.concat([edges_with_df] * len(group_members)).sort_index()
    expanded_edges_df[column] = list(group_members) * len(edges_with_df)

    if edges_without_df.empty:
        return expanded_edges_df.reset_index(drop=True)
    return pd.concat([expanded_edges_df, edges_without_df]).reset_index(drop=True)


def replace_group_edges(edge_attributes_df, group_obj):
    """Replace a group (complex) node in an edge table by its member nodes.

    Every edge that has the group as ``node1`` or ``node2`` is duplicated once
    per group member, with the group id replaced by the member id. One extra
    edge of type ``"complex"`` is added for every pair of group members.

    Parameters
    ----------
    edge_attributes_df : pandas.DataFrame
        Edge table with at least ``node1``, ``node2`` and ``type`` columns.
    group_obj : dict
        Must provide ``id`` (the group's node id) and ``components`` (the
        member node ids).

    Returns
    -------
    pandas.DataFrame
        A new edge table with a fresh integer index. All missing values are
        filled with 0, so the added ``complex`` edges get ``sign`` 0 when a
        ``sign`` column exists. A group without members leaves the edges
        unchanged (an unmodified copy is returned).
    """
    group_id = group_obj["id"]
    group_members = list(group_obj["components"])

    # Nothing to expand into: keep the edges rather than failing or dropping them.
    if not group_members:
        return edge_attributes_df.copy()

    # Boolean masks (instead of unpacking a two-way groupby) also work when the
    # group id is absent from a column or present in every row of it.
    for column in ("node1", "node2"):
        edge_attributes_df = _expand_group_column(
            edge_attributes_df, column, group_id, group_members
        )

    # Add edges between group members. `sign` will implicitly be assigned 0 to
    # indicate no directionality.
    group_rows = [
        {"node1": a, "node2": b, "type": "complex"}
        for a, b in combinations(group_members, 2)
    ]
    if group_rows:
        # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
        edge_attributes_df = pd.concat(
            [edge_attributes_df, pd.DataFrame(group_rows)], ignore_index=True
        )

    return edge_attributes_df.fillna(0)

In [5]:
# NOTE: complexes (i.e. `groups`) should probably be separated out.
node_attributes = list(map(get_node_attributes_from_entry, node_elements))
node_attributes_df = pd.DataFrame(node_attributes).set_index("id")

In [6]:
# Preview the first rows of the node attribute table (indexed by node id).
node_attributes_df.head()

Unnamed: 0_level_0,aliases,first_name,hsa_tags,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"F2, PT, RPRGL2, THPH1",F2,hsa:2147,gene
2,C04637...,C04637,cpd:C04637 cpd:C11554,compound
3,C04637...,C04637,cpd:C04637 cpd:C11554,compound
4,C05981,C05981,cpd:C05981,compound
5,C00306,C00306,cpd:C00306,compound


In [7]:
# (number of nodes, number of attribute columns)
node_attributes_df.shape

(83, 4)

In [8]:
# Group entries go through the same extractor; each yields its id and member ids.
group_attributes = list(map(get_node_attributes_from_entry, group_elements))
group_attributes

[{'components': [59, 60, 61], 'id': 84, 'type': 'group'},
 {'components': [11, 12], 'id': 85, 'type': 'group'},
 {'components': [16, 17, 18], 'id': 86, 'type': 'group'}]

In [69]:
# Build the edge table from the relations whose type is "PPrel".
edge_attributes_df = pd.DataFrame(
    [
        get_edge_attributes_from_entry(relation)
        for relation in edge_elements
        if relation.get("type") == "PPrel"
    ]
)
edge_attributes_df.head(20)

Unnamed: 0,node1,node2,phosphorylation,sign,type
0,50,54,,,PPrel
1,28,30,,,PPrel
2,28,29,,,PPrel
3,24,25,,,PPrel
4,73,67,,1.0,PPrel
5,67,47,,1.0,PPrel
6,47,46,,1.0,PPrel
7,46,41,,1.0,PPrel
8,46,27,,1.0,PPrel
9,27,26,,1.0,PPrel


In [87]:
# Replace each group node in the edge table by its member nodes, one group at a time.
# NOTE(review): this reassigns `edge_attributes_df`, so the table displayed by the
# earlier cell no longer matches the variable once this cell has run.
for group_attribute in group_attributes: 
    edge_attributes_df = replace_group_edges(edge_attributes_df, group_attribute)

In [94]:
# Undirected graph with one node per row of the node table; the row's columns
# become the node's attributes.
G = nx.Graph()
node_records = node_attributes_df.to_dict("index")
G.add_nodes_from(node_records.items())

In [110]:
# Add one edge per row. Every column except the two endpoints becomes an edge
# attribute; the endpoints are excluded by name (not via `columns[2:]`) so the
# result does not depend on the column order of `edge_attributes_df`.
G.add_edges_from([
    (row["node1"], row["node2"], row.drop(["node1", "node2"]).to_dict())
    for _, row in edge_attributes_df.iterrows()
])