# Gathering pathways, resolving gene homologs and preparing to visualize..

In [1]:
import copy
import json
import pickle as pkl
from importlib import reload

import pandas as pd
import numpy as np

from owlready2 import *
import networkx as nx
from pyvis.network import Network

In [2]:
pc = get_ontology("../data/PathwayCommons11.reactome.BIOPAX.owl").load()

## Load PANTHER gene orthologs
```
ftp://ftp.pantherdb.org/ortholog/14.1/RefGenomeOrthologs.tar.gz
```

In [3]:
orthologs = pd.read_csv(
  '../data/RefGenomeOrthologs', delimiter='\t', 
  names=['Gene', 'Ortholog', 'Type', 'Common ancestor', 'something else']
)
orthologs.head()

Unnamed: 0,Gene,Ortholog,Type,Common ancestor,something else
0,ARATH|TAIR=AT4G37920|UniProtKB=Q84WN0,ARATH|TAIR=AT1G36320|UniProtKB=Q9C8X8,P,Embryophyta|Magnoliophyta,PTHR31755
1,HUMAN|HGNC=10663|UniProtKB=O60524,MOUSE|MGI=MGI=1918305|UniProtKB=Q8CCP0,LDO,Euarchontoglires,PTHR15239
2,HUMAN|HGNC=10663|UniProtKB=O60524,RAT|Ensembl=ENSRNOG00000056128|UniProtKB=A0A0G...,LDO,Euarchontoglires,PTHR15239
3,HUMAN|HGNC=10663|UniProtKB=O60524,CHICK|Ensembl=ENSGALG00000012263|UniProtKB=F1N8T0,LDO,Amniota,PTHR15239
4,HUMAN|HGNC=10663|UniProtKB=O60524,DANRE|Ensembl=ENSDARG00000102859|UniProtKB=A0A...,LDO,Euteleostomi,PTHR15239


## Build an ortholog -> mouse gene map

In [4]:
def convert_db_name(db):
  if db == 'UniProtKB' or db == 'uniprot knowledgebase':
    return 'uniprot'
  elif db == 'ncbi gene':
    return 'ncbi'
  elif db == 'hgnc symbol':
    return 'hgnc'
  return db.lower()


def convert_db_id(id):
  if ':' in str(id):
    return id.split(':')[-1].lower()
  return str(id).lower()


def make_db_ref(db, id):
  return '{0}:{1}'.format(convert_db_name(db), convert_db_id(id))


try:
  # Load precomputed ortholog gene map
  with open('../data/gene-map-all2mouse', 'rb') as f:
    all2mouse = pkl.load(f)
except:
  # Compute and store the ortholog gene map
  all2mouse = {}
  for _, row in orthologs.iterrows():
    # Keep only the closest homologs
    if row['Type'] != 'LDO':
      continue
    if not row['Ortholog'].startswith('MOUSE|'):
      continue

    dst = [
      make_db_ref(d.split('=')[0], d.split('=')[-1])
      for d in row['Ortholog'].split('|')[1:]
    ]

    for src in row['Gene'].split('|')[1:]:
      src_parts = src.split('=')
      db = src_parts[0]
      id = src_parts[-1]
      all2mouse[make_db_ref(db, id)] = dst

  with open('../data/gene-map-all2mouse', 'wb') as f:
    pkl.dump(all2mouse, f)

## Load mouse gene info from NCBI Gene
```
ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz
```

In [5]:
def load_gene_info(path):
  """Load the NCBI Gene file and returns a {gene_name: info} dictionary"""
  gene_info = pd.read_csv(path, delimiter='\t')
  gene_info_map = {}
  for _, row in gene_info.iterrows():
    info = {
      'name': row['Symbol'],
#       'type': row['type_of_gene'],
#       'desc': row['Other_designations'].split('|')[0],
    }
    ids = [make_db_ref('ncbi', row['GeneID'])]
    for xref in row['dbXrefs'].split('|'):
      xref_parts = xref.split(':')
      db = xref_parts[0]
      id = xref_parts[-1]
      ids.append(make_db_ref(db, id))
    for id in ids:
      gene_info_map[id] = info
#     for s in row['Synonyms'].split('|'):
#       gene_info_map[s] = info
  return gene_info, gene_info_map

gene_info, gene_info_map = load_gene_info('../data/Mus_musculus.gene_info')

Exploring the BIOPAX format here..

In [6]:
Pathway = list(filter(lambda x: x.name == 'Pathway', pc.world.classes()))[0]
Interaction = list(filter(lambda x: x.name == 'Interaction', pc.world.classes()))[0]
Conversion = list(filter(lambda x: x.name == 'Conversion', pc.world.classes()))[0]
Control = list(filter(lambda x: x.name == 'Control', pc.world.classes()))[0]
Catalysis = list(filter(lambda x: x.name == 'Catalysis', pc.world.classes()))[0]
TemplateReaction = list(filter(lambda x: x.name == 'TemplateReaction', pc.world.classes()))[0]
BiochemicalReaction = list(filter(lambda x: x.name == 'BiochemicalReaction', pc.world.classes()))[0]
BindingFeature = list(filter(lambda x: x.name == 'BindingFeature', pc.world.classes()))[0]
ModificationFeature = list(filter(lambda x: x.name == 'ModificationFeature', pc.world.classes()))[0]
Complex = list(filter(lambda x: x.name == 'Complex', pc.world.classes()))[0]
for p in Pathway.instances()[0:1]:
  print(dir(p))
  print(str(p.__class__).replace('biopax-level3.', ''))
  print(p.get_properties())
  print(p.displayName)
  print(p.pathwayComponent[0])
  print(p.pathwayComponent[0].get_properties())
  print(p.pathwayComponent[0].displayName)
  print()
  print(p.pathwayComponent[0].left[0].get_properties())
  print(p.pathwayComponent[0].left[0].displayName)
  print(p.pathwayComponent[0].left[0].entityReference)
  for c in p.pathwayComponent[0].left[0].component:
    print('--c')
    print(c)
    print(c.displayName)
    if hasattr(c, 'component'):
      for c2 in c.component:
        print('--c2')
        print(c2.get_properties())
        print(c2)
        print(c2.displayName)
        if hasattr(c2, 'component'):
          for c3 in c2.component:
            print('--c3')
            print(c3.get_properties())
            print(c3)
            print(c3.displayName)
  print(p.pathwayComponent[0].left[0].xref[0].get_properties())
  for xref in p.pathwayComponent[0].left[0].xref:
    print(xref.id)
    print(xref.db)
#   print(p.pathwayComponent[0].left[0].entityReference.name)
#   print(p.pathwayComponent[0].left[0].entityReference.standardName)
#   print(p.pathwayComponent[0].left[0].entityReference.displayName)
#   print(p.pathwayComponent[0].left[0].entityReference.xref[0].get_properties())

#   print(p.pathwayComponent[0].controller)
#   print(p.pathwayComponent[0].controller[0].get_properties())
#   print(p.pathwayComponent[0].controller[0].displayName)
#   print(p.pathwayComponent[0].controller[0].entityReference)
#   print(p.pathwayComponent[0].controller[0].entityReference.get_properties())
#   print(p.pathwayComponent[0].controller[0].entityReference.organism.displayName)
#   print(p.pathwayComponent[0].controller[0].entityReference.organism)
#   print()
#   print(p.pathwayComponent[0].controlled)
#   print(p.pathwayComponent[0].controlled.get_properties())
#   print(p.pathwayComponent[0].controlled.displayName)
#   print(hasattr(p.pathwayComponent[0].controlled, 'displayName'))
#   print(p.pathwayComponent[0].controlled.conversionDirection)
  print()
#   print(p.pathwayComponent[0].controlled.right[0].get_properties())
#   print(p.pathwayComponent[0].controlled.right[0].displayName)
#   print(p.pathwayComponent[0].controlled.right[0].entityReference)
#   print(p.pathwayComponent[0].controlled.right[0].entityReference.name)
#   print(p.pathwayComponent[0].controlled.right[0].entityReference.standardName)
#   print(p.pathwayComponent[0].controlled.right[0].entityReference.displayName)
#   print(p.pathwayComponent[0].controlled.right[0].entityReference.xref[0].get_properties())
  print()
#   print(p.pathwayComponent)

#   print(p.xref)
#   print(p.xref[0].get_properties())
#   print(p.xref[0].db)

['INDIRECT_get_properties', '__class__', '__classcell__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_equivalent_to', '_get_instance_possible_relations', '_get_is_instance_of', '_instance_equivalent_to_changed', '_instance_is_a_changed', '_name', '_set_is_instance_of', 'comment', 'dataSource', 'differents', 'displayName', 'generate_default_name', 'get_equivalent_to', 'get_inverse_properties', 'get_iri', 'get_name', 'get_properties', 'iri', 'is_a', 'is_instance_of', 'name', 'namespace', 'organism', 'pathwayComponent', 'pathwayOrder', 'set_equivalent_to', 'set_iri', 'set_name', 'storid', 'xref']
Pathway
{biopax-level3.displayName, biopax-level3.pathwayOrder, biopax-level3.organism, biopax-

In [7]:
print(json.dumps(all2mouse, indent=2)[:400])

{
  "hgnc:10663": [
    "mgi:1918305",
    "uniprot:q8ccp0"
  ],
  "uniprot:o60524": [
    "mgi:1918305",
    "uniprot:q8ccp0"
  ],
  "hgnc:20854": [
    "mgi:1914066",
    "uniprot:q9d305"
  ],
  "uniprot:q9h0w7": [
    "mgi:1914066",
    "uniprot:q9d305"
  ],
  "hgnc:18049": [
    "mgi:2152200",
    "uniprot:q91va0"
  ],
  "uniprot:q08ah1": [
    "mgi:2152200",
    "uniprot:q91va0"
  ],
  "hgnc:


## Parse BIOPAX into a more actionable form
Extracting pathways and resolving chemical entities

In [9]:
def parse_molecule(m):
  res = {
    'type': 'molecule',
    'name': m.displayName if m.displayName is not None else m.name,
#     'name': getattr(m, 'displayName', m.name),
#     'name': m.name,
  }
  if isinstance(m, Complex):
    res['type'] = 'complex'
    res['component'] = [parse_molecule(cc) for cc in m.component]
  if hasattr(m, 'cellularLocation') and m.cellularLocation is not None:
    res['cellularLocation'] = m.cellularLocation.term[0]
#   if hasattr(m, 'feature') and m.feature is not None:
#     res['feature'] = []
#     if len(m.feature) > 0:
#       for f in m.feature:
#         if not isinstance(f, ModificationFeature) and not isinstance(f, BindingFeature):
#           continue
#         feature = {
#           'biopaxType': str(f.__class__).replace('biopax-level3.', ''),
#         }
#         if hasattr(f, 'modificationType') and f.modificationType is not None:
#           feature['modificationType'] = f.modificationType.term[0]
#         if hasattr(f, 'featureLocation') and f.featureLocation is not None:
#           feature['featureLocation'] = [{
#             'sequencePosition': l.sequencePosition,
#             'positionStatus': l.positionStatus,
#           } for l in f.featureLocation]
# #         print(f.get_properties())
# #         print(f.modificationType.term[0])
# #         print(f.featureLocation[0].get_properties())
# #         print(f.featureLocation[0].sequencePosition)
# #         print(f.featureLocation[0].positionStatus)
# # #         print(f.comment)
# #         print(f.featureLocation)
# #         print(f.modificationType.term[0])
#         res['feature'].append(feature)
  if hasattr(m, 'entityReference') and m.entityReference is not None:
    e = m.entityReference
    res['entityReference'] = {
      'name': getattr(e, 'displayName', e.name),
    }
    if hasattr(e, 'xref') and e.xref is not None:
      for x in e.xref:
        conv_id = '{0}:{1}'.format(convert_db_name(x.db), convert_db_id(x.id))
        if conv_id in all2mouse:
          for db_ref in all2mouse[conv_id]:
            if db_ref in gene_info_map:
              res['entityReference']['gene'] = gene_info_map[db_ref]
        # TODO: resolve small molecules
        if ('uniprot' in x.db) \
          or ( \
            (x.db == 'ensembl' or x.db == 'ncbi gene') \
            and ( \
              'xref' not in res['entityReference'] \
              or res['entityReference']['xref']['db'] != 'ncbi gene' \
            ) \
          ) \
          or ( \
#             x.db in ['chebi'] \
#             and 
            'xref' not in res['entityReference'] \
          ):
          # NCBI Gene has the top priority and the Ensemlb goes next
          res['entityReference']['xref'] = {
            'id': convert_db_id(x.id),
            'db': convert_db_name(x.db),
          }
#       res['entityReference']['xref'] = [
#         {
#           'id': x.id,
#           'db': x.db,
# #           'relationshipType': [
# #              r.term
# #             for r in getattr(x, 'relationshipType', [])
# #           ],
#         }
#         for x in e.xref
#       ]
#     if hasattr(e, 'organism') and e.organism is not None:
#       o = e.organism
#       res['entityReference']['organism'] = {
#         'name': getattr(o, 'displayName', o.name),
#       }
#       if 'Homo' not in res['entityReference']['organism']['name']:
#         print(res['entityReference']['organism']['name'])
  return res


def parse_reaction(r):
  res = {
    'type': 'reaction',
    'name': r.displayName if r.displayName is not None else r.name,
#     'name': getattr(r, 'displayName', r.name),
#     'comment': getattr(r, 'comment', None),
    'conversionDirection': getattr(r, 'conversionDirection', None),
  }
  if hasattr(r, 'left') and r.left is not None:
    res['left'] = [parse_molecule(c) for _, c in enumerate(r.left)]
  if hasattr(r, 'right') and r.right is not None:
    res['right'] = [parse_molecule(c) for _, c in enumerate(r.right)]
  return res


def parse_control(c):
  res = {
    'type': 'control',
    'name': c.displayName if c.displayName is not None else c.name,
#     'name': getattr(c, 'displayName', c.name),
#     'comment': getattr(c, 'comment', None),
    'controlType': getattr(c, 'controlType', None),
  }
  if hasattr(c, 'controller') and c.controller is not None:
    res['controller'] = [parse_molecule(cc) for cc in c.controller]
  if hasattr(c, 'cofactor') and c.cofactor is not None:
    res['cofactor'] = [parse_molecule(cc) for cc in c.cofactor]
  if hasattr(c, 'controlled') and c.controlled is not None:
    res['controlled'] = parse_reaction(c.controlled)
  return res


def parse_template_reaction(c):
  res = {
    'type': 'template_reaction',
    'name': c.displayName if c.displayName is not None else c.name,
#     'name': getattr(c, 'displayName', c.name),
#     'comment': getattr(c, 'comment', None),
    'templateDirection': getattr(c, 'templateDirection', None),
  }
  if hasattr(c, 'template') and c.template is not None:
    res['template'] = parse_molecule(c.template)
  if hasattr(c, 'product') and c.product is not None:
    res['product'] = [parse_molecule(cc) for cc in c.product]
  return res


def parse_pathway(p):
  res = {
    'type': 'pathway',
    'name': getattr(p, 'displayName', p.name),
#     'comment': getattr(p, 'comment', None),
  }
  if hasattr(p, 'pathwayComponent') and p.pathwayComponent is not None:
    res['pathwayComponent'] = []
    for c in p.pathwayComponent:
      if isinstance(c, Pathway):
#         res_list.append(parse_pathway(c))
        continue
      elif isinstance(c, Control):
        res['pathwayComponent'].append(parse_control(c))
      elif isinstance(c, TemplateReaction):
        res['pathwayComponent'].append(parse_template_reaction(c))
      else:
        res['pathwayComponent'].append(parse_reaction(c))
  return res


# print(list(pc.world.properties()))

parsed_map = {}
pathways = []

generic_pathway_template = {
  'type': 'pathway',
  'name': '<generic>',
  'pathwayComponent': [],
}

def process_reactions(reaction_class, parse_fn):
  print(len(reaction_class.instances()))
  for c in reaction_class.instances()[:]:
    if c.name in parsed_map:
      continue
    parsed_map[c.name] = ''
#     for xref in c.xref:
#       print(xref.title)
#       print(xref.get_properties())
#     print(dir(c))
    r = parse_fn(c)
  #   print(json.dumps(r, indent=4))
    generic_pathway = copy.deepcopy(generic_pathway_template)
    generic_pathway['pathwayComponent'].append(r)
    pathways.append(generic_pathway) 

for reaction_class, parse_fn in [
  (Control, parse_control),
  (TemplateReaction, parse_template_reaction),
#   (BiochemicalReaction, parse_reaction),
  (Conversion, parse_reaction),
  (Interaction, parse_reaction),
]:
  process_reactions(reaction_class, parse_fn)

# for p in Pathway.instances()[:]:
#   pathway = parse_pathway(p)
# #   print(json.dumps(res, indent=2)[:100000])
#   pathways.append(pathway)

print(len(pathways))

7085
14
11980
19079
19079


In [10]:
# Filter out all the reactions without resolved genes
filtered_pathways = []

def gene_resolved(m):
  if m['type'] == 'complex':
    return any([gene_resolved(c) for c in m['component']])
  if 'entityReference' not in m:
    return False
  return 'gene' in m['entityReference']

for p in pathways:
  filtered_p = copy.deepcopy(p)
  filtered_p['pathwayComponent'] = []
  for r in p['pathwayComponent']:
    resolved = False
    rr = r
    if r['type'] == 'control':
      for m in r['controller']:
        resolved = resolved or gene_resolved(m)
    elif r['type'] == 'template_reaction':
      for m in r['product']:
        resolved = resolved or gene_resolved(m)
    if 'left' in rr:
      for m in rr['left']:
        resolved = resolved or gene_resolved(m)
    if 'right' in rr:
      for m in rr['right']:
        resolved = resolved or gene_resolved(m)
    if resolved:
      filtered_p['pathwayComponent'].append(r)
  filtered_pathways.append(filtered_p)

print(len(filtered_pathways))

19079


## Convert the parsed data into a graph form

In [26]:
def location_group(data):
  if 'cellularLocation' not in data:
    return None
  loc = data['cellularLocation']
  if any([ l in loc for l in [
    'extracellular region',
    'external side of plasma membrane',
    'cell junction',
  ] ]):
    return 5
  elif any([ l in loc for l in [
    'cell outer membrane',
    'cell wall',
    'cytoplasmic side of plasma membrane',
    'plasma membrane',
  ] ]):
    return 4
  elif any([ l in loc for l in [
    'mitochondrial',
  ] ]):
    return 3
  elif any([ l in loc for l in [
    'cytosol',
    'host cell cytosol',
  ] ]):
    return 0
  elif any([ l in loc for l in [
    'Golgi',
  ] ]):
    return -3
  elif any([ l in loc for l in [
    'endoplasmic reticulum',
  ] ]):
    return -4
  elif any([ l in loc for l in [
    'nuclear envelope',
    'nucleoplasm',
    'chromosome',
    'chromosome, centromeric region',
  ] ]):
    return -5
  return None

def pathways2graph(pathways):
  nodes = []
  node_name_map = {}
  edges = []
  edge_map = {}
  
  def add_node(data):
    name = data['entityReference']['name'] if 'entityReference' in data else data['name']
    map_id = f'{name}@{location_group(data)}'
    if map_id in node_name_map:
      return node_name_map[map_id]
    id = len(nodes)
    node = {
      '__id': id,
      'type': data['type'],
      'name': name,
    }
    if data['type'] == 'molecule':
      if 'cellularLocation' in data:
        node['cellularLocation'] = data['cellularLocation']
      if 'entityReference' in data:
        node['entityReference'] = data['entityReference']
    nodes.append(node)
    node_name_map[map_id] = id
    return id
  
  def add_nodes(data):
    ids = []
    if data['type'] == 'complex':
      ids.extend([ add_node(c) for c in data['component'] ])
    else:
      ids.append(add_node(data))
    return list(set(ids))
  
  def add_edge(source, target, relation):
    edge_key = f'{source}|{target}'
    if edge_key in edge_map:
      return
    edges.append([source, target, relation])
    edge_map[edge_key] = ''
  
  control_type2relation = {
    'ACTIVATION': 'activator',
    'INHIBITION': 'inhibitor',
  }
  
  for pathway in pathways:
    for reaction_container in pathway['pathwayComponent']:
      reaction = reaction_container
      
      if reaction_container['type'] == 'control':
        reaction = reaction_container['controlled']
    
      reaction_node_id = add_node(reaction)
      if reaction_container['type'] == 'control':
        for c in reaction_container['controller']:
          molecule_node_ids = add_nodes(c)
          for m in molecule_node_ids:
            relation = 'controller'
            if 'controlType' in reaction_container:
              relation = reaction_container['controlType']
              for c2r in control_type2relation:
                if c2r in relation:
                  relation = control_type2relation[c2r]
            add_edge(m, reaction_node_id, relation)
      elif reaction_container['type'] == 'template_reaction':
        if 'template' in reaction_container:
          molecule_node_ids = add_nodes(reaction_container['template'])
          [ add_edge(m, reaction_node_id, 'template') for m in molecule_node_ids ]
        for product in reaction_container['product']:
          molecule_node_ids = add_nodes(product)
          [ add_edge(reaction_node_id, m, 'product') for m in molecule_node_ids ]
          
      if 'left' in reaction:
        for molecule in reaction['left']:
          molecule_node_ids = add_nodes(molecule)
          if reaction['conversionDirection'] == 'LEFT_TO_RIGHT':
            [ add_edge(m, reaction_node_id, 'reactant') for m in molecule_node_ids ]
          elif reaction['conversionDirection'] == 'RIGHT_TO_LEFT':
            [ add_edge(reaction_node_id, m, 'product') for m in molecule_node_ids ]
          else:
            [ add_edge(m, reaction_node_id, 'reactant') for m in molecule_node_ids ]
          
      if 'right' in reaction:
        for molecule in reaction['right']:
          molecule_node_ids = add_nodes(molecule)
          if reaction['conversionDirection'] == 'LEFT_TO_RIGHT':
            [ add_edge(reaction_node_id, m, 'product') for m in molecule_node_ids ]
          elif reaction['conversionDirection'] == 'RIGHT_TO_LEFT':
            [ add_edge(m, reaction_node_id, 'reactant') for m in molecule_node_ids ]
          else:
            [ add_edge(reaction_node_id, m, 'product') for m in molecule_node_ids ]
        
  return {
    'nodes': nodes,
    'edges': edges,
  }

print(len(pathways2graph(filtered_pathways[:1])['nodes']))
print(len(pathways2graph(filtered_pathways[:1])['edges']))
print(json.dumps(pathways2graph(filtered_pathways[:1]), indent=4))

5
4
{
    "nodes": [
        {
            "__id": 0,
            "type": "reaction",
            "name": "TP53 stimulates BNIP3L expression"
        },
        {
            "__id": 1,
            "type": "complex",
            "name": "p-S15,S20-TP53 Tetramer"
        },
        {
            "__id": 2,
            "type": "molecule",
            "name": "NIX",
            "cellularLocation": "nucleoplasm",
            "entityReference": {
                "name": "NIX",
                "gene": {
                    "name": "Bnip3l"
                },
                "xref": {
                    "id": "ensg00000104765",
                    "db": "ensembl"
                }
            }
        },
        {
            "__id": 3,
            "type": "molecule",
            "name": "CBP_HUMAN",
            "cellularLocation": "nucleoplasm",
            "entityReference": {
                "name": "CBP_HUMAN",
                "xref": {
                    "id": "q92793",
              

In [27]:
print(len(filtered_pathways))
with open('./results/pathways.json', 'w') as f:
  json.dump(pathways2graph(filtered_pathways[:]), f)

19079


In [268]:
# print(len(pathways2graph(filtered_pathways[:])['nodes']))
# print(len(pathways2graph(filtered_pathways[:])['edges']))
print(json.dumps(filtered_pathways[0], indent=4))

{
    "type": "pathway",
    "name": "NTF4 activates NTRK2 (TRKB) signaling",
    "pathwayComponent": [
        {
            "type": "reaction",
            "name": "NTF4-bound NTRK2 dimers trans-autophosphorylate",
            "conversionDirection": "LEFT_TO_RIGHT",
            "left": [
                {
                    "type": "complex",
                    "name": "NTF4:NTRK2 homodimer",
                    "component": [
                        {
                            "type": "complex",
                            "name": "NTF4:NTRK2",
                            "component": [
                                {
                                    "type": "complex",
                                    "name": "NTF4 homodimer",
                                    "component": [
                                        {
                                            "type": "molecule",
                                            "name": "NTF4",
                               

In [183]:
def iterate_pathway(p, on_control=None, on_template_reaction=None, on_reaction=None, on_molecule=None):
  for r in p['pathwayComponent']:
    rr = r
    if r['type'] == 'control':
      if on_control is not None:
        on_control(r)
      for m in r['controller']:
        if on_molecule is not None:
          on_molecule(m, r)
    elif r['type'] == 'template_reaction':
      if on_template_reaction is not None:
        on_template_reaction(r)
      for m in r['product']:
        if on_molecule is not None:
          on_molecule(m, r)
    else:
      if on_reaction is not None:
        on_reaction(r)
    if 'left' in rr:
      for m in rr['left']:
        if on_molecule is not None:
          on_molecule(m, r)
    if 'right' in rr:
      for m in rr['right']:
        if on_molecule is not None:
          on_molecule(m, r)
          

cellular_locations = []
for p in pathways:
  def process_molecule(m, r):
    if 'cellularLocation' in m:
      cellular_locations.append(m['cellularLocation']) 
  iterate_pathway(p, on_molecule=process_molecule)
print(np.unique(cellular_locations))

['COPII-coated ER to Golgi transport vesicle'
 'ER to Golgi transport vesicle membrane' 'Golgi lumen' 'Golgi membrane'
 'Golgi-associated vesicle lumen' 'Golgi-associated vesicle membrane'
 'autophagosome' 'autophagosome membrane' 'axonemal microtubule'
 'azurophil granule lumen' 'azurophil granule membrane' 'cell junction'
 'cell outer membrane' 'cell wall' 'chromosome'
 'chromosome, centromeric region' 'ciliary basal body' 'ciliary base'
 'ciliary membrane' 'ciliary tip' 'cilium'
 'clathrin-coated endocytic vesicle'
 'clathrin-coated endocytic vesicle membrane'
 'clathrin-coated vesicle membrane'
 'clathrin-sculpted acetylcholine transport vesicle lumen'
 'clathrin-sculpted gamma-aminobutyric acid transport vesicle lumen'
 'clathrin-sculpted gamma-aminobutyric acid transport vesicle membrane'
 'clathrin-sculpted monoamine transport vesicle lumen'
 'clathrin-sculpted monoamine transport vesicle membrane'
 'cornified envelope' 'cytoplasm' 'cytoplasmic side of plasma membrane'
 'cytopla

In [182]:
# g = nx.DiGraph()
# g = Network(notebook=True, width='100%')

n = 0
nr = 0
un = 0
# for r in pathway['pathwayComponent']:
#   g.add_node(r['name'])
#   rr = r
#   if r['type'] == 'catalysis':
#     for m in r['controller']:
#       if 'gene' in m['entityReference']:
#         nr += 1
#       n += 1
#       g.add_node(m['name'])
#       g.add_edge(m['name'], r['name'])
#     rr = r['controlled']
#   # TODO: conversion direction
#   for m in rr['left']:
#     if 'gene' in m['entityReference']:
#       nr += 1
#     n += 1
#     g.add_node(m['name'])
#     g.add_edge(m['name'], r['name'])
#   for m in rr['right']:
#     if 'gene' in m['entityReference']:
#       nr += 1
#     n += 1
#     g.add_node(m['name'])
#     g.add_edge(r['name'], m['name'])

# nx.draw(g)

# net = Network(notebook=True, width='100%')
# net.from_nx(g)
# net.enable_physics(True)
# net.show("pyvis.html")

for p in filtered_pathways:
  def process_molecule(m, r):
    global n, nr, un
    n += 1
    if 'entityReference' not in m:
      un += 1
      return
    if 'gene' in m['entityReference']:
      nr += 1
  iterate_pathway(p, on_molecule=process_molecule)
      
print(n)
print(nr)

17055
8174


In [1]:
# import xml.etree.ElementTree as ET
from lxml import etree

In [13]:
mouse_gene2uniprot = {}
with open('../data/uniprot_sprot.xml', 'rb') as f:
  context = etree.iterparse(
    '../data/uniprot_sprot.xml', 
    events=("end",), 
    tag="{http://uniprot.org/uniprot}entry"
  )
  for action, elem in context:
    if action == 'end':
#       print("%s: %s" % (action, elem.tag))
      organism = elem.findall('{http://uniprot.org/uniprot}organism')[0] \
            .findall('{http://uniprot.org/uniprot}name')[0].text.strip()
      if not 'mus musculus' in organism.lower():
        # Memory cleanup
        elem.clear()
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
        continue
      accession = elem.findall('{http://uniprot.org/uniprot}accession')[0].text.strip()
      if len(elem.findall('{http://uniprot.org/uniprot}gene')) < 1:
        continue
      gene_name = elem.findall('{http://uniprot.org/uniprot}gene')[0] \
            .findall('{http://uniprot.org/uniprot}name')[0].text.strip()
      mouse_gene2uniprot[gene_name] = accession
      
      # Memory cleanup
      elem.clear()
      for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
              
#       print('Frog virus 3' in organism)
#       print(elem.xpath('.//name', namespaces=context.nsmap))
#       print(etree.tostring(elem))
#       break

In [16]:
len(mouse_gene2uniprot)

16628

In [23]:
results_merged = pd.read_csv('./results/merged.csv')
daphnia2mouse = pd.read_csv('../data/MetaPhOrs-daphnia2mouse.csv')
daphnia2mouse["daphnia"] = daphnia2mouse["Daphnia pulex"].map(lambda g: g.replace('GO', '').strip())
daphnia2mouse["mouse"] = daphnia2mouse["Mus musculus"].map(lambda g: g.replace('GO', '').strip())
daphnia2mouse.head()

Unnamed: 0,Lineage,Daphnia pulex,Mus musculus,CS,EL,Trees,phylome,ensembl,orthomcl,treefam,eggnog,hogenom,daphnia,mouse
0,,GO E9HRC7,GO Q922V4,1.0,3,7,,,1.00/5,,,,E9HRC7,Q922V4
1,,GO E9GF52,GO Q3U3J3,0.714,3,7,,,0.60/5,,,,E9GF52,Q3U3J3
2,,GO E9HAK4,GO Q3U1N2,1.0,3,4,,,1.00/2,,,,E9HAK4,Q3U1N2
3,,GO E9G2Y1,GO S4R1P5,1.0,1,1,,,,,,,E9G2Y1,S4R1P5
4,,GO E9FSN2,GO Q8CFE4,1.0,3,7,,,1.00/5,,,,E9FSN2,Q8CFE4


In [44]:
for index, row in results_merged.iterrows():
  if not row['gene'] in mouse_gene2uniprot:
    continue
  gene_name = mouse_gene2uniprot[row['gene']]
  results_merged.loc[index, 'uniprot_mouse'] = gene_name
  
  homolog_match = daphnia2mouse[daphnia2mouse["mouse"] == gene_name]
  if len(homolog_match) < 1:
    continue
  if homolog_match.iloc[0]["EL"] < 2:
    # Low number of supporting data sources
    continue
  homolog = homolog_match.iloc[0]["daphnia"]
  if homolog.startswith('M!'):
    # Not a proper Uniprot id
    continue
#   print(gene_name)
#   print(homolog)
  results_merged.loc[index, 'uniprot_daphnia'] = homolog

In [45]:
results_merged[results_merged['uniprot_daphnia'].notnull()].head()

Unnamed: 0.1,Unnamed: 0,index,dataset,start_age,end_age,sex,tissue,subtissue,cell_ontology_class,gene,fold_change_log2,expression_at_start_age,expression_at_end_age,expression_max_std_dev,p_value,uniprot_mouse,uniprot_daphnia
60,19245,5,droplet,24m,30m,male,Kidney,,,Hnf4a,15.215368,4.09403e-09,0.00015579,0.00023,0.0,P49698,E9H7Y7
64,2428,3,droplet,3m,24m,male,Kidney,,,Hnf4a,-15.065793,0.000140447,4.09403e-09,0.000258,7.582662e-73,P49698,E9H7Y7
65,5398,15,facs,3m,18m,female,Liver,,,Map4k1,15.05417,3.732086e-09,0.0001270061,0.000393,1.709817e-08,P70218,E9G8P2
82,42689,0,facs,3m,24m,male,Brain_Non-Myeloid,Striatum,oligodendrocyte,Ppil2,-14.671002,9.562943e-05,3.664892e-09,0.000178,2.025537e-15,Q9D787,E9H7X4
92,42690,1,facs,3m,24m,male,Brain_Non-Myeloid,Striatum,oligodendrocyte,Aldh6a1,-14.523355,8.632671e-05,3.664892e-09,0.00016,1.266913e-15,Q9EQ20,E9GHJ1


In [46]:
results_merged.to_csv('./results/merged_augmented.csv')