# Build full go tree from Cytoscape.js JSON


In [1]:
import pandas as pd
from goatools import obo_parser

oboUrl = './data/go.obo'
treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'
oboUrl = './data/go.obo'
yeastAnnotationUrl = './data/gene_association.sgd.gz'
kegg2goUrl = 'http://geneontology.org/external2go/kegg2go'
reactome2go = 'http://geneontology.org/external2go/reactome2go'

phenotypeUrl='http://downloads.yeastgenome.org/curation/literature/phenotype_data.tab'

In [2]:
import json

with open('data/full-go.cyjs') as data_file:    
    original = json.load(data_file)

In [29]:
print(original['elements']['nodes'][0])

print(original['elements']['edges'][0])


{'data': {'shared_name': 'YNL259C', 'id': '425956', 'SUID': 425956, 'selected': False, 'name': 'YNL259C'}, 'position': {'y': 29466.993279476694, 'x': 16546.29659293159}, 'selected': False}
{'data': {'source': '425956', 'shared_name': 'YNL259C (gene) GO:0016531', 'selected': False, 'interaction': 'gene', 'target': '12823', 'isTree': 'TREE', 'name': 'YNL259C (gene) GO:0016531', 'id': '425981', 'shared_interaction': 'gene', 'SUID': 425981}, 'selected': False}


In [33]:
cols = pd.read_csv('./annotation_columns.txt', names=['col_names'])
col_names = cols['col_names'].tolist()
print(col_names)

yeastAnnotation = pd.read_csv(yeastAnnotationUrl, delimiter='\t', comment='!', compression='gzip', names=col_names)
yeastAnnotation['DB_Object_Synonym'] = yeastAnnotation['DB_Object_Synonym'].fillna('')
yeastAnnotation.head()

['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB:Reference', 'Evidence', 'With_or_From', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'taxon', 'Date', 'Assigned_by', 'Annotation_Extension', 'Gene_Product_Form_ID']


Unnamed: 0,DB,DB_Object_ID,DB_Object_Symbol,Qualifier,GO_ID,DB:Reference,Evidence,With_or_From,Aspect,DB_Object_Name,DB_Object_Synonym,DB_Object_Type,taxon,Date,Assigned_by,Annotation_Extension,Gene_Product_Form_ID
0,SGD,S000007287,15S_RRNA,,GO:0005763,SGD_REF:S000073641|PMID:6262728,IDA,,C,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
1,SGD,S000007287,15S_RRNA,,GO:0032543,SGD_REF:S000073641|PMID:6262728,IC,GO:0005763,P,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
2,SGD,S000007287,15S_RRNA,,GO:0003735,SGD_REF:S000073641|PMID:6262728,IC,GO:0005763,F,Ribosomal RNA of the small mitochondrial ribos...,Q0020|14s rRNA|15S_RRNA_2,gene,taxon:559292,20150612,SGD,,
3,SGD,S000007288,21S_RRNA,,GO:0005762,SGD_REF:S000073372|PMID:6759872,IDA,,C,Mitochondrial 21S rRNA,Q0158|21S_rRNA_3|21S_rRNA_4,gene,taxon:559292,20040202,SGD,,
4,SGD,S000007288,21S_RRNA,,GO:0032543,SGD_REF:S000073372|PMID:6759872,IMP,,P,Mitochondrial 21S rRNA,Q0158|21S_rRNA_3|21S_rRNA_4,gene,taxon:559292,20100715,SGD,,


In [11]:
## Load gene count
df_term_size = pd.read_csv('./data/collapsed_go.no_IGI.propagated.term_sizes', delimiter='\t', names=['term_id', 'geneCount'])
df_term_size.head()

Unnamed: 0,term_id,geneCount
0,GO:0000001,27
1,GO:0000002,42
2,GO:0000003,448
3,GO:0000006,1
4,GO:0000007,1


In [13]:
go_map = {}

for row in df_term_size.itertuples():
    go_map[row[1]] = int(row[2])

In [40]:
gene_map = {}

for row in yeastAnnotation.itertuples():
    gene_map[row[11].split('|')[0]] = row[3]


In [20]:
obo = obo_parser.GODag(oboUrl)

load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2017-08-10) 49,042 GO Terms


In [47]:
full_go_w_genes = {}
new_nodes = []
new_edges = []

for node in original['elements']['nodes']:
    
    data = node['data']
    new_node = {
        'data': {
            'id': data['name']
        },
        'position': {}
    }
    
    data = node['data']
    
    if (node['data']['name'].startswith('GO'))  and (data['name'] in obo.keys()):
        # This is GO
        new_node['data']['geneCount'] = go_map[data['name']]
        go = obo[data['name']]
        new_node['data']['name'] = go.name
        new_node['data']['namespace'] = go.namespace
    elif not node['data']['name'].startswith('GO'):
                
        if data['name'] in gene_map.keys():
            new_node['data']['name'] = gene_map[data['name']]
        else:
            new_node['data']['name'] = data['name']
    
    original_pos = node['position']
    
    new_node['position']['x'] = original_pos['x']*10 
    new_node['position']['y'] = original_pos['y']*10 

    
    new_nodes.append(new_node)

print(new_nodes[9000])
print(new_nodes[9])


{'data': {'geneCount': 167, 'id': 'GO:0061695', 'namespace': 'cellular_component', 'name': 'transferase complex, transferring phosphorus-containing groups'}, 'position': {'y': 381203.35697655764, 'x': 336743.222041512}}
{'data': {'id': 'YJL124C', 'name': 'LSM1'}, 'position': {'y': 191197.4547364546, 'x': 207072.8630944996}}


In [48]:
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, delimiter='\t', names=treeColNames)
tree.tail(10)

Unnamed: 0,parent,child,type,in_tree
441927,GO:0090150,YHR083W,gene,NOT_TREE
441928,GO:0005575,YHR083W,gene,NOT_TREE
441929,GO:0098796,YHR083W,gene,NOT_TREE
441930,GO:1902589,YHR083W,gene,NOT_TREE
441931,GO:0044085,YHR083W,gene,NOT_TREE
441932,GO:0015031,YHR083W,gene,NOT_TREE
441933,GO:1902582,YHR083W,gene,NOT_TREE
441934,GO:1902580,YHR083W,gene,NOT_TREE
441935,GO:0098799,YHR083W,gene,NOT_TREE
441936,GO:0098798,YHR083W,gene,NOT_TREE


In [148]:
import networkx as nx

G=nx.DiGraph()

node_set = set()
edges = []

for row in tree.itertuples():
    node_set.add(row[1])
    node_set.add(row[2])
    if "GO:" in row[1] and "GO:" in row[2]:
        edges.append((row[2], row[1]))

In [149]:
for node in node_set:
     if "GO:" in node:
        G.add_node(node)
    
len(edges)

14528

In [150]:
for e in edges:
    G.add_edge(e[0], e[1])

In [151]:
print(nx.info(G))
for n in G.nodes():
    if n == 'GO:00SUPER':
        print(n)
        
root = G.node['GO:00SUPER']


Name: 
Type: DiGraph
Number of nodes: 6618
Number of edges: 14528
Average in degree:   2.1952
Average out degree:   2.1952
GO:00SUPER


In [153]:
paths = nx.all_simple_paths(G, source='GO:0098799', target='GO:00SUPER')

sg = nx.DiGraph()
ns = set()

for p in paths:
    p_len = len(p)
    
    for i, v in enumerate(p):
        
        if i < p_len-1:
            s = v
            t = p[i+1]
            
            if s not in ns:
                sg.add_node(s)
                ns.add(s)
            if t not in ns:
                sg.add_node(t)
                ns.add(t)
            sg.add_edge(s, t)

print(ns)
print(nx.info(sg))

nx.write_graphml(sg, "test.graphml")

{'GO:0098798', 'GO:0044455', 'GO:0044425', 'GO:0019867', 'GO:0098588', 'GO:0044422', 'GO:0098805', 'GO:0044464', 'GO:0032991', 'GO:0044444', 'GO:0005740', 'GO:0044446', 'GO:0044424', 'GO:0043231', 'GO:0043234', 'GO:0043226', 'GO:0043227', 'GO:0005737', 'GO:0098799', 'GO:0016020', 'GO:0044429', 'GO:0031966', 'GO:00SUPER', 'GO:0043229', 'GO:0031090', 'GO:0031967', 'GO:0005741', 'GO:0005739', 'GO:0005622', 'GO:0005575', 'GO:0098796', 'GO:0031968'}
Name: 
Type: DiGraph
Number of nodes: 32
Number of edges: 63
Average in degree:   1.9688
Average out degree:   1.9688


In [102]:
import igraph as ig

g = ig.Graph(directed=True)
for node in node_set:
    if "GO:" in node:
        g.add_vertex(name=node)

g.summary()

'IGRAPH DN-- 6618 0 -- \n+ attr: name (v)'

In [103]:
g.add_edges(edges)

In [104]:
g.summary()
for e in edges:
    if e[0] == 'GO:0098798' or e[1]=='GO:0098798':
        print(e)

('GO:0098798', 'GO:0005739')
('GO:0098798', 'GO:0044429')
('GO:0098798', 'GO:0043234')
('GO:0017087', 'GO:0098798')
('GO:0098800', 'GO:0098798')
('GO:0098799', 'GO:0098798')
('GO:0030062', 'GO:0098798')


In [117]:
paths1 = g.vertex_disjoint_paths(g.vs.find('GO:0098798').index, target=g.vs.find('GO:00SUPER').index)

In [118]:
print(g.vs[2787]['name'])
print(paths1)


GO:0003857
1


In [108]:
subg = ig.Graph()
n_set = set()

for p in paths1:
    p_len = len(p)
    
    
    for i, v in enumerate(p):
        if i < p_len-1:
#             print(str(v) + ' --> ' + str(p[i+1]))
            s = g.vs[v]['name']
            t = g.vs[p[i+1]]['name']
            print(s + ' --> ' + t)
            if s not in n_set: 
                subg.add_vertex(s)
                n_set.add(s)
            if t not in n_set:
                subg.add_vertex(t)
                n_set.add(t)
            subg.add_edge(source=s, target=t)
    print('-----')

subg.summary()
print(n_set)

GO:0098798 --> GO:0043234
GO:0043234 --> GO:0032991
GO:0032991 --> GO:0005575
GO:0005575 --> GO:00SUPER
-----
{'GO:0032991', 'GO:0043234', 'GO:0098798', 'GO:00SUPER', 'GO:0005575'}


In [101]:
subg.save("sub.gml", format="gml")