# Configuration of the architecture of the GO hidden layers of GraphGONet (step 3)

## Summary
Truncate the graph

#### Load libraries

In [1]:
%config Completer.use_jedi = False

In [2]:
 import sys; sys.getdefaultencoding()

'utf-8'

In [3]:
import torch
import torch.nn as nn
import torch_geometric
import networkx as nx
from goatools import obo_parser

#dealing with dataset
from torchvision import transforms
from torch.utils.data import Dataset
from torch_geometric.data import Data, DataLoader

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from math import *

import os
import pickle
import time
import sys
import json

#### Set environnement

In [4]:
SUBONTOLOGY = "BP"

In [13]:
DATASET=input("Which dataset are you using? TCGA or microarray")

Which dataset are you using? TCGA or microarray TCGA


In [7]:
DIM_INITIAL_NEURON = 1

In [24]:
keyRef = "ENSEMBL" if DATASET=="TCGA" else "PROBE"

In [9]:
dir_files = "../files" #can be modified to your own path

In [10]:
dir_data = "../data" #can be modified to your own path

In [11]:
filename = os.path.join(dir_files,"id_genes.npz")

## Load dataset & data cleaning

Load useful files for the construction of the graph (ensembl,GO-terms)

In [14]:
start = time.time()

### Gene expression

#### Selection of the genes connected to a GO-term in the list cleaned

In [19]:
file_go = pd.read_csv(os.path.join(dir_files,"GOannotations_kept.csv"),index_col=0,encoding='utf-8')

In [22]:
loaded = np.load(os.path.join(dir_data,"id_genes.npz"),allow_pickle=True)
list_genes = loaded["genes"]

In [None]:
%%time
to_keep = np.intersect1d(list_genes,np.unique(file_go[keyRef])) #file_go can contain ensembl that we don't have in our dataset.
mask = np.isin(list_genes,to_keep)
list_genes[mask]

In [None]:
list_genes[mask].shape[0]

In [None]:
%%time
np.savez_compressed(file=os.path.join(dir_files,"genes_annotated.npz"),mask=mask)

### Load and Format GO

In [29]:
%%time
matrix_connection_original = pd.read_csv(os.path.join(dir_files,"matrix_connection_entire.csv"),index_col=0,encoding='utf-8')

CPU times: user 34.3 s, sys: 1.01 s, total: 35.3 s
Wall time: 1min 7s


In [14]:
matrix_connection_original.shape

(18074, 11867)

In [20]:
list_go = matrix_connection_original.columns

In [15]:
%%time
df_go_level=pd.read_csv(os.path.join(dir_files,"go_to_level_entire.csv"),index_col=0,encoding='utf-8')

CPU times: user 14.3 ms, sys: 0 ns, total: 14.3 ms
Wall time: 13.8 ms


In [16]:
df_go_level.head()

Unnamed: 0,root,d+,d-
GO:0000002,6,1,1
GO:0000003,1,1,4
GO:0000012,8,1,4
GO:0000017,8,1,0
GO:0000018,8,2,7


In [31]:
original_graph = nx.read_gpickle(os.path.join(dir_files,"gobp-final"))

In [None]:
original_graph = nx.read_gpickle(os.path.join(dir_files,"gobp-entire"))

In [32]:
print(nx.info(original_graph))

Name: go
Type: MultiDiGraph
Number of nodes: 16062
Number of edges: 37406
Average in degree:   2.3289
Average out degree:   2.3289


In [33]:
import copy
truncated_graph = copy.deepcopy(original_graph)

#### Convert the names of the nodes into integer
The first indices will target the GO-terms whose initial embeddings are based on the expression of the ensembl directly connected to them.
For the other GO-terms, their initial embedding is set to 0. They are used for the graph topology consistency.

In [21]:
map_go_int={go:idx for idx,go in enumerate(list_go)}
map_go_int

{'GO:0000002': 0,
 'GO:0000003': 1,
 'GO:0000012': 2,
 'GO:0000017': 3,
 'GO:0000018': 4,
 'GO:0000019': 5,
 'GO:0000023': 6,
 'GO:0000027': 7,
 'GO:0000028': 8,
 'GO:0000032': 9,
 'GO:0000038': 10,
 'GO:0000045': 11,
 'GO:0000050': 12,
 'GO:0000052': 13,
 'GO:0000053': 14,
 'GO:0000054': 15,
 'GO:0000055': 16,
 'GO:0000056': 17,
 'GO:0000070': 18,
 'GO:0000076': 19,
 'GO:0000077': 20,
 'GO:0000079': 21,
 'GO:0000080': 22,
 'GO:0000082': 23,
 'GO:0000083': 24,
 'GO:0000086': 25,
 'GO:0000096': 26,
 'GO:0000097': 27,
 'GO:0000098': 28,
 'GO:0000103': 29,
 'GO:0000105': 30,
 'GO:0000117': 31,
 'GO:0000122': 32,
 'GO:0000132': 33,
 'GO:0000154': 34,
 'GO:0000160': 35,
 'GO:0000161': 36,
 'GO:0000165': 37,
 'GO:0000183': 38,
 'GO:0000184': 39,
 'GO:0000209': 40,
 'GO:0000212': 41,
 'GO:0000226': 42,
 'GO:0000244': 43,
 'GO:0000245': 44,
 'GO:0000255': 45,
 'GO:0000256': 46,
 'GO:0000266': 47,
 'GO:0000272': 48,
 'GO:0000278': 49,
 'GO:0000281': 50,
 'GO:0000288': 51,
 'GO:0000289': 52,
 'G

In [22]:
list_nodes_graph=np.array(original_graph.nodes)
start_idx=len(map_go_int)
for idx,go in enumerate(list_nodes_graph[np.isin(list_nodes_graph,list_go,invert=True)]):
    map_go_int[go]=idx+start_idx

In [23]:
len(map_go_int)

15849

In [24]:
map_int_go={idx:go for go,idx in map_go_int.items()}
map_int_go

{0: 'GO:0000002',
 1: 'GO:0000003',
 2: 'GO:0000012',
 3: 'GO:0000017',
 4: 'GO:0000018',
 5: 'GO:0000019',
 6: 'GO:0000023',
 7: 'GO:0000027',
 8: 'GO:0000028',
 9: 'GO:0000032',
 10: 'GO:0000038',
 11: 'GO:0000045',
 12: 'GO:0000050',
 13: 'GO:0000052',
 14: 'GO:0000053',
 15: 'GO:0000054',
 16: 'GO:0000055',
 17: 'GO:0000056',
 18: 'GO:0000070',
 19: 'GO:0000076',
 20: 'GO:0000077',
 21: 'GO:0000079',
 22: 'GO:0000080',
 23: 'GO:0000082',
 24: 'GO:0000083',
 25: 'GO:0000086',
 26: 'GO:0000096',
 27: 'GO:0000097',
 28: 'GO:0000098',
 29: 'GO:0000103',
 30: 'GO:0000105',
 31: 'GO:0000117',
 32: 'GO:0000122',
 33: 'GO:0000132',
 34: 'GO:0000154',
 35: 'GO:0000160',
 36: 'GO:0000161',
 37: 'GO:0000165',
 38: 'GO:0000183',
 39: 'GO:0000184',
 40: 'GO:0000209',
 41: 'GO:0000212',
 42: 'GO:0000226',
 43: 'GO:0000244',
 44: 'GO:0000245',
 45: 'GO:0000255',
 46: 'GO:0000256',
 47: 'GO:0000266',
 48: 'GO:0000272',
 49: 'GO:0000278',
 50: 'GO:0000281',
 51: 'GO:0000288',
 52: 'GO:0000289',
 53

In [25]:
with open(os.path.join(dir_files,'map_int_go_entire.txt'), 'w') as f:
    json.dump(map_int_go, f, indent=2)  

In [26]:
original_graph = nx.relabel_nodes(original_graph,map_go_int)

In [27]:
nx.write_gpickle(original_graph,os.path.join(dir_files,"gobp-entire-converted"))

#### Get the list of leaves

In [34]:
original_leaves = [x for x in original_graph.nodes() if original_graph.in_degree(x)==0]
len(original_leaves)

5426

In [35]:
original_leaves

['GO:0000017',
 'GO:0000023',
 'GO:0000032',
 'GO:0000050',
 'GO:0000053',
 'GO:0000055',
 'GO:0000080',
 'GO:0000103',
 'GO:0000105',
 'GO:0000132',
 'GO:0000160',
 'GO:0000173',
 'GO:0000185',
 'GO:0000244',
 'GO:0000256',
 'GO:0000290',
 'GO:0000294',
 'GO:0000301',
 'GO:0000304',
 'GO:0000320',
 'GO:0000338',
 'GO:0000348',
 'GO:0000349',
 'GO:0000350',
 'GO:0000353',
 'GO:0000354',
 'GO:0000379',
 'GO:0000381',
 'GO:0000388',
 'GO:0000389',
 'GO:0000390',
 'GO:0000395',
 'GO:0000413',
 'GO:0000415',
 'GO:0000432',
 'GO:0000435',
 'GO:0000447',
 'GO:0000448',
 'GO:0000453',
 'GO:0000454',
 'GO:0000455',
 'GO:0000461',
 'GO:0000467',
 'GO:0000472',
 'GO:0000480',
 'GO:0000481',
 'GO:0000492',
 'GO:0000493',
 'GO:0000494',
 'GO:0000495',
 'GO:0000710',
 'GO:0000711',
 'GO:0000712',
 'GO:0000715',
 'GO:0000717',
 'GO:0000718',
 'GO:0000720',
 'GO:0000727',
 'GO:0000732',
 'GO:0000733',
 'GO:0000735',
 'GO:0000738',
 'GO:0000916',
 'GO:0000917',
 'GO:0000921',
 'GO:0000958',
 'GO:00009

In [None]:
gp_leaves = df_go_level.loc[original_leaves].groupby("root")
gp_leaves.size()

In [34]:
gp_leaves.groups.keys()

dict_keys([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [35]:
mlevel_degree=np.array([])
for idx in gp_leaves.groups.keys():
    tmp=gp_leaves.get_group(idx)["d+"].describe().values
    mlevel_degree = np.vstack([mlevel_degree, tmp]) if mlevel_degree.size else tmp
df_level_degree_out=pd.DataFrame(mlevel_degree.transpose(),index=gp_leaves.get_group(idx)["d+"].describe().index,columns=gp_leaves.groups.keys())
df_level_degree_out

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,9.0,44.0,135.0,235.0,435.0,640.0,786.0,809.0,806.0,691.0,424.0,225.0,125.0,55.0,31.0,14.0,6.0,3.0
mean,1.111111,1.136364,1.385185,1.92766,2.112644,2.1625,2.402036,2.600742,2.724566,2.670043,2.65566,2.68,2.712,3.145455,3.354839,2.785714,3.166667,3.333333
std,0.333333,0.347142,0.610666,0.88136,0.999401,1.010973,0.988441,1.082054,1.120589,1.151414,1.049289,1.135782,1.022458,0.848052,0.950382,1.251373,0.983192,1.154701
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0
25%,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,2.25,3.0
50%,1.0,1.0,1.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.5,4.0
75%,1.0,1.0,2.0,2.5,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.75,4.0,4.0
max,2.0,2.0,4.0,5.0,8.0,7.0,6.0,8.0,8.0,8.0,7.0,9.0,7.0,5.0,5.0,5.0,4.0,4.0


In [36]:
mlevel_degree.shape

(18, 8)

#### Update the graph

In [37]:
truncated_graph.remove_nodes_from(original_leaves)

In [38]:
print(nx.info(truncated_graph))

Name: go
Type: MultiDiGraph
Number of nodes: 10376
Number of edges: 22667
Average in degree:   2.1846
Average out degree:   2.1846


Check DAG property and connectivity

In [48]:
nx.is_weakly_connected(truncated_graph)

True

In [49]:
nx.is_directed_acyclic_graph(truncated_graph)

True

In [27]:
nx.write_gpickle(original_graph,os.path.join(dir_files,"gobp-truncated"))

#### Update the adjacency matrix

In [39]:
matrix_connection_truncated = matrix_connection_original.loc[:,~matrix_connection_original.columns.isin(original_leaves)].copy(deep=True)
matrix_connection_truncated.shape

(18074, 6394)

Associate the "new" leaves to the ensembl of their former children

In [40]:
new_leaves = [x for x in truncated_graph.nodes() if truncated_graph.in_degree(x)==0]
len(new_leaves)

3749

In [41]:
%%time
for node in new_leaves:
    successors = list(nx.ancestors(original_graph, node))
    matrix_connection_truncated[node] = (matrix_connection_original.loc[:,matrix_connection_original.columns.isin(successors)].sum(axis=1)>=1).values.astype(int)
    if node in matrix_connection_original.columns:
        matrix_connection_truncated[node] += matrix_connection_original.loc[:,node] 

CPU times: user 14.8 s, sys: 8.97 s, total: 23.8 s
Wall time: 23.8 s


In [42]:
matrix_connection_truncated.shape

(18074, 8095)

In [43]:
list_go = matrix_connection_truncated.columns

In [44]:
matrix_connection_truncated.to_csv(os.path.join(dir_files,"matrix_connection_truncated.csv"))

#### Update go-level

In [9]:
%%time
df_go_level = pd.DataFrame(df_go_level.loc[map_int_go.values(),"root"])
degree_by_level_in,degree_by_level_out=list(),list()
for node in map_int_go.keys():
    degree_by_level_in.append(truncated_graph.in_degree(int(node)))
    degree_by_level_out.append(truncated_graph.out_degree(int(node)))
df_go_level["d+"]=degree_by_level_out
df_go_level["d-"]=degree_by_level_in

CPU times: user 106 ms, sys: 9.45 ms, total: 116 ms
Wall time: 114 ms


In [10]:
df_go_level.head()

Unnamed: 0,root,d+,d-
GO:0000002,6,1,1
GO:0000003,1,1,4
GO:0000012,8,1,3
GO:0000018,8,2,7
GO:0000019,9,2,2


In [11]:
df_go_level.groupby("root").size()

root
0        1
1       24
2      120
3      300
4      606
5     1077
6     1390
7     1489
8     1593
9     1454
10    1024
11     608
12     334
13     186
14      91
15      46
16      21
17       8
18       4
dtype: int64

In [13]:
df_go_level.to_csv(os.path.join(dir_files,"go_to_level_truncated.csv"))

#### Convert the names of the nodes into integer

In [51]:
map_go_int={go:idx for idx,go in enumerate(list_go)}
map_go_int

{'GO:0000002': 0,
 'GO:0000003': 1,
 'GO:0000012': 2,
 'GO:0000018': 3,
 'GO:0000019': 4,
 'GO:0000027': 5,
 'GO:0000028': 6,
 'GO:0000038': 7,
 'GO:0000045': 8,
 'GO:0000052': 9,
 'GO:0000054': 10,
 'GO:0000056': 11,
 'GO:0000070': 12,
 'GO:0000076': 13,
 'GO:0000077': 14,
 'GO:0000079': 15,
 'GO:0000082': 16,
 'GO:0000083': 17,
 'GO:0000086': 18,
 'GO:0000096': 19,
 'GO:0000097': 20,
 'GO:0000098': 21,
 'GO:0000117': 22,
 'GO:0000122': 23,
 'GO:0000154': 24,
 'GO:0000165': 25,
 'GO:0000183': 26,
 'GO:0000184': 27,
 'GO:0000209': 28,
 'GO:0000212': 29,
 'GO:0000226': 30,
 'GO:0000245': 31,
 'GO:0000255': 32,
 'GO:0000266': 33,
 'GO:0000272': 34,
 'GO:0000278': 35,
 'GO:0000281': 36,
 'GO:0000288': 37,
 'GO:0000289': 38,
 'GO:0000290': 39,
 'GO:0000291': 40,
 'GO:0000296': 41,
 'GO:0000302': 42,
 'GO:0000303': 43,
 'GO:0000305': 44,
 'GO:0000375': 45,
 'GO:0000380': 46,
 'GO:0000387': 47,
 'GO:0000398': 48,
 'GO:0000414': 49,
 'GO:0000416': 50,
 'GO:0000422': 51,
 'GO:0000423': 52,
 'G

In [52]:
list_nodes_graph=np.array(truncated_graph.nodes)
start_idx=len(map_go_int)
for idx,go in enumerate(list_nodes_graph[np.isin(list_nodes_graph,list_go,invert=True)]):
    map_go_int[go]=idx+start_idx

In [53]:
len(map_go_int)

10376

In [54]:
map_int_go={idx:go for go,idx in map_go_int.items()}
map_int_go

{0: 'GO:0000002',
 1: 'GO:0000003',
 2: 'GO:0000012',
 3: 'GO:0000018',
 4: 'GO:0000019',
 5: 'GO:0000027',
 6: 'GO:0000028',
 7: 'GO:0000038',
 8: 'GO:0000045',
 9: 'GO:0000052',
 10: 'GO:0000054',
 11: 'GO:0000056',
 12: 'GO:0000070',
 13: 'GO:0000076',
 14: 'GO:0000077',
 15: 'GO:0000079',
 16: 'GO:0000082',
 17: 'GO:0000083',
 18: 'GO:0000086',
 19: 'GO:0000096',
 20: 'GO:0000097',
 21: 'GO:0000098',
 22: 'GO:0000117',
 23: 'GO:0000122',
 24: 'GO:0000154',
 25: 'GO:0000165',
 26: 'GO:0000183',
 27: 'GO:0000184',
 28: 'GO:0000209',
 29: 'GO:0000212',
 30: 'GO:0000226',
 31: 'GO:0000245',
 32: 'GO:0000255',
 33: 'GO:0000266',
 34: 'GO:0000272',
 35: 'GO:0000278',
 36: 'GO:0000281',
 37: 'GO:0000288',
 38: 'GO:0000289',
 39: 'GO:0000290',
 40: 'GO:0000291',
 41: 'GO:0000296',
 42: 'GO:0000302',
 43: 'GO:0000303',
 44: 'GO:0000305',
 45: 'GO:0000375',
 46: 'GO:0000380',
 47: 'GO:0000387',
 48: 'GO:0000398',
 49: 'GO:0000414',
 50: 'GO:0000416',
 51: 'GO:0000422',
 52: 'GO:0000423',
 53

In [55]:
with open(os.path.join(dir_files,'map_int_go_truncated.txt'), 'w') as f:
    json.dump(map_int_go, f, indent=2)  

In [56]:
truncated_graph = nx.relabel_nodes(truncated_graph,map_go_int)

In [60]:
nx.write_gpickle(truncated_graph,os.path.join(dir_files,"gobp-truncated-converted"))