# Generating datasets for gene essentiality prediction experiments

In [1]:
import modelo as m
import pandas as pd
import Bio.KEGG.REST as rkg
import Bio.KEGG.Enzyme as ex
import networkx as nx
import numpy as np

### E. coli

In [2]:
# Build the undirected E. coli network and map each "eco:" gene node to the
# first NAME entry of its KEGG record.
eco = m.get_network("eco", opt="eco")
eco.grafo = eco.grafo.to_undirected()
eco_nodes = list(eco.grafo.nodes())
eco_nodes.remove('undefined')
orf_queries = dict()

for node in eco_nodes:
    # Only "eco:" nodes are ever stored, so filter before the KEGG
    # round-trip instead of fetching records that are then thrown away.
    if not node.startswith("eco:"):
        continue
    gene = rkg.kegg_get(node)
    tmp = ex.read(gene)
    # A record without a NAME field has an empty list here; indexing it would
    # raise IndexError. The M. genitalium and P. aeruginosa cells already
    # guard against this, so do the same for E. coli.
    if tmp.name != []:
        orf_queries[node] = tmp.name[0]

Retrieving data from KEGG PATHWAY database. Sat Apr 28 15:13:05 2018
Building graphs.


In [3]:
# Per-node topological features of the E. coli network.
# Every result below is indexed by node id.
graph = eco.grafo

# --- Centrality measures ---
close_cent = nx.closeness_centrality(graph)
bet_cent = nx.betweenness_centrality(graph)
eigen_cent = nx.eigenvector_centrality(graph)
load_cent = nx.load_centrality(graph)
deg_cent = nx.degree_centrality(graph)
lreach_cent = {
    node: nx.local_reaching_centrality(graph, node)
    for node in graph.nodes()
}

# Harmonic centrality, scaled by (number of nodes - 1).
n = len(graph) - 1
harmonic_raw = nx.harmonic_centrality(graph)
harm_cent = {node: (score / n) for node, score in harmonic_raw.items()}

# Subgraph centrality, scaled by the Estrada index of the graph.
n = nx.estrada_index(graph)
subgraph_raw = nx.subgraph_centrality(graph)
sub_cent = {node: (score / n) for node, score in subgraph_raw.items()}

# --- Other features ---
cco = nx.clustering(graph)
pagerank = nx.pagerank(graph)
hits = nx.hits(graph)  # tuple; element 0 is what the feature rows read later
# Size of a maximal independent set forced to contain each node (normalized
# later). NOTE: networkx builds this set at random, so the sizes may differ
# from one run to the next.
ind_set = {
    node: len(nx.maximal_independent_set(graph, [node]))
    for node in graph.nodes()
}
damage = m.damage_lemke(graph)  # normalized later

In [4]:
# Reverse lookup: KEGG name string -> node id.
nodes_eco = {name: node for node, name in orf_queries.items()}
# Essentiality table generated in a previous step (another notebook).
table = pd.read_table("data/EcoliNetData.dat")
# Drop every node that has no entry in orf_queries. The feature dicts were
# computed before this pruning and are not recomputed here.
node2rem = [node for node in eco.grafo.nodes() if orf_queries.get(node) is None]
eco.grafo.remove_nodes_from(node2rem)

In [5]:
# One row per table entry that maps to a network node:
# [node id, 13 features in column order, class label].
# Table rows with no matching node are skipped, and nodes that never appear
# in the table (unknown essentiality) never produce a row.
feature_maps = (
    eigen_cent, close_cent, bet_cent, load_cent, harm_cent, lreach_cent,
    sub_cent, deg_cent, cco, pagerank, hits[0], ind_set, damage,
)
aux = []
for product, label in zip(table["Feature/Product"], table["Class(1:essential 2:noessential)"]):
    node = nodes_eco.get(product)
    if node is None:
        continue
    aux.append([node] + [feature[node] for feature in feature_maps] + [label])

In [7]:
# Assemble the E. coli feature table and write it to disk.
column_names = [
    "ec",
    "eigenvector_centrality",
    "closeness_centrality",
    "betweenness_centrality",
    "load_centrality",
    "harmonic_centrality",
    "local_reach_centrality",
    "subgraph_centrality",
    "degree_centrality",
    "clustering_coefficient",
    "pagerank",
    "hits",
    "ind_set_maximal",
    "damage",
    "class",
]
data = pd.DataFrame(aux, columns=column_names)

# Scale the two features that were left unnormalized by their column maximum.
for column in ("ind_set_maximal", "damage"):
    data[column] = data[column] / max(data[column])

# Isolates have degree centrality 0; drop them (after normalization, so they
# still take part in the maxima above).
has_neighbours = data['degree_centrality'] != 0.0
data = data[has_neighbours]
data.to_csv('data/ecoli.csv')

### M. genitalium

In [8]:
# Build the undirected M. genitalium network and map each "mge:" gene node to
# the first NAME entry of its KEGG record.
mge = m.get_network("mge", opt="mge")
mge.grafo = mge.grafo.to_undirected()
mge_nodes = list(mge.grafo.nodes())
mge_nodes.remove('undefined')
orf_queries = dict()

for node in mge_nodes:
    # Only "mge:" nodes are ever stored, so filter before the KEGG
    # round-trip instead of fetching records that are then thrown away.
    if not node.startswith("mge:"):
        continue
    gene = rkg.kegg_get(node)
    tmp = ex.read(gene)
    # Records without a NAME field have an empty list here; skip them.
    if tmp.name != []:
        orf_queries[node] = tmp.name[0]

Retrieving data from KEGG PATHWAY database. Sat Apr 28 15:41:01 2018
Building graphs.


In [9]:
# Per-node topological features of the M. genitalium network.
# Every result below is indexed by node id.
graph = mge.grafo

# --- Centrality measures ---
close_cent = nx.closeness_centrality(graph)
bet_cent = nx.betweenness_centrality(graph)
eigen_cent = nx.eigenvector_centrality(graph)
load_cent = nx.load_centrality(graph)
deg_cent = nx.degree_centrality(graph)
lreach_cent = {
    node: nx.local_reaching_centrality(graph, node)
    for node in graph.nodes()
}

# Harmonic centrality, scaled by (number of nodes - 1).
n = len(graph) - 1
harmonic_raw = nx.harmonic_centrality(graph)
harm_cent = {node: (score / n) for node, score in harmonic_raw.items()}

# Subgraph centrality, scaled by the Estrada index of the graph.
n = nx.estrada_index(graph)
subgraph_raw = nx.subgraph_centrality(graph)
sub_cent = {node: (score / n) for node, score in subgraph_raw.items()}

# --- Other features ---
cco = nx.clustering(graph)
pagerank = nx.pagerank(graph)
hits = nx.hits(graph)  # tuple; element 0 is what the feature rows read later
# Size of a maximal independent set forced to contain each node (normalized
# later). NOTE: networkx builds this set at random, so the sizes may differ
# from one run to the next.
ind_set = {
    node: len(nx.maximal_independent_set(graph, [node]))
    for node in graph.nodes()
}
damage = m.damage_lemke(graph)  # normalized later

In [10]:
# Reverse lookup: KEGG name string -> node id.
nodes_mge = {name: node for node, name in orf_queries.items()}
# Essentiality table generated in a previous step (another notebook).
table = pd.read_table("data/MgenitaliumNetData.dat")
# Drop every node that has no entry in orf_queries. The feature dicts were
# computed before this pruning and are not recomputed here.
node2rem = [node for node in mge.grafo.nodes() if orf_queries.get(node) is None]
mge.grafo.remove_nodes_from(node2rem)

In [12]:
# One row per table entry that maps to a network node:
# [node id, 13 features in column order, class label].
# Table rows with no matching node are skipped, and nodes that never appear
# in the table (unknown essentiality) never produce a row.
feature_maps = (
    eigen_cent, close_cent, bet_cent, load_cent, harm_cent, lreach_cent,
    sub_cent, deg_cent, cco, pagerank, hits[0], ind_set, damage,
)
aux = []
for product, label in zip(table["Feature/Product"], table["Class(1:essential 2:noessential)"]):
    node = nodes_mge.get(product)
    if node is None:
        continue
    aux.append([node] + [feature[node] for feature in feature_maps] + [label])

In [13]:
# Assemble the M. genitalium feature table and write it to disk.
column_names = [
    "ec",
    "eigenvector_centrality",
    "closeness_centrality",
    "betweenness_centrality",
    "load_centrality",
    "harmonic_centrality",
    "local_reach_centrality",
    "subgraph_centrality",
    "degree_centrality",
    "clustering_coefficient",
    "pagerank",
    "hits",
    "ind_set_maximal",
    "damage",
    "class",
]
data = pd.DataFrame(aux, columns=column_names)

# Scale the two features that were left unnormalized by their column maximum.
for column in ("ind_set_maximal", "damage"):
    data[column] = data[column] / max(data[column])

# Isolates have degree centrality 0; drop them (after normalization, so they
# still take part in the maxima above).
has_neighbours = data['degree_centrality'] != 0.0
data = data[has_neighbours]
data.to_csv('data/mgenitalium.csv')

### P. aeruginosa

In [14]:
# Build the undirected P. aeruginosa network and map each "pae:" gene node to
# the first NAME entry of its KEGG record.
pae = m.get_network("pae", opt="pae")
pae.grafo = pae.grafo.to_undirected()
pae_nodes = list(pae.grafo.nodes())
pae_nodes.remove('undefined')
orf_queries = dict()

for node in pae_nodes:
    # Only "pae:" nodes are ever stored, so filter before the KEGG
    # round-trip instead of fetching records that are then thrown away.
    if not node.startswith("pae:"):
        continue
    gene = rkg.kegg_get(node)
    tmp = ex.read(gene)
    # Records without a NAME field have an empty list here; skip them.
    if tmp.name != []:
        orf_queries[node] = tmp.name[0]

Retrieving data from KEGG PATHWAY database. Sat Apr 28 15:48:00 2018
Building graphs.


In [15]:
# Per-node topological features of the P. aeruginosa network.
# Every result below is indexed by node id.
graph = pae.grafo

# --- Centrality measures ---
close_cent = nx.closeness_centrality(graph)
bet_cent = nx.betweenness_centrality(graph)
eigen_cent = nx.eigenvector_centrality(graph)
load_cent = nx.load_centrality(graph)
deg_cent = nx.degree_centrality(graph)
lreach_cent = {
    node: nx.local_reaching_centrality(graph, node)
    for node in graph.nodes()
}

# Harmonic centrality, scaled by (number of nodes - 1).
n = len(graph) - 1
harmonic_raw = nx.harmonic_centrality(graph)
harm_cent = {node: (score / n) for node, score in harmonic_raw.items()}

# Subgraph centrality, scaled by the Estrada index of the graph.
n = nx.estrada_index(graph)
subgraph_raw = nx.subgraph_centrality(graph)
sub_cent = {node: (score / n) for node, score in subgraph_raw.items()}

# --- Other features ---
cco = nx.clustering(graph)
pagerank = nx.pagerank(graph)
hits = nx.hits(graph)  # tuple; element 0 is what the feature rows read later
# Size of a maximal independent set forced to contain each node (normalized
# later). NOTE: networkx builds this set at random, so the sizes may differ
# from one run to the next.
ind_set = {
    node: len(nx.maximal_independent_set(graph, [node]))
    for node in graph.nodes()
}
damage = m.damage_lemke(graph)  # normalized later

In [16]:
# Reverse lookup: KEGG name string -> node id.
nodes_pae = {name: node for node, name in orf_queries.items()}
# Essentiality table generated in a previous step (another notebook).
table = pd.read_table("data/PaeruginosaNetData.dat")
# Drop every node that has no entry in orf_queries. The feature dicts were
# computed before this pruning and are not recomputed here.
node2rem = [node for node in pae.grafo.nodes() if orf_queries.get(node) is None]
pae.grafo.remove_nodes_from(node2rem)

In [17]:
# One row per table entry that maps to a network node:
# [node id, 13 features in column order, class label].
# Table rows with no matching node are skipped, and nodes that never appear
# in the table (unknown essentiality) never produce a row.
feature_maps = (
    eigen_cent, close_cent, bet_cent, load_cent, harm_cent, lreach_cent,
    sub_cent, deg_cent, cco, pagerank, hits[0], ind_set, damage,
)
aux = []
for product, label in zip(table["Feature/Product"], table["Class(1:essential 2:noessential)"]):
    node = nodes_pae.get(product)
    if node is None:
        continue
    aux.append([node] + [feature[node] for feature in feature_maps] + [label])

In [18]:
# Assemble the P. aeruginosa feature table and write it to disk.
column_names = [
    "ec",
    "eigenvector_centrality",
    "closeness_centrality",
    "betweenness_centrality",
    "load_centrality",
    "harmonic_centrality",
    "local_reach_centrality",
    "subgraph_centrality",
    "degree_centrality",
    "clustering_coefficient",
    "pagerank",
    "hits",
    "ind_set_maximal",
    "damage",
    "class",
]
data = pd.DataFrame(aux, columns=column_names)

# Scale the two features that were left unnormalized by their column maximum.
for column in ("ind_set_maximal", "damage"):
    data[column] = data[column] / max(data[column])

# Isolates have degree centrality 0; drop them (after normalization, so they
# still take part in the maxima above).
has_neighbours = data['degree_centrality'] != 0.0
data = data[has_neighbours]
data.to_csv('data/paeruginosa.csv')

### S. cerevisiae

In [22]:
# Build the undirected S. cerevisiae network and map each "sce:" gene node to
# the part of its id after the colon.
sce = m.get_network("sce", opt="sce")
sce.grafo = sce.grafo.to_undirected()
sce_nodes = list(sce.grafo.nodes())
sce_nodes.remove('undefined')

# The mapping depends only on the node id, so no KEGG request is issued here:
# the record that used to be fetched and parsed for every node was never read.
orf_queries = {
    node: node.split(':')[1]
    for node in sce_nodes
    if node.startswith("sce:")
}

Retrieving data from KEGG PATHWAY database. Sat Apr 28 16:36:50 2018
Building graphs.


In [23]:
# Per-node topological features of the S. cerevisiae network.
# Every result below is indexed by node id.
graph = sce.grafo

# --- Centrality measures ---
close_cent = nx.closeness_centrality(graph)
bet_cent = nx.betweenness_centrality(graph)
eigen_cent = nx.eigenvector_centrality(graph)
load_cent = nx.load_centrality(graph)
deg_cent = nx.degree_centrality(graph)
lreach_cent = {
    node: nx.local_reaching_centrality(graph, node)
    for node in graph.nodes()
}

# Harmonic centrality, scaled by (number of nodes - 1).
n = len(graph) - 1
harmonic_raw = nx.harmonic_centrality(graph)
harm_cent = {node: (score / n) for node, score in harmonic_raw.items()}

# Subgraph centrality, scaled by the Estrada index of the graph.
n = nx.estrada_index(graph)
subgraph_raw = nx.subgraph_centrality(graph)
sub_cent = {node: (score / n) for node, score in subgraph_raw.items()}

# --- Other features ---
cco = nx.clustering(graph)
pagerank = nx.pagerank(graph)
hits = nx.hits(graph)  # tuple; element 0 is what the feature rows read later
# Size of a maximal independent set forced to contain each node (normalized
# later). NOTE: networkx builds this set at random, so the sizes may differ
# from one run to the next.
ind_set = {
    node: len(nx.maximal_independent_set(graph, [node]))
    for node in graph.nodes()
}
damage = m.damage_lemke(graph)  # normalized later

In [24]:
# Reverse lookup: ORF identifier -> node id.
nodes_sce = {name: node for node, name in orf_queries.items()}
# Essentiality table generated in a previous step (another notebook).
table = pd.read_table("data/ScerevisiaeNetData.dat")
# Drop every node that has no entry in orf_queries. The feature dicts were
# computed before this pruning and are not recomputed here.
node2rem = [node for node in sce.grafo.nodes() if orf_queries.get(node) is None]
sce.grafo.remove_nodes_from(node2rem)

In [25]:
# One row per table entry that maps to a network node:
# [node id, 13 features in column order, class label].
# Table rows with no matching node are skipped, and nodes that never appear
# in the table (unknown essentiality) never produce a row.
feature_maps = (
    eigen_cent, close_cent, bet_cent, load_cent, harm_cent, lreach_cent,
    sub_cent, deg_cent, cco, pagerank, hits[0], ind_set, damage,
)
aux = []
for product, label in zip(table["Feature/Product"], table["Class(1:essential 2:noessential)"]):
    node = nodes_sce.get(product)
    if node is None:
        continue
    aux.append([node] + [feature[node] for feature in feature_maps] + [label])

In [26]:
# Assemble the S. cerevisiae feature table and write it to disk.
column_names = [
    "ec",
    "eigenvector_centrality",
    "closeness_centrality",
    "betweenness_centrality",
    "load_centrality",
    "harmonic_centrality",
    "local_reach_centrality",
    "subgraph_centrality",
    "degree_centrality",
    "clustering_coefficient",
    "pagerank",
    "hits",
    "ind_set_maximal",
    "damage",
    "class",
]
data = pd.DataFrame(aux, columns=column_names)

# Scale the two features that were left unnormalized by their column maximum.
for column in ("ind_set_maximal", "damage"):
    data[column] = data[column] / max(data[column])

# Isolates have degree centrality 0; drop them (after normalization, so they
# still take part in the maxima above).
has_neighbours = data['degree_centrality'] != 0.0
data = data[has_neighbours]
data.to_csv('data/scerevisiae.csv')