In [1]:
%matplotlib inline

import sys; sys.path.insert(0, "..")

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

import collections
from math import log

# import time
import pickle

# from random import sample
from communityDetection.orcci import *
from utils.pobsnet_sinfo import *

from sklearn.metrics import normalized_mutual_info_score

In [2]:
# Verbosity level name; not referenced by the cells visible in this notebook,
# which pass their own `verbose=` value to orcci explicitly.
verbose = "TRACE"

# Column names holding the two endpoints of each interaction.
gene1_label = "IntA"
gene2_label = "IntB"

# Read the interaction table and build a graph with one edge per row.
filename = "../data/PhIMain/PhIMain.xlsx"
df = pd.read_excel(filename)

g = nx.from_pandas_edgelist(df, source=gene1_label, target=gene2_label)
g.name = 'PhIMain'

In [3]:
# Summary of the graph as loaded, before any clean-up.
print(nx.info(g))
print("Number of connected components:", nx.number_connected_components(g), "\n")

# Materialise the self-loop edges once, then remove them from the graph.
# edge_selfloops keeps a single endpoint per loop (both endpoints are the same node).
selfloop_pairs = list(nx.selfloop_edges(g))
edge_selfloops = [u for u, v in selfloop_pairs]
g.remove_edges_from(selfloop_pairs)

print("<<< Without self loops >>>")
print(nx.info(g))
print("Number of self loops:", len(edge_selfloops))

# (node, degree) pairs ordered from the highest degree to the lowest
degree_list = sorted(g.degree, key=lambda node_deg: node_deg[1], reverse=True)
print("Top 5 highest degree nodes:", degree_list[:5])

Name: PhIMain
Type: Graph
Number of nodes: 273
Number of edges: 529
Average degree:   3.8755
Number of connected components: 17 

<<< Without self loops >>>
Name: PhIMain
Type: Graph
Number of nodes: 273
Number of edges: 495
Average degree:   3.6264
Number of self loops: 34
Top 5 highest degree nodes: [('AT1G35560', 43), ('AT1G31880', 26), ('AT2G01760', 22), ('AT4G37260', 19), ('AT4G32570', 17)]


### Side Information (Global)

In [4]:
filename = "../data/PhIMain/Phi_Main_ppi_all_sideinfo_KEGG_edit.xlsx"

# Load the annotation sheet and keep only the gene / side-info columns.
df_sinfo = pd.read_excel(filename, header=0)
df_sinfo = df_sinfo[['Label', 'SideInfo_Label']]

# Keep only the rows whose side-info label is a string (drops non-text cells).
has_text_label = [isinstance(label, str) for label in df_sinfo['SideInfo_Label']]
df_sinfo = df_sinfo[has_text_label]

# Position 0 is reserved for 'unknown'; the known pathways follow in sorted order,
# so a pathway's position in this list serves as its numeric ID.
pathway_list = ['unknown'] + sorted(set(df_sinfo['SideInfo_Label']))

print("Amount of SideInfo: %d" % len(df_sinfo))
print("Number of known pathways: %d" % (len(pathway_list)-1))
print("Pathways:", pathway_list[1:])

Amount of SideInfo: 171
Number of known pathways: 15
Pathways: ['abscisic acid', 'auxin', 'brassinosteroid', 'circadian rhythm', 'cytokinin', 'dna repair', 'dna rna metabolism', 'ethylene', 'fatty acid metabolism', 'gibberellin', 'jasmonic acid', 'plant pathogen interaction', 'protein processing', 'salicylic acid', 'signal transduction']


In [5]:
# Start with every node marked as 'unknown' (ID 0) ...
nx.set_node_attributes(g, 'unknown', "pathway")
nx.set_node_attributes(g, 0, "pathwayID")

# ... then overwrite the attributes of each gene listed in the side-info table.
for gene, sinfo_label in zip(df_sinfo['Label'], df_sinfo['SideInfo_Label']):
    path_id = pathway_list.index(sinfo_label)  # position in pathway_list is the ID
    node_attrs = g.nodes[gene]
    node_attrs['pathway'] = sinfo_label
    node_attrs['pathwayID'] = path_id

In [6]:
# Group the annotated genes by the pathway stored on their graph node.
known_partition = dict()

nodes_known = list(df_sinfo['Label'])
for gene in nodes_known:
    # insert_to_dict_list is a project helper; presumably it appends `gene` to the
    # list stored under the pathway key, creating the list on first use -- confirm.
    insert_to_dict_list(known_partition, g.nodes[gene]['pathway'], gene)

print("Side Info Summary:")
print("Total side information: %d" % len(nodes_known))
for pathway_name, members in known_partition.items():
    print(pathway_name, len(members))

# Container for edges between genes of the same known pathway; filled later on.
edges_known = []

Side Info Summary:
Total side information: 171
auxin 40
abscisic acid 32
brassinosteroid 13
gibberellin 14
jasmonic acid 14
cytokinin 7
dna repair 2
salicylic acid 18
ethylene 24
protein processing 2
signal transduction 1
fatty acid metabolism 1
dna rna metabolism 1
plant pathogen interaction 1
circadian rhythm 1


In [7]:
# Collect every edge already present in the graph whose two endpoints are
# annotated with the same pathway.
#
# The list is (re)created here so that re-running this cell yields the same
# result instead of appending a second copy of every edge.
edges_known = []

for pathway_type in known_partition:
    pathway_gene_list = known_partition[pathway_type]
    # Pair each gene with the genes that come after it, so every unordered
    # pair within the pathway is tested exactly once.
    for idx1, gene1 in enumerate(pathway_gene_list):
        for gene2 in pathway_gene_list[idx1 + 1:]:
            if g.has_edge(gene1, gene2):
                edges_known.append((gene1, gene2))

print("Known edges:")
print("Number of known edges:", len(edges_known))

Known edges:
Number of known edges: 96


In [8]:
print(nx.info(g))

# Show the first six nodes together with their attribute dictionaries.
for idx, node in enumerate(list(g.nodes())[:6]):
    print(idx, node, g.nodes[node])

# Show the first six edges together with their attribute dictionaries.
for idx, edge in enumerate(list(g.edges())[:6]):
    print(idx, edge, g.edges[edge])

Name: PhIMain
Type: Graph
Number of nodes: 273
Number of edges: 495
Average degree:   3.6264
0 AT1G01030 {'pathway': 'auxin', 'pathwayID': 2}
1 AT3G17600 {'pathway': 'auxin', 'pathwayID': 2}
2 AT1G01140 {'pathway': 'unknown', 'pathwayID': 0}
3 AT2G23290 {'pathway': 'unknown', 'pathwayID': 0}
4 AT1G01360 {'pathway': 'abscisic acid', 'pathwayID': 1}
5 AT4G26080 {'pathway': 'unknown', 'pathwayID': 0}
0 ('AT1G01030', 'AT3G17600') {}
1 ('AT3G17600', 'AT2G01760') {}
2 ('AT3G17600', 'AT2G28350') {}
3 ('AT3G17600', 'AT2G38490') {}
4 ('AT3G17600', 'AT3G15540') {}
5 ('AT3G17600', 'AT4G30080') {}


In [9]:
# ORCCI for NO side info and complete observation

# g_orc_all, partitions_list = orcci(g, verbose="INFO")

# filename_orc = "../results/PhI_files/PhI_Main/PhiMain_orc_all.gml"
# nx.write_gml(g_orc_all.G,filename_orc,stringizer=str)

### Community Detection

In [10]:
# Destination of the annotated graph produced by this run.
f_out = "../results/PhI_files/PhIMain/PhIMain.gml"

# ORCCI without side information, on the complete observed graph.
# verbose="ERROR" is used here; "INFO" is the alternative noted by the author.
orc_main, partitions_list = orcci(
    g,
    block_label="blockRicci",
    verbose="ERROR",
)

print(f_out)
nx.write_gml(orc_main.G, f_out, stringizer=str)

Using number of cpu threads: = 16


FINAL RESULT
Elapsed time: 32.1517 (s)
Number of iterations: 244
Number of communities detected: 75
Size of largest community: 16
List of component sizes (final): [3 9 9 16 5 2 7 4 6 3 3 4 4 4 7 6 8 3 2 6 1 6 7 3 5 7 4 3 3 4 4 3 2 2 2 2 1 1 5 3 2 8 2 2 3 3 3 9 3 3 6 3 2 2 2 1 2 2 2 2 2 2 3 3 2 2 1 2 2 2 3 2 2 2 2]
MAXIMUM MODULARITY PARTITION RESULT
Modularity: 0.5910
Number of communities detected: 31
List of component sizes (max mod): [35 44 19 16 42 26 16 8 1 6 5 12 3 1 1 9 2 2 2 1 2 2 2 3 2 1 2 2 2 2 2]

../results/PhI_files/PhIMain/PhIMain.gml


In [11]:
# Graph held by the baseline ORCCI result.
g_orc = orc_main.G

print(nx.info(g_orc))

# Inspect the attributes of the first six nodes.
for idx, node in enumerate(list(g_orc.nodes())[:6]):
    print(idx, node, g_orc.nodes[node])

Name: PhIMain
Type: Graph
Number of nodes: 273
Number of edges: 495
Average degree:   3.6264
0 AT1G01030 {'pathway': 'auxin', 'pathwayID': 2, 'known': 0, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0}
1 AT3G17600 {'pathway': 'auxin', 'pathwayID': 2, 'known': 0, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0}
2 AT1G01140 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1}
3 AT2G23290 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1}
4 AT1G01360 {'pathway': 'abscisic acid', 'pathwayID': 1, 'known': 0, 'blockRicciMaxMod': 2, 'blockRicciFinal': 2}
5 AT4G26080 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 2, 'blockRicciFinal': 2}


#### Side Information

In [12]:
g_orc = orc_main.G

# Nodes whose 'pathway' attribute is anything other than 'unknown',
# i.e. the nodes for which side information is available.
nodes_known = [
    node
    for node in g_orc
    if g_orc.nodes[node]['pathway'] != 'unknown'
]

In [13]:
# NOTE: f_in is only echoed for reference -- nothing is read from this path;
# the run below operates on the in-memory graph g_orc.
f_in = "../results/PhI_files/PhIMain/PhIMain.gml"
print(f_in)

# ORCCI with side information: the annotated nodes are passed as known, with
# the node attribute "pathwayID" given as their side-info label.
orc_sinfo, partitions_list = orcci(
    g_orc,
    nodes_known=nodes_known,
    block_label="blockRicciSinfo",
    sinfo_label="pathwayID",
    verbose="ERROR",
)

../results/PhI_files/PhIMain/PhIMain.gml
Using number of cpu threads: = 16


FINAL RESULT
Elapsed time: 21.3278 (s)
Number of iterations: 157
Number of communities detected: 76
Size of largest community: 25
List of component sizes (final): [25 10 19 1 1 10 5 15 3 4 4 4 5 10 2 6 2 1 5 5 3 5 8 10 1 6 2 6 2 2 11 1 1 1 1 1 4 4 2 7 6 3 2 2 2 2 1 1 2 2 2 1 1 1 3 1 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 2 1 1 2]
MAXIMUM MODULARITY PARTITION RESULT
Modularity: 0.5249
Number of communities detected: 61
List of component sizes (max mod): [41 10 19 1 1 27 15 13 30 4 5 2 2 1 5 5 1 6 6 2 11 1 1 1 1 1 7 6 3 2 2 2 1 1 2 2 2 1 1 1 3 1 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 2 1 1]



In [14]:
# Graph held by the side-information ORCCI result.
g_orc = orc_sinfo.G

print(nx.info(g_orc))

# Inspect the attributes of the first six nodes.
for idx, node in enumerate(list(g_orc.nodes())[:6]):
    print(idx, node, g_orc.nodes[node])

Name: PhIMain
Type: Graph
Number of nodes: 273
Number of edges: 495
Average degree:   3.6264
0 AT1G01030 {'pathway': 'auxin', 'pathwayID': 2, 'known': 1, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0, 'blockRicciSinfoMaxMod': 0, 'blockRicciSinfoFinal': 0}
1 AT3G17600 {'pathway': 'auxin', 'pathwayID': 2, 'known': 1, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0, 'blockRicciSinfoMaxMod': 0, 'blockRicciSinfoFinal': 0}
2 AT1G01140 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1, 'blockRicciSinfoMaxMod': 1, 'blockRicciSinfoFinal': 1}
3 AT2G23290 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1, 'blockRicciSinfoMaxMod': 1, 'blockRicciSinfoFinal': 1}
4 AT1G01360 {'pathway': 'abscisic acid', 'pathwayID': 1, 'known': 1, 'blockRicciMaxMod': 2, 'blockRicciFinal': 2, 'blockRicciSinfoMaxMod': 2, 'blockRicciSinfoFinal': 2}
5 AT4G26080 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 2, 'blo

In [15]:
# Write the graph of the side-information run to its own GML file;
# attribute values are serialised with str().
f_out = "../results/PhI_files/PhIMain/PhIMain_sinfo.gml"
print(f_out)

nx.write_gml(orc_sinfo.G, f_out, stringizer=str)

../results/PhI_files/PhIMain/PhIMain_sinfo.gml
