In [1]:
%matplotlib inline

import sys; sys.path.insert(0, "..")

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

import collections
from math import log

# import time
import pickle

# from random import sample
from communityDetection.orcci import *
from utils.pobsnet_sinfo import *

from sklearn.metrics import normalized_mutual_info_score

In [2]:
# Log-level name. NOTE(review): the orcci calls visible in this notebook pass
# their own literal verbosity, so this variable looks unused here -- confirm.
verbose = "TRACE"

# Columns of the BioGRID sheet that hold the two interacting gene identifiers.
gene1_label = "Systematic Name Interactor A"
gene2_label = "Systematic Name Interactor B"

# Read the interaction table (one row per reported interaction).
filename = "../data/AI-1Main/BIOGRID_Braun2011_21798944.xlsx"
df = pd.read_excel(filename)

# Build an undirected graph with one edge per (interactor A, interactor B) row.
g = nx.from_pandas_edgelist(
    df,
    source=gene1_label,
    target=gene2_label,
)
g.name = 'PhIBraun'

In [3]:
# Summary of the raw graph.
# NOTE(review): nx.info is not available in NetworkX >= 3.0 -- confirm the pinned version.
print(nx.info(g))
print("Number of connected components:", nx.number_connected_components(g), "\n")

# Materialize the self-loop edges once, then reuse the list both for the
# count (first endpoint of every loop) and for the removal.
selfloop_pairs = list(nx.selfloop_edges(g))
edge_selfloops = [pair[0] for pair in selfloop_pairs]
g.remove_edges_from(selfloop_pairs)

print("<<< Without self loops >>>")
print(nx.info(g))
print("Number of self loops:", len(edge_selfloops))

# (node, degree) pairs ordered from the highest degree down.
degree_list = sorted(g.degree, key=lambda node_degree: node_degree[1], reverse=True)
print("Top 5 highest degree nodes:", degree_list[:5])

Name: PhIBraun
Type: Graph
Number of nodes: 2657
Number of edges: 5641
Average degree:   4.2461
Number of connected components: 126 

<<< Without self loops >>>
Name: PhIBraun
Type: Graph
Number of nodes: 2657
Number of edges: 5506
Average degree:   4.1445
Number of self loops: 135
Top 5 highest degree nodes: [('AT5G22290', 222), ('AT4G19030', 181), ('AT4G35580', 160), ('AT1G22920', 134), ('AT1G27300', 112)]


### Side Information (Global)

In [6]:
filename = "../data/AI-1Main/ppi_braun_SideInfo_OtherPath.edit.csv"

# Keep only the gene identifier and its raw side-information annotation.
df_sinfo = pd.read_csv(filename)[['Label', 'SideInfo_Label']]

print(len(df_sinfo))

# --- Normalize the annotation vocabulary -----------------------------------
# Every substitution below is applied one at a time, in the listed order, to
# the whole frame -- exactly one DataFrame.replace call per entry.

# Cells matching the pattern 'Unknown' are blanked out (regex replacement).
df_sinfo = df_sinfo.replace({'Unknown': np.nan}, regex=True)

# Annotations discarded outright (exact, non-regex matches become NaN).
sinfo_drop_labels = [
    # groups ignored for the phytohormone side information
    'Abiotic_biotic_stimulus',
    'Transcription factor',
    'Transcription_factor',
    'Transcription_factor,Stress',
    'transcription factor',
    'Stress',
    'Transporter',
    'transporter',
    # ambiguous / multi-pathway annotations
    'Multi',
    'multi',
    'gibberellin,brassinosteroid',
    'jasmonic acid,salicylic acid',
    # not in the list (not considered for side information)
    'AT hook motif ',
    'photosynthesis',
]
for raw_label in sinfo_drop_labels:
    df_sinfo = df_sinfo.replace({raw_label: np.nan}, regex=False)

# Raw annotation -> higher-level category (exact, non-regex matches).
sinfo_rename_map = {
    # spelling / capitalisation variants of the hormone names
    'Abscisic_acid': 'abscisic acid',
    'abscisic acid ': 'abscisic acid',
    'Brassinosteroid': 'brassinosteroid',
    'Cytokinin': 'cytokinin',
    'Ethylene': 'ethylene',
    'Gibberellin': 'gibberellin',
    'gibberellic acid': 'gibberellin',
    'Jasmonic_acid': 'jasmonic acid',
    # DOUBLE CHECK TO INCLUDE OR NOT: combined annotations collapsed to the hormone
    'auxin,Circadian rhythm': 'auxin',
    'abscisic acid,transporter': 'abscisic acid',
    'ethylene,transcription factor': 'ethylene',
    'DNA repair,abscisic acid': 'abscisic acid',
    # other pathways
    'Circadian rhythm': 'circadian rhythm',
    'Citrate cycle,Pyruvate metabolism,Glycolysis/Gluconeogenesis': 'metabolism',
    'Cysteine and methionine metabolism': 'amino acid metabolism',
    'DNA repair': 'dna repair replicate',
    'DNA replication': 'dna repair replicate',
    'Endocytosis': 'protein processing',
    # NOTE(review): a sugar-metabolism label mapped to 'dna rna metabolism' -- confirm this is intended.
    'Fructose and mannose metabolism': 'dna rna metabolism',
    'MAPK': 'signal transduction',
    'Mismatch repair,DNA replication,Nucleotide excision repair': 'dna repair replicate',
    'Phosphatidylinositol signaling system': 'signal transduction',
    'Protein processing': 'protein processing',
    'Protein processing in endoplasmic reticulum': 'protein processing',
    'Protein processing in endoplasmic reticulum,Ubiquitin mediated proteolysis': 'protein processing',
    'RNA degradation': 'rna processing',
    'RNA polymerase': 'rna processing',
    'RNA processing': 'rna processing',
    'RNA transport': 'rna processing',
    'Ribosomal': 'protein processing',
    'Ribosome': 'protein processing',
    'Spliceosome': 'rna processing',
    'Spliceosome,RNA transport': 'rna processing',
    'Ubiquitin mediated proteolysis': 'protein processing',
    'Valine, leucine and isoleucine degradation,Peroxisome': 'fatty acid metabolism',
    'peroxisome': 'fatty acid metabolism',
    'plant-pathogen interaction': 'plant pathogen interaction',
    'protein biosynthesis': 'protein processing',
    'protein degradation': 'protein processing',
    'protein transport': 'protein processing',
    'ribosomal': 'protein processing',
    'ribosome': 'protein processing',
}
for raw_label, canonical_label in sinfo_rename_map.items():
    df_sinfo = df_sinfo.replace({raw_label: canonical_label}, regex=False)

# --- Keep only rows that still carry a string annotation --------------------
has_sinfo = [isinstance(value, str) for value in df_sinfo['SideInfo_Label']]
df_sinfo = df_sinfo[has_sinfo]

# Alphabetical pathway list; index 0 is reserved for the 'unknown' placeholder.
pathway_list = ['unknown'] + sorted(set(df_sinfo['SideInfo_Label']))

print("Amount of SideInfo: %d" % len(df_sinfo))
print("Number of known pathways: %d" % (len(pathway_list)-1))
print("Pathways:", pathway_list[1:])


2657
Amount of SideInfo: 388
Number of known pathways: 18
Pathways: ['abscisic acid', 'amino acid metabolism', 'auxin', 'brassinosteroid', 'circadian rhythm', 'cytokinin', 'dna repair replicate', 'dna rna metabolism', 'ethylene', 'fatty acid metabolism', 'gibberellin', 'jasmonic acid', 'metabolism', 'plant pathogen interaction', 'protein processing', 'rna processing', 'salicylic acid', 'signal transduction']


In [7]:
# Display the cleaned side-information table (gene label and normalized pathway).
df_sinfo[['Label','SideInfo_Label']]

Unnamed: 0,Label,SideInfo_Label
7,AT1G01210,rna processing
22,AT1G02140,rna processing
33,AT1G03140,rna processing
34,AT1G03250,rna processing
35,AT1G03360,rna processing
...,...,...
2604,AT5G63370,rna processing
2618,AT5G64813,ethylene
2624,AT5G65210,salicylic acid
2625,AT5G65220,protein processing


In [8]:
# filename = "../results/PhI_files/Braun/ppi_braun_SideInfo_OtherPath.edit2.csv"
# df_sinfo = pd.read_csv(filename) # ,index_col=1 ,header=0
# len(df_sinfo)

In [9]:
# label nodes with known pathways
nx.set_node_attributes(g, 'unknown', "pathway")
nx.set_node_attributes(g, 0, "pathwayID")

for idx, row  in df_sinfo.iterrows():
    gene = row['Label']
    sinfo_label = row['SideInfo_Label']
    path_id = pathway_list.index(sinfo_label)
    g.nodes[gene]['pathway'] = sinfo_label
    g.nodes[gene]['pathwayID'] = path_id

In [10]:
# Build the side-information partition: pathway name -> genes annotated with it.
known_partition = dict()

# Genes that carry side information, in table order.
nodes_known = list(df_sinfo['Label'])
for gene in nodes_known:
    # insert_to_dict_list is a project helper; presumably it appends `gene`
    # to the list stored under the pathway key -- verify in utils.
    insert_to_dict_list(known_partition, g.nodes[gene]['pathway'], gene)

print("Side Info Summary:")
print("Total side information: %d" % len(nodes_known))
for pathway_name, members in known_partition.items():
    print(pathway_name, len(members))

# Edges between genes known to share a pathway (filled in by the next cell).
edges_known = []

Side Info Summary:
Total side information: 388
rna processing 79
auxin 46
abscisic acid 44
protein processing 95
fatty acid metabolism 13
salicylic acid 14
signal transduction 8
gibberellin 11
dna repair replicate 12
jasmonic acid 16
ethylene 25
brassinosteroid 14
dna rna metabolism 2
metabolism 1
cytokinin 5
plant pathogen interaction 1
amino acid metabolism 1
circadian rhythm 1


In [11]:
# Collect the existing graph edges whose two endpoints belong to the same
# known pathway.
#
# The list is (re)initialized here so that this cell is idempotent: re-running
# it on its own rebuilds the list instead of appending the same edges again.
edges_known = []

for pathway_type in known_partition:
    pathway_gene_list = known_partition[pathway_type]
    # Visit every unordered pair (gene1, gene2) of the pathway exactly once.
    for idx1, gene1 in enumerate(pathway_gene_list):
        for gene2 in pathway_gene_list[idx1 + 1:]:
            # Only edges already present in the graph are recorded; no
            # artificial edges are created.
            if g.has_edge(gene1, gene2):
                edges_known.append((gene1, gene2))

print("Known edges:")
print("Number of known edges:", len(edges_known))

Known edges:
Number of known edges: 81


In [12]:
# NOTE(review): nx.info is not available in NetworkX >= 3.0 -- confirm the pinned version.
print(nx.info(g))

# Sanity check: show the first six nodes with their attribute dictionaries.
for idx, (node, node_attrs) in enumerate(list(g.nodes(data=True))[:6]):
    print(idx, node, node_attrs)

# Same for the first six edges and their attribute dictionaries.
for idx, (gene_a, gene_b, edge_attrs) in enumerate(list(g.edges(data=True))[:6]):
    print(idx, (gene_a, gene_b), edge_attrs)

Name: PhIBraun
Type: Graph
Number of nodes: 2657
Number of edges: 5506
Average degree:   4.1445
0 AT1G05410 {'pathway': 'unknown', 'pathwayID': 0}
1 AT3G10140 {'pathway': 'unknown', 'pathwayID': 0}
2 AT3G54850 {'pathway': 'unknown', 'pathwayID': 0}
3 AT5G19010 {'pathway': 'signal transduction', 'pathwayID': 18}
4 AT3G07780 {'pathway': 'rna processing', 'pathwayID': 16}
5 AT5G66720 {'pathway': 'unknown', 'pathwayID': 0}
0 ('AT1G05410', 'AT3G10140') {}
1 ('AT1G05410', 'AT4G12450') {}
2 ('AT1G05410', 'AT2G32650') {}
3 ('AT1G05410', 'AT5G65400') {}
4 ('AT1G05410', 'AT1G49850') {}
5 ('AT1G05410', 'AT5G61230') {}


In [9]:
# ORCCI for no side info and complete observation

# g_orc_all, partitions_list = orcci(g, verbose="INFO")

# filename_orc = "../results/Phi_Main/PhiMain_orc_all.gml"
# nx.write_gml(g_orc_all.G,filename_orc,stringizer=str)

### Community Detection

In [13]:
# ORCCI for no side info and complete observation.
# Long-running step: the recorded output of this cell reports ~2663 s elapsed.
# `orcci` comes from the project's communityDetection package; judging by the
# node dump printed in the following cell it stores the block assignments on
# the nodes as `<block_label>MaxMod` / `<block_label>Final` -- confirm in orcci.
orc_main, partitions_list = orcci(g, block_label="blockRicci", verbose="ERROR") # verbosity passed as a literal

# Output .gml path for the graph produced by the run above.
f_out = "../results/PhI_files/AI-1Main/Braun.gml"

print(f_out)
# stringizer=str is passed so attribute values are serialized via str().
nx.write_gml(orc_main.G,f_out,stringizer=str)

Using number of cpu threads: = 16


FINAL RESULT
Elapsed time: 2662.9461 (s)
Number of iterations: 3151
Number of communities detected: 637
Size of largest community: 158
List of component sizes (final): [8 3 6 3 4 2 4 4 7 8 2 11 3 4 158 9 6 3 5 7 2 9 5 4 5 3 7 2 4 3 1 36 2 14 5 30 16 5 3 7 3 26 8 3 3 3 12 5 13 4 3 3 20 3 6 5 6 3 5 2 4 5 2 3 82 12 4 6 5 4 3 2 7 4 4 8 4 3 3 2 6 4 3 2 7 14 4 6 3 5 5 5 8 5 8 3 3 1 5 2 5 6 2 5 3 7 2 7 3 14 3 4 4 4 5 2 7 4 3 2 3 3 1 3 5 5 5 6 3 4 3 3 4 2 5 2 3 8 6 3 5 3 3 4 2 3 5 10 3 10 2 6 31 3 2 3 4 3 3 3 3 3 3 3 5 5 5 2 10 3 4 5 3 3 5 4 4 3 6 5 5 8 5 2 3 3 7 3 4 2 2 3 4 8 2 4 2 2 3 4 31 3 2 3 2 5 1 6 2 5 3 7 3 4 5 9 4 2 3 6 3 4 5 3 6 4 3 2 5 3 5 4 3 2 3 2 3 2 2 3 7 3 3 3 4 2 3 2 9 2 6 2 5 3 4 3 2 2 4 3 4 3 18 5 4 4 4 2 5 1 5 2 2 3 2 3 4 3 8 3 2 6 6 2 6 4 2 5 4 4 2 3 2 3 6 7 3 3 3 3 5 3 4 3 5 4 3 2 5 2 7 4 2 4 4 3 4 3 3 1 2 5 2 2 4 3 2 4 3 4 4 3 2 3 5 2 2 4 4 5 6 5 3 12 19 3 5 3 4 3 3 2 3 2 5 3 3 3 3 3 3 2 3 2 2 2 2 4 5 4 4 3 2 2 4 2 10 3 2 4 3 2 3 3 4 1

In [14]:
# Graph carried by the ORCCI result of the run without side information.
g_orc = orc_main.G

# NOTE(review): nx.info is not available in NetworkX >= 3.0 -- confirm the pinned version.
print(nx.info(g_orc))

# Show the first six nodes together with the attributes now stored on them.
for idx, (node, node_attrs) in enumerate(list(g_orc.nodes(data=True))[:6]):
    print(idx, node, node_attrs)

Name: PhIBraun
Type: Graph
Number of nodes: 2657
Number of edges: 5506
Average degree:   4.1445
0 AT1G05410 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0}
1 AT3G10140 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0}
2 AT3G54850 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1}
3 AT5G19010 {'pathway': 'signal transduction', 'pathwayID': 18, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1}
4 AT3G07780 {'pathway': 'rna processing', 'pathwayID': 16, 'known': 0, 'blockRicciMaxMod': 2, 'blockRicciFinal': 2}
5 AT5G66720 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 3, 'blockRicciFinal': 3}


#### Side Information

In [15]:
g_orc = orc_main.G

# Nodes whose 'pathway' attribute is anything other than the 'unknown'
# placeholder, i.e. the genes with side information, in graph node order.
nodes_known = [
    node
    for node in g_orc
    if g_orc.nodes[node]['pathway'] != 'unknown'
]

In [16]:
# f_in = "../results/PhI_files/Braun/Braun.gml"
# print(f_in)

# Second ORCCI run, this time with side information: the known nodes are
# passed explicitly and `sinfo_label` names the node attribute holding their
# pathway ID. Results are stored under the "blockRicciSinfo" label.
# Note: `partitions_list` from the first run is overwritten here.
# Long-running step: the recorded output reports ~2691 s elapsed.
orc_sinfo, partitions_list = orcci(g_orc, nodes_known=nodes_known, block_label="blockRicciSinfo", sinfo_label="pathwayID", verbose="ERROR")

Using number of cpu threads: = 16


FINAL RESULT
Elapsed time: 2691.1548 (s)
Number of iterations: 3126
Number of communities detected: 645
Size of largest community: 114
List of component sizes (final): [7 3 5 3 4 2 5 4 14 5 2 11 6 4 114 9 6 9 5 7 3 41 4 10 3 1 3 3 7 2 4 3 1 68 20 2 15 5 16 5 3 7 3 26 2 18 3 3 12 3 13 5 3 3 27 3 6 5 3 10 2 4 5 2 7 73 12 4 7 7 4 3 33 2 6 4 4 8 4 5 2 6 3 4 7 4 3 6 8 3 8 6 8 3 18 4 1 6 5 5 2 5 3 7 2 6 3 3 3 4 4 5 2 7 9 3 8 2 5 3 1 3 7 3 5 7 1 5 3 4 4 2 5 2 10 6 3 4 3 3 4 2 3 5 4 3 10 2 6 3 2 3 4 3 3 3 3 3 3 5 5 5 5 2 10 3 4 5 3 3 4 3 3 1 5 5 4 3 7 8 5 3 3 8 3 3 2 3 4 2 3 2 4 2 2 3 4 31 3 2 3 2 5 1 6 4 8 3 3 5 9 7 2 3 3 4 5 3 2 2 5 3 4 3 2 2 2 2 3 7 3 3 3 4 6 2 4 2 8 2 6 2 5 3 4 3 2 2 4 3 4 3 15 5 4 4 4 2 5 1 3 2 2 2 2 3 4 4 4 3 2 6 3 2 5 4 2 5 4 4 2 3 2 3 6 6 3 3 3 3 2 5 3 7 4 3 5 4 3 2 5 2 7 4 2 4 3 4 3 3 1 2 4 2 2 4 3 2 4 3 4 4 4 2 3 4 5 2 2 4 2 6 5 12 19 3 3 5 3 4 3 3 5 2 5 3 3 3 3 6 2 3 2 2 2 2 4 4 4 3 2 2 4 2 10 3 2 3 4 4 3 2 3 3 3 1 4 1 3 9 3 4 5 1

In [17]:
# From here on g_orc refers to the graph of the side-information run.
g_orc = orc_sinfo.G

# NOTE(review): nx.info is not available in NetworkX >= 3.0 -- confirm the pinned version.
print(nx.info(g_orc))

# Show the first six nodes together with the attributes now stored on them.
for idx, (node, node_attrs) in enumerate(list(g_orc.nodes(data=True))[:6]):
    print(idx, node, node_attrs)

Name: PhIBraun
Type: Graph
Number of nodes: 2657
Number of edges: 5506
Average degree:   4.1445
0 AT1G05410 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0, 'blockRicciSinfoMaxMod': 0, 'blockRicciSinfoFinal': 0}
1 AT3G10140 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0, 'blockRicciSinfoMaxMod': 0, 'blockRicciSinfoFinal': 0}
2 AT3G54850 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1, 'blockRicciSinfoMaxMod': 1, 'blockRicciSinfoFinal': 1}
3 AT5G19010 {'pathway': 'signal transduction', 'pathwayID': 18, 'known': 1, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1, 'blockRicciSinfoMaxMod': 1, 'blockRicciSinfoFinal': 1}
4 AT3G07780 {'pathway': 'rna processing', 'pathwayID': 16, 'known': 1, 'blockRicciMaxMod': 2, 'blockRicciFinal': 2, 'blockRicciSinfoMaxMod': 2, 'blockRicciSinfoFinal': 2}
5 AT5G66720 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'bloc

In [18]:
# Output .gml path for the graph of the side-information run
# (reuses the name f_out from the earlier export cell).
f_out = "../results/PhI_files/AI-1Main/Braun_sinfo.gml"

print(f_out)
# stringizer=str is passed so attribute values are serialized via str().
nx.write_gml(orc_sinfo.G,f_out,stringizer=str)

../results/PhI_files/AI-1Main/Braun_sinfo.gml
