# PhI Full PPI network analysis
Arabidopsis PhI_Full Protein-Protein Interaction network analysis
Ollivier-Ricci community detection with side information

In [1]:
%matplotlib inline

import sys; sys.path.insert(0, "..")

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

import collections
from math import log

# import time
import pickle

# from random import sample
from communityDetection.orcci import *
from utils.pobsnet_sinfo import *

from sklearn.metrics import normalized_mutual_info_score

In [2]:
verbose = "TRACE"

filename = "../data/PhIFull/BIOGRID_Altman2020_32612234.xlsx"

df = pd.read_excel(filename)

gene1_label = "Systematic Name Interactor A"
gene2_label = "Systematic Name Interactor B"
g = nx.from_pandas_edgelist(df,source=gene1_label,target=gene2_label)
g.name = 'Arabidopsis PPI'

In [3]:
print(nx.info(g))
print("Number of connected components:", nx.number_connected_components(g), "\n")

edge_selfloops = [e1 for e1, e2 in nx.selfloop_edges(g)]
g.remove_edges_from(nx.selfloop_edges(g))

print("<<< Without self loops >>>")
print(nx.info(g))
print("Number of self loops:", len(edge_selfloops))

# degree list
degree_list = sorted(g.degree, key=lambda x: x[1], reverse=True)
print("Top 5 highest degree nodes:", degree_list[0:5])

Name: Arabidopsis PPI
Type: Graph
Number of nodes: 925
Number of edges: 2072
Average degree:   4.4800
Number of connected components: 40 

<<< Without self loops >>>
Name: Arabidopsis PPI
Type: Graph
Number of nodes: 925
Number of edges: 2021
Average degree:   4.3697
Number of self loops: 51
Top 5 highest degree nodes: [('AT4G37260', 117), ('AT4G32570', 99), ('AT1G31880', 83), ('AT1G69690', 53), ('AT1G04250', 51)]


### Side Information (Global)

In [4]:
filename = "../data/PhIFull/Phi_Full_ppi_all_sideinfo_KEGG_edit.xlsx"

df_sinfo = pd.read_excel(filename,header=0) # ,index_col=1
df_sinfo = df_sinfo[['Label','SideInfo_Label']]
df_sinfo = df_sinfo[[isinstance(x,str) for x in df_sinfo['SideInfo_Label']]]  # filter row with valid SideInfo
pathway_list = list(set(df_sinfo['SideInfo_Label']))
pathway_list.sort()
pathway_list.insert(0,'unknown')
print("Amount of SideInfo: %d" % len(df_sinfo))
print("Number of known pathways: %d" % (len(pathway_list)-1))
print("Pathways:", pathway_list[1:])

Amount of SideInfo: 321
Number of known pathways: 18
Pathways: ['abscisic acid', 'amino acid metabolism', 'auxin', 'brassinosteroid', 'circadian rhythm', 'cytokinin', 'dna repair', 'dna rna metabolism', 'ethylene', 'fatty acid metabolism', 'gibberellin', 'jasmonic acid', 'metabolism', 'plant pathogen interaction', 'protein processing', 'rna processing', 'salicylic acid', 'signal transduction']


In [5]:
# label nodes with known pathways
nx.set_node_attributes(g, 'unknown', "pathway")
nx.set_node_attributes(g, 0, "pathwayID")

for idx, row  in df_sinfo.iterrows():
    gene = row['Label']
    sinfo_label = row['SideInfo_Label']
    path_id = pathway_list.index(sinfo_label)
    g.nodes[gene]['pathway'] = sinfo_label
    g.nodes[gene]['pathwayID'] = path_id

In [6]:
# create side information list

# check if known nodes belonging to the same community have connection
known_partition = dict()  # {0: [], 1: [], 2: []}

edges_known = []  # list of all edges known (both existing and artificially created)
nodes_known = list(df_sinfo['Label'])
for i in nodes_known:
    block = g.nodes[i]['pathway']
    insert_to_dict_list(known_partition, block, i)
    # known_partition[block].append(i)
    # print(i, block)

print("Side Info Summary:")
print("Total side information: %d" % len(nodes_known))
for i in known_partition:
    print(i, len(known_partition[i]))
    
edges_known = []

Side Info Summary:
Total side information: 321
auxin 65
rna processing 25
abscisic acid 55
ethylene 36
brassinosteroid 17
salicylic acid 22
gibberellin 20
jasmonic acid 22
cytokinin 15
protein processing 24
dna rna metabolism 3
dna repair 2
fatty acid metabolism 5
metabolism 2
signal transduction 3
amino acid metabolism 3
plant pathogen interaction 1
circadian rhythm 1


In [7]:
for pathway_type in known_partition:
    pathway_gene_list = known_partition[pathway_type]
    for idx1, gene1 in enumerate(pathway_gene_list):
        for idx2, gene2 in enumerate(pathway_gene_list[idx1+1:]):
                
            # Trial # 2: Soft force of weight to maximum (and ORC)
            if g.has_edge(gene1,gene2):
                edges_known.append((gene1, gene2))
                # g.edges[(gene1, gene2)][''] = 1.0  # unweighted
        
print("Known edges:")
# print(edges_known)
print("Number of known edges:", len(edges_known))

Known edges:
Number of known edges: 390


In [8]:
print(nx.info(g))

for idx, node in enumerate(g.nodes()):
    print(idx, node, g.nodes[node])
    if idx == 5:
        break
    
# for idx, edge in enumerate(g.edges()):
#     print(idx, edge, g.edges[edge])
#     if idx == 5:
#         break

Name: Arabidopsis PPI
Type: Graph
Number of nodes: 925
Number of edges: 2021
Average degree:   4.3697
0 AT1G01030 {'pathway': 'auxin', 'pathwayID': 3}
1 AT3G17600 {'pathway': 'auxin', 'pathwayID': 3}
2 AT1G01140 {'pathway': 'unknown', 'pathwayID': 0}
3 AT2G23290 {'pathway': 'unknown', 'pathwayID': 0}
4 AT1G01360 {'pathway': 'abscisic acid', 'pathwayID': 1}
5 AT4G26080 {'pathway': 'unknown', 'pathwayID': 0}


### Community Detection

In [9]:
# ORCCI for no side info and complete observation
orc_full, partitions_list = orcci(g, block_label="blockRicci", verbose="ERROR") # "INFO"

Using number of cpu threads: = 16


FINAL RESULT
Elapsed time: 217.7631 (s)
Number of iterations: 876
Number of communities detected: 180
Size of largest community: 63
List of component sizes (final): [2 63 50 18 13 3 2 2 3 2 3 12 9 4 8 14 43 10 5 5 5 6 6 2 10 4 18 4 4 8 6 3 13 3 2 4 7 4 4 3 3 5 2 5 2 2 2 2 4 2 7 1 5 9 2 4 1 21 4 3 3 4 3 4 4 4 2 5 2 3 4 10 6 2 2 38 4 3 8 4 6 3 2 5 5 2 7 4 5 3 3 2 10 2 8 1 2 2 4 6 2 6 8 7 3 2 3 2 4 2 3 2 7 2 2 2 3 4 2 4 5 4 3 2 2 2 3 4 3 2 2 6 3 2 4 2 6 3 2 2 3 3 12 10 3 4 2 2 4 2 2 2 2 3 3 2 2 2 2 2 2 3 3 2 6 2 4 2 2 4 2 2 3 4 3 3 2 2 3 2]
MAXIMUM MODULARITY PARTITION RESULT
Modularity: 0.5849
Number of communities detected: 118
List of component sizes (max mod): [2 102 88 18 47 6 3 16 5 33 6 10 14 43 10 11 5 5 9 2 13 50 4 4 19 7 23 3 5 2 13 2 1 5 1 21 4 3 4 6 4 10 6 2 2 4 3 6 2 5 5 2 7 9 7 3 10 2 1 2 4 12 7 2 16 2 3 2 2 2 4 2 5 7 6 2 2 3 3 2 2 6 3 2 2 18 3 2 2 3 10 4 2 2 4 2 2 2 2 3 3 2 2 2 2 2 2 3 2 2 4 2 2 4 2 2 4 2]



In [10]:
# output .gml filename creation
f_out = "../results/PhI_files/PhIFull/PhIFull.gml"

print(f_out)
nx.write_gml(orc_full.G,f_out,stringizer=str)

../results/PhI_files/PhIFull/PhIFull.gml


In [11]:
g_orc = orc_full.G

print(nx.info(g_orc))

for idx, node in enumerate(g_orc.nodes()):
    print(idx, node, g_orc.nodes[node])
    if idx == 5:
        break

Name: Arabidopsis PPI
Type: Graph
Number of nodes: 925
Number of edges: 2021
Average degree:   4.3697
0 AT1G01030 {'pathway': 'auxin', 'pathwayID': 3, 'known': 0, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0}
1 AT3G17600 {'pathway': 'auxin', 'pathwayID': 3, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1}
2 AT1G01140 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 2, 'blockRicciFinal': 2}
3 AT2G23290 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1}
4 AT1G01360 {'pathway': 'abscisic acid', 'pathwayID': 1, 'known': 0, 'blockRicciMaxMod': 3, 'blockRicciFinal': 3}
5 AT4G26080 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 4, 'blockRicciFinal': 4}


#### Side Information

In [12]:
g_orc = orc_full.G

nodes_known = []

for node in g_orc:
    if g_orc.nodes[node]['pathway'] != 'unknown':
        nodes_known.append(node)
    print(node, g_orc.nodes[node]['pathway'])
# nodes_known

AT1G01030 auxin
AT3G17600 auxin
AT1G01140 unknown
AT2G23290 unknown
AT1G01360 abscisic acid
AT4G26080 unknown
AT1G04100 auxin
AT1G15050 auxin
AT2G33310 auxin
AT3G04730 auxin
AT3G16500 auxin
AT3G23030 auxin
AT4G14560 auxin
AT1G04240 auxin
AT1G04250 auxin
AT3G15540 auxin
AT1G04550 auxin
AT1G07430 abscisic acid
AT4G01026 abscisic acid
AT4G18620 abscisic acid
AT1G10940 auxin
AT2G36270 abscisic acid
AT3G23050 unknown
AT1G15550 gibberellin
AT5G60120 unknown
AT1G17380 jasmonic acid
AT3G29350 cytokinin
AT4G28910 jasmonic acid
AT1G18400 brassinosteroid
AT5G41920 unknown
AT1G19050 cytokinin
AT1G22770 gibberellin
AT1G51950 auxin
AT2G37630 gibberellin
AT2G44950 unknown
AT4G08150 gibberellin
AT4G32570 jasmonic acid
AT4G37260 unknown
AT1G25470 unknown
AT5G08130 brassinosteroid
AT1G30270 unknown
AT5G47100 abscisic acid
AT1G32640 unknown
AT1G37130 salicylic acid
AT5G18930 unknown
AT1G50600 unknown
AT2G20350 unknown
AT1G51660 unknown
AT3G24520 abscisic acid
AT3G45640 salicylic acid
AT3G61830 unknown
AT

In [13]:
# f_in = "../results/PhI_files/PhIFull/PhIFull.gml"
# print(f_in)

orc_sinfo, partitions_list = orcci(g_orc, nodes_known=nodes_known, block_label="blockRicciSinfo", sinfo_label="pathwayID", verbose="ERROR")

Using number of cpu threads: = 16


FINAL RESULT
Elapsed time: 220.1696 (s)
Number of iterations: 687
Number of communities detected: 200
Size of largest community: 64
List of component sizes (final): [64 51 61 1 1 60 3 57 3 10 4 4 12 8 4 5 6 2 2 13 4 4 14 9 4 3 2 2 4 7 4 4 4 4 2 5 2 41 1 1 4 1 2 4 1 22 4 3 4 4 11 2 4 4 2 5 2 3 6 2 2 5 2 2 4 1 2 4 3 1 2 1 4 8 2 1 7 1 1 1 1 2 1 2 4 8 1 1 2 7 2 1 2 2 5 1 2 4 2 2 2 2 4 2 4 5 4 3 5 2 2 2 3 2 1 2 6 3 1 2 1 1 5 2 3 1 2 2 3 3 13 10 3 1 1 1 4 1 1 1 2 1 2 1 1 1 4 1 1 2 1 2 1 1 1 1 2 3 3 1 1 2 2 2 1 1 2 1 3 3 2 6 1 2 1 4 2 2 4 2 2 10 4 1 4 1 1 4 1 1 1 1 1 1 1 3 1 1 4 2]
MAXIMUM MODULARITY PARTITION RESULT
Modularity: 0.5517
Number of communities detected: 161
List of component sizes (max mod): [115 92 72 1 1 66 3 78 5 35 6 12 4 5 10 2 2 4 4 17 2 4 7 13 12 2 7 1 1 1 1 22 7 4 4 11 6 3 6 2 2 2 1 2 4 3 1 2 1 4 8 2 1 7 1 1 1 1 1 2 4 8 1 1 7 2 1 1 2 2 2 2 4 2 5 7 5 2 2 2 3 2 1 2 6 3 1 2 1 1 2 3 1 2 2 3 3 13 10 1 1 1 4 1 1 1 2 1 2 1 1 1 4 1 1 2 1 2 1 1

In [14]:
g_orc = orc_sinfo.G

print(nx.info(g_orc))

for idx, node in enumerate(g_orc.nodes()):
    print(idx, node, g_orc.nodes[node])
    if idx == 5:
        break

Name: Arabidopsis PPI
Type: Graph
Number of nodes: 925
Number of edges: 2021
Average degree:   4.3697
0 AT1G01030 {'pathway': 'auxin', 'pathwayID': 3, 'known': 1, 'blockRicciMaxMod': 0, 'blockRicciFinal': 0, 'blockRicciSinfoMaxMod': 0, 'blockRicciSinfoFinal': 0}
1 AT3G17600 {'pathway': 'auxin', 'pathwayID': 3, 'known': 1, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1, 'blockRicciSinfoMaxMod': 0, 'blockRicciSinfoFinal': 0}
2 AT1G01140 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 2, 'blockRicciFinal': 2, 'blockRicciSinfoMaxMod': 1, 'blockRicciSinfoFinal': 1}
3 AT2G23290 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod': 1, 'blockRicciFinal': 1, 'blockRicciSinfoMaxMod': 0, 'blockRicciSinfoFinal': 0}
4 AT1G01360 {'pathway': 'abscisic acid', 'pathwayID': 1, 'known': 1, 'blockRicciMaxMod': 3, 'blockRicciFinal': 3, 'blockRicciSinfoMaxMod': 2, 'blockRicciSinfoFinal': 2}
5 AT4G26080 {'pathway': 'unknown', 'pathwayID': 0, 'known': 0, 'blockRicciMaxMod'

In [15]:
# output .gml filename creation
f_out = "../results/PhI_files/PhIFull/PhIFull_sinfo.gml"

print(f_out)
nx.write_gml(orc_sinfo.G,f_out,stringizer=str)

../results/PhI_files/PhIFull/PhIFull_sinfo.gml
