# **Overlapping Community Detection in Protein-protein interaction network with GCN**





---




Upload the files PPI-Net.txt and Original-Communities.txt

In [9]:
%tensorflow_version 1.x

**Import all the necessary libraries, paths, and constants**

In [10]:
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
import math
import random
#define path of graph file and communities file
labelfile = '/content/Original-Communities.txt'
graphfile =  '/content/PPI-Net.txt'
#select number of communities with maximum members
n = 10 

**Function for loading data from graph file and ground truth file**

In [11]:
def loadData(labelfile, graphfile):
  data = []
  n_communities = []
  #save all communities in list-data regardless of the order
  with open(labelfile) as inputfile:
      for line in inputfile:
          data.append(line.split())
  graph = {}
  #save n communities with maximum members in list-n_communities keeping the decreasing order of length
  for i in range(n):
    p = max(data,key=len)
    n_communities.append(p)
    data.remove(p)
  with open(graphfile) as inputfile:
        for line in inputfile:
            node = line.split()[0]#read 1st element in any column(node 1)
            neigh = line.split()[1]#read 2nd element in any column(node 2)
            #Not include those nodes, which are not in n_communities
            g = 0
            for i in range(len(n_communities)):
               if( node in n_communities[i]):
                     g=1
            if g==0:
              continue
            g1=0
            for i in range(len(n_communities)):
               if( neigh in n_communities[i]):
                     g1=1
            if g1==0:
              continue
            #save all the nodes and edges, which are common to the graphfile and n_communities
            if node in graph:
                graph[node].add(neigh)
            else:
                graph[node] = {neigh}
            if neigh in graph:
                graph[neigh].add(node)
            else:
                graph[neigh] = {node}
  return n_communities, graph

*The below function will output a one-hot-encoder label for every node. It takes a list of all the communities having a particular node as the member. If n = 5, the length of this one hot encoder will be 31, which covers the possibility of nodes belonging to atleast one community out of 5..*



In [12]:
def labeller(pos):
  p = [0]*n
  for i in pos:
     p[i]=1
  strings = [str(integer) for integer in p]
  a_string = "".join(strings)
  label = [0]*(2**n-1)
  label[int(a_string, 2)]=1
  return  label

**Preparing adjacency matrix, degree Matrix, and labels of nodes**

In [13]:
def preprocessingData(graph, n_communities):
  adj_matrix = np.zeros((len(graph), len(graph)))
  degree_matrix = np.zeros((len(graph), len(graph)))
  node_index = {}
  j = 0
  for i in graph:
    node_index[i] = j
    j += 1
  for i in node_index:
    for j in graph[i]:
      if(j in node_index):
        adj_matrix[node_index[i]][node_index[j]] = 1
    degree_matrix[node_index[i]][node_index[i]] = len(graph[i])
  label = np.zeros((len(graph), 2**n-1))
  for nodeId in node_index:
    u=[]
    for i in n_communities:
      if nodeId in i:
        u.append(n_communities.index(i))
    label[node_index[nodeId]]=labeller(u)
  return adj_matrix, degree_matrix, label

**Train-Test Split**

In [14]:
def trainData(label):
  k = label.shape[0]
  testingNum = math.floor(k*0.2)
  testIndex = random.sample(range(0, k), testingNum)
  training_label = np.zeros((k, 2**n-1))
  for i in range(k):
    training_label[i] = label[i]
  training_label[testIndex] = [None]
  print(label[testIndex[1]])
  print(testIndex[1])
  return training_label, testIndex

**Building GNN, training, and testing it**

In [15]:
def gnn(adj_matrix, degree_matrix, label, feature_matrix, test_label, test_index, train_index):
    features = tf.placeholder(tf.float32, shape = ((None,len(feature_matrix))))
    adjacency = tf.placeholder(tf.float32, shape = ((None,None)))
    degree = tf.placeholder(tf.float32, shape = ((None,None)))
    labels = tf.placeholder(tf.float32, shape = ((None,2**n-1)))
    weights1 = tf.Variable(tf.random_normal([len(feature_matrix),512], stddev = 1))
    weights2 = tf.Variable(tf.random_normal([512, 2**n-1], stddev = 1))
    trainIndex = tf.placeholder(tf.int32, shape = ((len(train_index))))
#Defining GCN layer
    def layer(features, adjacency, degree, weights):
        with tf.name_scope('gcn_layer'):
            d_ = tf.pow(tf.matrix_inverse(degree), 0.5)
            y = tf.matmul(d_, tf.matmul(adjacency, d_))
            kernel = tf.matmul(features, weights)
            return tf.nn.relu(tf.matmul(y, kernel))

#Building with GCN layer        
    hidden1 = layer(features, adjacency, degree, weights1)
    hidden1 = tf.layers.dropout(hidden1, rate=0.5)
    model = layer(hidden1, adjacency, degree, weights2)
    training_output = tf.gather(model, trainIndex)

#Selecting training labels
    training_label = label[train_index]

#Defining loss function and optimizer
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = training_output, labels = training_label.astype(np.float32)))
        train_op = tf.train.AdamOptimizer(0.01, 0.90).minimize(loss)

    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)

    test_label = test_label[test_index]

    b = np.argmax(test_label, axis = 1)
#training up to 200 epochs
    for i in range(200):
        _, cost = sess.run([train_op, loss], feed_dict = {features: feature_matrix, adjacency: adj_matrix, degree: degree_matrix, labels: label, trainIndex: train_index})
        if(i%10 == 0):
            predict = sess.run(tf.nn.softmax(model), feed_dict = {features: feature_matrix, adjacency: adj_matrix, degree: degree_matrix, labels: test_label})
            test_res = predict[test_index]
            a = np.argmax(test_res, axis = 1)
            #checking test accuracy at every 10th epoch
            print("test accuracy: ", np.sum(a == b)/len(test_index))
    return test_res,test_label


if __name__ == '__main__':
  n_communities,graph = loadData(labelfile, graphfile)
  adj_matrix, degree_matrix, label = preprocessingData(graph, n_communities)
  train_label, test_index = trainData(label)
  #calculating feature matrix from adj_matrix
  feature_matrix = np.eye(len(adj_matrix))
  adj_matrix = adj_matrix + feature_matrix
  train_index = []
  for i in range(len(adj_matrix)):
    if i not in test_index:
      train_index.append(i)
  test_res,test_label= gnn(adj_matrix, degree_matrix, train_label, feature_matrix, label, test_index, train_index)

[0. 0. 0. ... 0. 0. 0.]
270


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


test accuracy:  0.06578947368421052
test accuracy:  0.7368421052631579
test accuracy:  0.8026315789473685
test accuracy:  0.8289473684210527
test accuracy:  0.8289473684210527
test accuracy:  0.8421052631578947
test accuracy:  0.8289473684210527
test accuracy:  0.8289473684210527
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
test accuracy:  0.8421052631578947
