In [121]:
import numpy as np
import scipy as sp
import pandas as pd
import random
import matplotlib.pyplot as plt
import  networkx as nx
import tensorflow as tf

%matplotlib inline

The non backtacking matrix is similiar to message passing algorithms used to find clusters.  This works much better in the sparse case (especially since adjacency matrices in that case are very not full rank, so finding their eigenvectors...)

Definition:

Let M be the number of edges, then the matrix B, our non backtracking matrix will be a 2M by 2M matrix such that B_(ei-ej) = 1 if ei and ej are adjacent and are not just one edge, and 0 otherwise (i.e the diagonal should be zero). 


An interesting point is that the Bethe Hessian matrix should have similar performance, however is defined from the laplacian, with a regularizing r...that can be learned.  https://papers.nips.cc/paper/5520-spectral-clustering-of-graphs-with-the-bethe-hessian.pdf

the Bethe Hessian Matrix is the deformed Laplacian is is simply:
H(r): = (r^2 -1)1 - rA +D


for the stochastic block model, optimal r's have been show to equal root(c) where c is the average degree of the graph.


Compute eigenvectors associated with negative eigenvalues of both H(r_c) and H(-r_c)

supposedly, the negative eigenvalues of H(r_c) reveal the assortative aspects, whereas H(-r_c) reviews the disassortative ones.

Tests:

Give a large dataset of stochastic block models with the same average degree, will it learn the correct r_c?


In [122]:
communities = 2 #number of communities, chance to 
group_size = 20 #number of nodes in each communitites (balanced so far)
dim_graph = communities*group_size
A = balanced_stochastic_blockmodel(communities=communities, groupsize=group_size, p_in=0.29, p_out=0.1)

print A

Adj = tf.cast(A, tf.float32)
Diag = tf.diag(tf.reduce_sum(Adj,0)) #just the diagonal matrix of degrees of Adj
r = tf.Variable(tf.random_normal(shape=[1], mean=0.0,
                                 stddev=10.0, dtype=tf.float32,
                                 seed=None, name=None))
Bethe_Hesse = (tf.square(r)-1)*tf.diag(tf.ones(shape=[dim_graph]))-tf.mul(r, Adj)+Diag 


assignment_laplacian, means_laplacian, centroides_laplacian = k_means_spectral(laplacian, communities, group_size)
update_centroides_laplacian = tf.assign(centroides_laplacian, means_laplacian)
loss_laplacian, error_laplacian = cluster_error(assignment_laplacian, group_size)

assignment_adj, means_adj, centroides_adj = k_means_spectral(Adj, communities, group_size)
update_centroides_adj = tf.assign(centroides_adj, means_adj)
loss_adj, error_adj = cluster_error(assignment_adj, group_size)



init = tf.initialize_all_variables()


with tf.Session() as sess:
    sess.run(init)
    print sess.run(Bethe_Hesse)
    
    

init = tf.initialize_all_variables()


with tf.Session() as sess:
    sess.run(init) #, feed_dict = {x: A})
    for step in xrange(10):
        sess.run([update_centroides_adj, update_centroides_laplacian, assignment_adj, assignment_laplacian])
    a, b, c, d = sess.run([assignment_laplacian, assignment_adj, error_adj, error_laplacian])
    print 'Using the Laplacian, the assignment of clusters for each node is {}, with error rate of {}.'.format(a, d)
    print 'Using the adjacency matrix, the assignment of clusters for each node is {}, with error rate of {}'.format(b, c)
    print "Note that a random guess of a balanced partition will on average overlap with the correct clustering by 50%,but we are taking the min of two binomial's--since we wanted it to be invariant under labellings, so randomly we will do a little better than 50% error, min of two binomial 1/2.  But for bigger and bigger vectors this will approach 1/2. "
    
    

    
    

[[0 0 0 ..., 0 0 0]
 [0 0 1 ..., 1 0 1]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 1]
 [0 1 0 ..., 0 1 0]]


NameError: name 'laplacian' is not defined


ß

In [123]:
communities = 2 #number of communities, chance to 
group_size = 3 
def get_eigenvectors(A, communities=communities, group_size=group_size):
    """gets first k eigenvalues of matrix"""
    dim_graph = communities*group_size
    eigenval, eigenvec = tf.self_adjoint_eig(A)
    return tf.slice(eigenvec, [0, dim_graph-communities], [dim_graph, communities])

def k_means_spectral(A, communities=communities, group_size=group_size):
    """takes in matrix, does k means on first k eignvectors 
    Y where rows are the data points"""    

    Y = get_eigenvectors(A, communities, group_size)
    centroides = tf.Variable(tf.slice(tf.random_shuffle(Y),[0,0],[communities,-1]))
    expanded_Y = tf.expand_dims(Y, 0)
    expanded_centroides = tf.expand_dims(centroides, 1)
    assignments = tf.argmin(tf.reduce_sum(tf.square(tf.sub(expanded_Y, expanded_centroides)), 2), 0) #these are the clustering assignments based on current centroides
    means = tf.concat(0, [tf.reduce_mean(tf.gather(Y, tf.reshape(tf.where( tf.equal(assignments, c)),[1,-1])),
                                     reduction_indices=[1]) for c in xrange(communities)])
    
    return assignments, means, centroides

def cluster_error(assignment, group_size=group_size, communities=communities):
    """Takes in assignments and compares to the balanced two cluster
    model of the random graph above"""
    dim_graph = communities*group_size
    true_assignment_a = tf.concat(0, [tf.zeros([group_size], dtype=tf.float32),
                                      tf.ones([group_size], dtype=tf.float32)])
    true_assignment_b = tf.concat(0, [tf.ones([group_size], dtype=tf.float32),
                                      tf.zeros([group_size], dtype=tf.float32)])         
    assignment = tf.cast(assignment, dtype = tf.float32)
    loss = tf.minimum(tf.reduce_sum(tf.square(tf.sub(true_assignment_a, assignment))),
                      tf.reduce_sum(tf.square(tf.sub(true_assignment_b, assignment))))
    error = tf.div(loss, dim_graph)
    
    return loss, error


## the following functions allows us to verify that permuted adjacency matrices shoudl have no effect on the network
#of course, which no power method, this is just true by construction but good to have anyways 

def joint_permutation(A):
    #takes adjacency matrix and relabels, gives out permutated adjacency matrix of same relationship
    random_shuffle = np.random.permutation(len(A))

    A_shuffle = A[random_shuffle]
    A_shuffle = np.transpose(A_shuffle)
    A_shuffle = A_shuffle[random_shuffle]

    return A_shuffle, random_shuffle

def balanced_stochastic_blockmodel(communities=2, groupsize=3, p_in=1.0, p_out=0.0):
    #gives dense adjacency matrix representaiton of randomly generated SBM with balanced community size

    G = nx.planted_partition_graph(l=communities, k=groupsize, p_in=p_in, p_out =p_out)
    A = nx.adjacency_matrix(G).todense()
    
    return A



In [185]:
communities = 2 #number of communities, chance to 
group_size = 4 #number of nodes in each communitites (balanced so far)
dim_graph = communities*group_size
A = balanced_stochastic_blockmodel(communities=communities, groupsize=group_size, p_in=0.15, p_out=0.01)

Adj = tf.cast(A, tf.float32)
Diag = tf.diag(tf.reduce_sum(Adj,0)) #just the diagonal matrix of degrees of Adj
r = tf.Variable(tf.random_normal(shape=[1], mean=0.0,
                                 stddev=1.0, dtype=tf.float32,
                                 seed=None, name=None))


Bethe_Hesse_neg = (tf.square(r)-1)*tf.diag(tf.ones(shape=[dim_graph]))-tf.mul(r, Adj)+Diag 

def get_neg_eigenvectors(A, dim_graph=dim_graph):
    """gets neg eigenvalues of matrix, 
    this may not be a differentiable opeartor
    so let's set it to take the last k eigenvalues as usual
    """
    dim_graph = communities*group_size
    eigenval, eigenvec = tf.self_adjoint_eig(A)
    return eigenval, tf.slice(eigenvec, [0, 0], [dim_graph, group_size])

def k_means_spectral(A, dim_graph=dim_graph, communities=communities):
    """takes in matrix, does k means on first k eignvectors 
    Y where rows are the data points"""    

    Y = get_neg_eigenvectors(A, dim_graph)[1]
    centroides = tf.Variable(tf.slice(tf.random_shuffle(Y),[0,0],[communities,-1]))
    expanded_Y = tf.expand_dims(Y, 0)
    expanded_centroides = tf.expand_dims(centroides, 1)
    assignments = tf.argmin(tf.reduce_sum(tf.square(tf.sub(expanded_Y, expanded_centroides)), 2), 0) #these are the clustering assignments based on current centroides
    means = tf.concat(0, [tf.reduce_mean(tf.gather(Y, tf.reshape(tf.where( tf.equal(assignments, c)),[1,-1])),
                                     reduction_indices=[1]) for c in xrange(communities)])
    
    return assignments, means, centroides


def cluster_error(assignment, group_size=group_size, communities=communities):
    """Takes in assignments and compares to the balanced two cluster
    model of the random graph above"""
    dim_graph = communities*group_size
    true_assignment_a = tf.concat(0, [tf.zeros([group_size], dtype=tf.float32),
                                      tf.ones([group_size], dtype=tf.float32)])
    true_assignment_b = tf.concat(0, [tf.ones([group_size], dtype=tf.float32),
                                      tf.zeros([group_size], dtype=tf.float32)])         
    assignment = tf.cast(assignment, dtype = tf.float32)
    loss = tf.minimum(tf.reduce_sum(tf.square(tf.sub(true_assignment_a, assignment))),
                      tf.reduce_sum(tf.square(tf.sub(true_assignment_b, assignment))))
    error = tf.div(loss, dim_graph)
    
    return error


assignment_adj, means_adj, centroides_adj = k_means_spectral(Bethe_Hesse_neg, communities, group_size)
update_centroides_adj = tf.assign(centroides_adj, means_adj)
error_adj = cluster_error(assignment_adj, group_size, communities)


init = tf.initialize_all_variables()



with tf.Session() as sess:
    sess.run(init) #, feed_dict = {x: A})
    for step in xrange(10):
        sess.run([update_centroides_adj, assignment_adj, error_adj])
        a, b = sess.run([assignment_adj, error_adj])
        print 'Using the Besse Hessian Matrix, the assignment of clusters for each node is {}, with error rate of {}'.format(a, b)
  

tf.reset_default_graph()

FailedPreconditionError: Attempting to use uninitialized value Variable
	 [[Node: Variable/read = Identity[T=DT_FLOAT, _class=["loc:@Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable)]]
Caused by op u'Variable/read', defined at:
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/traitlets/config/application.py", line 596, in launch_instance
    app.start()
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 498, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-185-8000aa7c3336>", line 10, in <module>
    seed=None, name=None))
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 215, in __init__
    dtype=dtype)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 327, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1106, in identity
    result = _op_def_lib.apply_op("Identity", input=input, name=name)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 703, in apply_op
    op_def=op_def)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2334, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/xiangli/anaconda/envs/tfnighly/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1253, in __init__
    self._traceback = _extract_stack()


In [178]:
tf.reset_default_graph()

In [12]:

sess = tf.Session()
sess.run(init)
print sess.run([Y, centroides, assignments, means, tmp])

    

init = tf.initialize_all_variables()
