[xptree/NetMF: Network Embedding as Matrix Factorization: Unifying DeepWalk, LINE, PTE, and node2vec](https://github.com/xptree/NetMF)

In [69]:
import scipy.io
import scipy.sparse as sparse
from scipy.sparse import csgraph
import numpy as np
import argparse
import logging
import theano
from theano import tensor as T

In [70]:
theano.config.exception_verbosity='high'

## load matrix

In [151]:
def load_adjacency_matrix(file, variable_name="network"):
    data = scipy.io.loadmat(file)
    return data[variable_name]

In [152]:
file = "./data/POS.mat"
# file = "./data/blogcatalog.mat"

In [153]:
A = load_adjacency_matrix(file)
A.shape

(4777, 4777)

In [154]:
A.toarray()[-1]

array([0., 0., 2., ..., 0., 0., 0.])

In [156]:
A.toarray()[0]

array([ 208. ,  989.5, 1003. , ...,    0. ,    0. ,    0. ])

## netmf small

In [75]:
def direct_compute_deepwalk_matrix(A, window, b):
    n = A.shape[0]
    vol = float(A.sum())
    L, d_rt = csgraph.laplacian(A, normed=True, return_diag=True)
    # X = D^{-1/2} A D^{-1/2}
    X = sparse.identity(n) - L
    S = np.zeros_like(X)
    X_power = sparse.identity(n)
    for i in range(window):
        X_power = X_power.dot(X)
        S += X_power
    S *= vol / window / b
    D_rt_inv = sparse.diags(d_rt ** -1)
    M = D_rt_inv.dot(D_rt_inv.dot(S).T)
    m = T.matrix()
    f = theano.function([m], T.log(T.maximum(m, 1)))
    Y = f(M.todense().astype(theano.config.floatX))
    return sparse.csr_matrix(Y)

In [46]:
deepwalk_matrix = direct_compute_deepwalk_matrix(A, window=10, b=1.0)

In [99]:
n = A.shape[0]

In [100]:
vol = float(A.sum())
vol

328685.0

In [95]:
L, d_rt = csgraph.laplacian(A, normed=True, return_diag=True)

In [96]:
d_rt

array([146.34206504, 139.06473313, 119.00420161, ...,   3.        ,
         3.        ,   3.        ])

In [97]:
A.toarray()[0]

array([ 208. ,  989.5, 1003. , ...,    0. ,    0. ,    0. ])

In [98]:
L.toarray()[0]

array([ 1.        , -0.04862164, -0.05759297, ...,  0.        ,
        0.        ,  0.        ])

In [108]:
X = sparse.identity(n) - L
X.shape

(4777, 4777)

In [109]:
S = np.zeros_like(X)
X_power = sparse.identity(n)

In [119]:
for i in range(10):
    X_power = X_power.dot(X)
    S += X_power

In [120]:
S.toarray()[0]

array([0.64201572, 0.60426014, 0.53558712, ..., 0.01188556, 0.01207324,
       0.01225084])

In [121]:
S = S * vol / 10 / 1.0
S.toarray()[0]

array([21102.09369109, 19861.12435463, 17603.94520014, ...,
         390.66068492,   396.82940397,   402.66674426])

In [126]:
d_rt

array([146.34206504, 139.06473313, 119.00420161, ...,   3.        ,
         3.        ,   3.        ])

In [124]:
D_rt_inv = sparse.diags(d_rt ** -1)
D_rt_inv

<4777x4777 sparse matrix of type '<class 'numpy.float64'>'
	with 4777 stored elements (1 diagonals) in DIAgonal format>

In [125]:
D_rt_inv.toarray()[0]

array([0.00683331, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ])

In [127]:
M = D_rt_inv.dot(D_rt_inv.dot(S).T)

In [128]:
M.toarray()[0]

array([0.98534244, 0.9759277 , 1.01083096, ..., 0.88983457, 0.90388548,
       0.91718159])

In [135]:
M.todense()[0]

matrix([[0.98534244, 0.9759277 , 1.01083096, ..., 0.88983457, 0.90388548,
         0.91718159]])

In [129]:
m = T.matrix()

In [131]:
f = theano.function([m], T.log(T.maximum(m, 1)))

In [136]:
Y = f(M.todense().astype(theano.config.floatX))

In [137]:
res = sparse.csr_matrix(Y)

In [139]:
res.toarray()[0]

array([0.        , 0.        , 0.01077272, ..., 0.        , 0.        ,
       0.        ])

In [142]:
M2 = M.todense()

In [143]:
M2[M2 <= 1] = 1
Y2 = np.log(M2)
res2 = sparse.csr_matrix(Y2)

In [144]:
res2.toarray()[0]

array([0.        , 0.        , 0.01077272, ..., 0.        , 0.        ,
       0.        ])

In [146]:
def svd_deepwalk_matrix(X, dim):
    u, s, v = sparse.linalg.svds(X, dim, return_singular_vectors="u")
    # return U \Sigma^{1/2}
    return sparse.diags(np.sqrt(s)).dot(u.T).T

In [147]:
deepwalk_embedding = svd_deepwalk_matrix(res, dim=128)

In [158]:
deepwalk_embedding.shape

(4777, 128)

## netmf large

In [159]:
def approximate_normalized_graph_laplacian(A, rank, which="LA"):
    n = A.shape[0]
    L, d_rt = csgraph.laplacian(A, normed=True, return_diag=True)
    # X = D^{-1/2} W D^{-1/2}
    X = sparse.identity(n) - L
    evals, evecs = sparse.linalg.eigsh(X, rank, which=which)
    D_rt_inv = sparse.diags(d_rt ** -1)
    D_rt_invU = D_rt_inv.dot(evecs)
    return evals, D_rt_invU

In [160]:
evals, D_rt_invU = approximate_normalized_graph_laplacian(A, rank=256, which="LA")

In [161]:
evals.shape, D_rt_invU.shape

((256,), (4777, 256))

In [162]:
def deepwalk_filter(evals, window):
    for i in range(len(evals)):
        x = evals[i]
        evals[i] = 1. if x >= 1 else x*(1-x**window) / (1-x) / window
    evals = np.maximum(evals, 0)
    return evals

In [163]:
evals = deepwalk_filter(evals, window=10)

In [165]:
evals.shape

(256,)

In [166]:
def approximate_deepwalk_matrix(evals, D_rt_invU, window, vol, b):
    evals = deepwalk_filter(evals, window=window)
    X = sparse.diags(np.sqrt(evals)).dot(D_rt_invU.T).T
    m = T.matrix()
    mmT = T.dot(m, m.T) * (vol/b)
    f = theano.function([m], T.log(T.maximum(mmT, 1)))
    Y = f(X.astype(theano.config.floatX))
    return sparse.csr_matrix(Y)

In [167]:
deepwalk_matrix = approximate_deepwalk_matrix(evals, D_rt_invU, window=10, vol=vol, b=1.0)

In [168]:
deepwalk_matrix.shape

(4777, 4777)

In [169]:
deepwalk_matrix.toarray()[0]

array([0.01909541, 0.00504763, 0.00788785, ..., 0.        , 0.00688923,
       0.00950923])

In [170]:
deepwalk_embedding = svd_deepwalk_matrix(deepwalk_matrix, dim=128)

In [171]:
deepwalk_embedding.shape

(4777, 128)