# Graph-convolutional auto-encoder: BlogCatalog / Batching profiling

## Imports and setup

In [1]:
import time
import random
import csv
from collections import defaultdict
import os
import cProfile

import numpy as np
import scipy as sp
import scipy.stats
import sklearn.preprocessing
import matplotlib.pyplot as plt
import seaborn as sb
import networkx as nx

import keras
from keras import backend as K
import tensorflow as tf
from tensorflow.python import debug as tf_debug
from keras.utils.vis_utils import model_to_dot
from keras_tqdm import TQDMNotebookCallback

from progressbar import ProgressBar
from IPython.display import SVG, HTML, display

from nw2vec import ae
from nw2vec import utils
from nw2vec import codecs
from nw2vec import layers
from nw2vec import viz
from nw2vec import batching
from nw2vec import graph

Using TensorFlow backend.


In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Jacob is using GPU #1

## Load data

In [3]:
### Get the full list of nodes and groups ###

crop = None

# nodes
nodes = []
with open('data/BlogCatalog-dataset/data/nodes.csv') as csvfile:
    reader = csv.reader(csvfile)
    nodes = [int(row[0]) for row in reader]
if crop is not None:
    nodes = nodes[:crop]
assert len(nodes) == len(set(nodes))
nodes = set(nodes)

# groups
groups = []
with open('data/BlogCatalog-dataset/data/groups.csv') as csvfile:
    reader = csv.reader(csvfile)
    groups = [int(row[0]) for row in reader]
assert len(groups) == len(set(groups))
groups = set(groups)


### Generate graph from edges and node data ###

# Read edges.csv and make a network out of it
edges = defaultdict(list)
with open('data/BlogCatalog-dataset/data/edges.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if crop is not None:
            if int(row[0]) in nodes and int(row[1]) in nodes:
                edges[int(row[0])].append(int(row[1]))
        else:
            edges[int(row[0])].append(int(row[1]))

g = nx.from_dict_of_lists(edges, create_using=nx.Graph())
if crop is not None:
    g.add_nodes_from(nodes)

# Read group-edges.csv and add that info to each node
group_edges = defaultdict(list)
with open('data/BlogCatalog-dataset/data/group-edges.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if crop is not None:
            if int(row[0]) in nodes:
                group_edges[int(row[0])].append(int(row[1]))
        else:
            group_edges[int(row[0])].append(int(row[1]))

for node, data in g.nodes.items():
    data['groups'] = group_edges[node]


### Sanity checks ###
assert set(g.nodes) == nodes
#CROP
if crop is None:
    assert set().union(*[groups for _, groups in g.nodes(data='groups')]) == groups

## Set node labels

In [4]:
labels = np.zeros((len(nodes), len(groups)))
nodes_offset = min(nodes)
groups_offset = min(groups)
for n, data in g.nodes.items():
    labels[n - nodes_offset, np.array(data['groups']) - groups_offset] = 1
#labels += np.random.normal(scale=.2, size=labels.shape)

## Build the VAE

In [5]:
# VAE Parameters
n_nodes = len(nodes)
adj = nx.adjacency_matrix(g).astype(np.float32)

n_ξ_samples = 5
dim_data, dim_l1, dim_ξ = len(groups), 10, 2
dims = (dim_data, dim_l1, dim_ξ)
use_bias = False

# Actual VAE
q_model, q_codecs = ae.build_q(dims, use_bias=use_bias)
p_builder = ae.build_p_builder(dims, use_bias=use_bias)
vae, vae_codecs = ae.build_vae(
    (q_model, q_codecs), p_builder,
    n_ξ_samples,
    [
        1.0,  # q loss
        1.0,  # p adj loss
        1.0,  # p v loss
    ],
)

## Target function

In [6]:
features = utils.scale_center(labels)

def target_func(batch_adj, required_nodes, final_nodes):
    return [
        np.zeros(1), # ignored
        utils.expand_dims_tile(utils.expand_dims_tile(batch_adj + np.eye(batch_adj.shape[0]), 0, n_ξ_samples), 0, 1),
        utils.expand_dims_tile(features[final_nodes], 1, n_ξ_samples),
    ]

## Before training

$\xi$ averages and distributions for each community, real and predicted adjacency matrices

In [7]:
#plot_ξ_distribution(g, q_model, adj, labels, batches)

In [8]:
#fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4))
#
#im1 = ax1.imshow(nadj)
#ax1.set_title('Real adjacency matrix')
#plt.colorbar(im1, ax=ax1)
#
#x, _, feeds = next(batches(vae, labels, adj, adj.shape[0], adj.shape[0], None))
#adj_pred = vae.predict_on_fed_batch(x, feeds=feeds)[1]
#im2 = ax2.imshow(scipy.special.expit(adj_pred[0].mean(axis=0)))
#im2.set_norm(im1.norm)
#ax2.set_title('Predicted adjacency matrix')
#plt.colorbar(im2, ax=ax2);

## Train

In [9]:
rm -r ./logs

rm: cannot remove './logs': No such file or directory


In [10]:
n_epochs = 1
seeds_per_batch = 5 #len(features)
max_walk_length = 200
p = 1
q = 1
neighbour_samples = 30

steps_per_epoch = int(np.ceil(len(labels) / seeds_per_batch))

In [12]:
# Precompute the CSGraph
_ = graph.get_csgraph(adj, 1, 1)

In [13]:
i = 0
cProfile.run(
'''for batch in batching.batches(vae, adj, labels, target_func,
                              seeds_per_batch, max_walk_length,
                              p=p, q=q, neighbour_samples=neighbour_samples):
    i += 1
    if i >= steps_per_epoch:
        break''',
             'gcn-ae-blogcatalog_batching-n_epochs={}-seeds_per_batch={}-max_walk_length={}-p={}-q={}-neighbour_samples={}-crop={}.profile'
             .format(n_epochs, seeds_per_batch, max_walk_length, p, q, neighbour_samples, crop)
            )

Usually, the cell above complains with "UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.". See these for more details:
* https://stackoverflow.com/questions/35892412/tensorflow-dense-gradient-explanation#35896823
* https://stackoverflow.com/questions/39111373/tensorflow-chaining-tf-gather-produces-indexedslices-warning

## Results

**After training:** $\xi$ averages and distributions for each community, real and predicted adjacency matrices

In [None]:
plot_ξ_distribution((g, l, k), q_model, adj, features, batches)

Predicted adjacency matrix:

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4))

im1 = ax1.imshow(nx.adjacency_matrix(g).todense())
ax1.set_title('Real adjacency matrix')
plt.colorbar(im1, ax=ax1)

x, _, feeds = next(batches(vae, features, adj, adj.shape[0], adj.shape[0], None))
adj_pred = vae.predict_on_fed_batch(x, feeds=feeds)[1]
im2 = ax2.imshow(scipy.special.expit(adj_pred[0].mean(axis=0)))
im2.set_norm(im1.norm)
ax2.set_title('Predicted adjacency matrix')
plt.colorbar(im2, ax=ax2);

In [None]:
plt.imshow(np.random.binomial(1, scipy.special.expit(adj_pred[0].mean(axis=0))))

In [None]:
for layer in q_model.layers:
    if hasattr(layer, 'kernel'):
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 4))#, sharey=True)
        im1 = ax1.imshow(K.eval(layer.kernel).T)
        ax1.set_title('{} kernel'.format(layer.name))
        plt.colorbar(im1, ax=ax1)
    if hasattr(layer, 'bias') and layer.bias is not None:
        im2 = ax2.imshow(K.eval(K.expand_dims(layer.bias, -1)))
        ax2.set_title('{} bias'.format(layer.name))
        plt.colorbar(im2, ax=ax2)

In [None]:
for layer in p_model.layers:
    if hasattr(layer, 'kernel'):
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 4))#, sharey=True)
        im1 = ax1.imshow(K.eval(layer.kernel).T)
        ax1.set_title('{} kernel'.format(layer.name))
        plt.colorbar(im1, ax=ax1)
    if hasattr(layer, 'bias') and layer.bias is not None:
        im2 = ax2.imshow(K.eval(K.expand_dims(layer.bias, -1)))
        ax2.set_title('{} bias'.format(layer.name))
        plt.colorbar(im2, ax=ax2)