In [1]:
base_dir = '../..'

In [72]:
import os
import random

import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UniformRandomWalk
from stellargraph.data import UnsupervisedSampler
from sklearn.model_selection import train_test_split

import keras 
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score

from stellargraph import globalvar

In [104]:
# This can be simplified once https://github.com/JuliaPy/pyjulia/issues/310 is fixed
from julia.api import LibJulia
api = LibJulia.load()
api.sysimage = os.path.join(base_dir, "julia/sys.so")
api.init_julia()

from julia import Main
Main.eval('include("' + os.path.join(base_dir, 'julia/dataset.jl') + '")')

<PyCall.jlwrap Main.Dataset>

## Settings

In [91]:
testprop = 0.2
bias = False
layer_sizes = [32, 16]

### Load the dataset

**Load adjacency and features from npz**

In [92]:
# Load npz file
cora = np.load(os.path.join(base_dir, 'datasets/gae-benchmarks/cora.npz'))

# Extract adjacency and graph
adj = sparse.coo_matrix((cora['adjdata'], (cora['adjrow'], cora['adjcol'])))
Gnx = nx.from_scipy_sparse_matrix(adj, edge_attribute='label')
nx.set_edge_attributes(Gnx, 'cites', 'label')
nx.set_node_attributes(Gnx, "paper", "label")

# Extract features
node_features = pd.DataFrame(cora['features'], dtype=int)

**Create a nodes test set**

In [105]:
def to_julia_edgelist(g):
    return 1 + nx.to_pandas_edgelist(Gnx)[['source', 'target']].values

def from_julia_edgelist(edgelist):
    pd_edgelist = pd.DataFrame(np.array(edgelist) - 1, columns=['source', 'target'])
    pd_edgelist['label'] = 'cites'
    g = nx.from_pandas_edgelist(pd_edgelist, edge_attr="label")
    nx.set_node_attributes(g, "paper", "label")
    return g

gtrain_edgelist, edges_test_true, edges_test_false = Main.Dataset.make_edges_test_set(to_julia_edgelist(Gnx), testprop)
edges_test_true = edges_test_true - 1
edges_test_false = edges_test_false - 1
Gtrain_nx = from_julia_edgelist(gtrain_edgelist)

Recover nodes that are now isolated in Gtrain_nx, not seen through the edgelist

In [117]:
for n in Gnx.nodes():
    if n not in Gtrain_nx.nodes():
        Gtrain_nx.add_node(n)
nx.set_node_attributes(Gtrain_nx, "paper", "label")

## Train the embedding model

**1. Create the Stellargraph with node features.**

In [118]:
G = sg.StellarGraph(Gnx, node_features=node_features)
Gtrain = sg.StellarGraph(Gtrain_nx, node_features=node_features)
print(G.info())
print(Gtrain.info())

StellarGraph: Undirected multigraph
 Nodes: 2708, Edges: 5278

 Node types:
  paper: [2708]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [5278]

StellarGraph: Undirected multigraph
 Nodes: 2708, Edges: 4222

 Node types:
  paper: [2708]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [4222]



**2. Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk, and random seed.**

In [119]:
actual_nodes_train = list(Gtrain.nodes())
#assert set(nodes_train).issuperset(actual_nodes_train)
number_of_walks = 1
length = 5

**3. Create the UnsupervisedSampler instance with the relevant parameters passed to it.**

In [120]:
unsupervised_samples = UnsupervisedSampler(Gtrain, nodes=actual_nodes_train, length=length, number_of_walks=number_of_walks)

The graph G together with the unsupervised sampler will be used to generate samples.

**5. Create a node pair generator:**

Next, create the node pair generator for sampling and streaming the training data to the model. The node pair generator essentially "maps" pairs of nodes `(target, context)` to the input of GraphSAGE: it either takes minibatches of node pairs, or an `UnsupervisedSampler` instance which generates the minibatches of node pairs on demand. The generator samples 2-hop subgraphs with `(target, context)` head nodes extracted from those pairs, and feeds them, together with the corresponding binary labels indicating which pair represent positive or negative sample, to the input layer of the node pair classifier with GraphSAGE node encoder, for SGD updates of the model parameters.

Specify:
1. The minibatch size (number of node pairs per minibatch).
2. The number of epochs for training the model.
3. The sizes of 1- and 2-hop neighbor samples for GraphSAGE:

Note that the length of `num_samples` list defines the number of layers/iterations in the GraphSAGE encoder. In this example, we are defining a 2-layer GraphSAGE encoder.

In [121]:
batch_size = 50
epochs = 4
num_samples = [10, 5]

In the following we show the working of node pair generator with the UnsupervisedSampler, which will generate samples on demand.

In [122]:
train_gen = GraphSAGELinkGenerator(Gtrain, batch_size, num_samples).flow(unsupervised_samples)

Running GraphSAGELinkGenerator with an estimated 542 batches generated on the fly per epoch.


Build the model: a 2-layer GraphSAGE encoder acting as node representation learner, with a link classification layer on concatenated `(node1, node2)` node embeddings.

GraphSAGE part of the model, with hidden layer sizes of 50 for both GraphSAGE layers, a bias term, and no dropout. (Dropout can be switched on by specifying a positive dropout rate, 0 < dropout < 1).
Note that the length of `layer_sizes` list must be equal to the length of `num_samples`, as `len(num_samples)` defines the number of hops (layers) in the GraphSAGE encoder.

In [123]:
assert len(layer_sizes) == len(num_samples)

graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=0.0, normalize="l2"
    )

In [124]:
# Build the model and expose input and output sockets of graphsage, for node pair inputs:
x_inp, x_out = graphsage.build(flatten_output=False)

Final node pair classification layer that takes a pair of nodes' embeddings produced by `graphsage` encoder, applies a binary operator to them to produce the corresponding node pair embedding ('ip' for inner product; other options for the binary operator can be seen by running a cell with `?link_classification` in it), and passes it through a dense layer:

In [125]:
prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method='ip'
    )(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


Stack the GraphSAGE encoder and prediction layer into a Keras model, and specify the loss

In [126]:
model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

**6. Train the model.**

In [134]:
history = model.fit_generator(
        train_gen,
        epochs=epochs,
        verbose=1,
        use_multiprocessing=False,
        workers=0,
        shuffle=True,
    )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


## Evaluate link prediction performance

In [135]:
edges_test_all = np.concatenate([edges_test_true, edges_test_false])
edges_real_all = np.concatenate([np.ones(edges_test_true.shape[0]), np.zeros(edges_test_false.shape[0])])
test_gen = GraphSAGELinkGenerator(Gtrain, batch_size, num_samples).flow(edges_test_all)
edges_pred_all = model.predict_generator(test_gen)[:,0]

In [136]:
roc_auc_score(edges_real_all, edges_pred_all), average_precision_score(edges_real_all, edges_pred_all)

(0.9026540260560145, 0.905600714352716)

## Get embeddings for all nodes

In [17]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from stellargraph.mapper import GraphSAGENodeGenerator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

**Building a new node-based model**

The `(src, dst)` node pair classifier `model` has two identical node encoders: one for source nodes in the node pairs, the other for destination nodes in the node pairs passed to the model. We can use either of the two identical encoders to evaluate node embeddings. Below we create an embedding model by defining a new Keras model with `x_inp_src` (a list of odd elements in `x_inp`) and `x_out_src` (the 1st element in `x_out`) as input and output, respectively. Note that this model's weights are the same as those of the corresponding node encoder in the previously trained node pair classifier.

In [18]:
x_inp_src = x_inp[0::2]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

We also need a node generator to feed graph nodes to `embedding_model`. We want to evaluate node embeddings for all nodes in the graph:

In [19]:
node_ids = sorted(G.nodes)
node_gen = GraphSAGENodeGenerator(G, batch_size, num_samples).flow(node_ids)

We now use `node_gen` to feed all nodes into the embedding model and extract their embeddings:

In [20]:
emb = embedding_model.predict_generator(node_gen, workers=4, verbose=1)
node_embeddings = emb[:, 0, :]



## Visualize the node embeddings 

In [21]:
#node_subject = np.where(cora['labels'])[1][node_ids]
#
#X = node_embeddings
#if X.shape[1] > 2:
#    transform = TSNE #PCA 
#
#    trans = transform(n_components=2)
#    emb_transformed = pd.DataFrame(trans.fit_transform(X), index=node_ids)
#    emb_transformed['label'] = node_subject
#else:
#    emb_transformed = pd.DataFrame(X, index=node_ids)
#    emb_transformed = emb_transformed.rename(columns = {'0': 0, '1': 1})
#    emb_transformed['label'] = node_subject

In [22]:
#alpha = 0.7
#
#fig, ax = plt.subplots(figsize=(7,7))
#ax.scatter(emb_transformed[0], emb_transformed[1], c=emb_transformed['label'].astype("category"), 
#            cmap="jet", alpha=alpha)
#ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
#plt.title('{} visualization of GraphSAGE embeddings for cora dataset'.format(transform.__name__))
#plt.show()

## Downstream node classification

In [23]:
# X will hold the 50 input features (node embeddings)
X = node_embeddings  
# y holds the corresponding target values
y = np.where(cora['labels'])[1]

We train a Logistic Regression classifier on the training data. 

In [24]:
X_train, X_test, y_train, y_test = X[nodes_train, :], X[nodes_test, :], y[nodes_train], y[nodes_test]

In [25]:
clf = LogisticRegression(verbose=0, solver='liblinear', multi_class="ovr")
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

Predict the hold out test set.

In [26]:
y_pred = clf.predict(X_test)

Calculate the accuracy of the classifier on the test set.

In [27]:
f1_score(y_test, y_pred, average='macro')

0.7044132546668579

In [28]:
f1_score(y_test, y_pred, average='micro')

0.739290989660266