In [27]:
import matplotlib.pyplot as plt
from math import isclose
from sklearn.decomposition import PCA
import os
import networkx as nx
import numpy as np
import pandas as pd
from stellargraph import StellarGraph, datasets
from stellargraph.data import EdgeSplitter, UnsupervisedSampler
from stellargraph.layer import link_classification
from tensorflow import keras
from collections import Counter
import multiprocessing
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split

%matplotlib inline

# Load Custom Dataset

In [3]:
from networkx.readwrite import json_graph
import json

In [4]:
def read_json_file(filename):
    with open(filename) as f:
        js_graph = json.load(f)
    return json_graph.node_link_graph(js_graph)

In [5]:
%time
transport=read_json_file("bus_transport_graph.json")

Wall time: 0 ns


In [6]:
g_feature_attr = transport.copy()


def compute_features(lat, lon):
    # in general this could compute something based on other features, but for this example,
    # we don't have any other features, so we'll just do something basic with the node_id
    return [lat, lon]


for node_id, node_data in g_feature_attr.nodes(data=True):
    node_data["feature"] = compute_features(node_data["latitude"],node_data["longitude"])
    
# let's see what some of them look like:
g_feature_attr.nodes["75009"]

{'type': 'bus_stop',
 'latitude': 1.35407552367477,
 'longitude': 103.94339098473914,
 'feature': [1.35407552367477, 103.94339098473914]}

In [7]:
transport_sg=StellarGraph.from_networkx(g_feature_attr, node_type_default="bus_stop", edge_type_default="bus_route", node_features='feature')
print(transport_sg.info())

StellarDiGraph: Directed multigraph
 Nodes: 5083, Edges: 7459

 Node types:
  bus_stop: [5083]
    Features: float32 vector, length 2
    Edge types: bus_stop-bus_route->bus_stop

 Edge types:
    bus_stop-bus_route->bus_stop: [7459]
        Weights: range=[0, 37.8], mean=0.797319, std=2.41502
        Features: none


In [54]:
# Define an edge splitter on the original graph:
edge_splitter_graph = EdgeSplitter(transport_sg)

# Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from graph, and obtain the
# reduced graph graph_test with the sampled links removed:
graph_test, examples_test, labels_test = edge_splitter_graph.train_test_split(
    p=0.1, method="global"
)

print(graph_test.info())
print(len(examples_test), len(labels_test))

** Sampled 745 positive and 745 negative edges. **
StellarDiGraph: Directed multigraph
 Nodes: 5083, Edges: 6714

 Node types:
  bus_stop: [5083]
    Features: float32 vector, length 2
    Edge types: bus_stop-bus_route->bus_stop

 Edge types:
    bus_stop-bus_route->bus_stop: [6714]
        Weights: range=[0, 37.8], mean=0.790825, std=2.39808
        Features: none
1490 1490


In [57]:
# Do the same process to compute a training subset from within the test graph
edge_splitter_train = EdgeSplitter(graph_test)
graph_train, examples, labels = edge_splitter_train.train_test_split(
    p=0.75, method="global"
)
print(graph_train.info())
print(len(examples), len(labels))

** Sampled 5035 positive and 5035 negative edges. **
StellarDiGraph: Directed multigraph
 Nodes: 5083, Edges: 1679

 Node types:
  bus_stop: [5083]
    Features: float32 vector, length 2
    Edge types: bus_stop-bus_route->bus_stop

 Edge types:
    bus_stop-bus_route->bus_stop: [1679]
        Weights: range=[0, 34.2], mean=0.786004, std=2.5441
        Features: none
10070 10070


In [58]:
pd.DataFrame(
    [
        (
            "Training Set",
            len(examples),
            "Train Graph",
            "Test Graph",
            "Train the Link Classifier",
        ),
        (
            "Test set",
            len(examples_test),
            "Test Graph",
            "Full Graph",
            "Evaluate the best Link Classifier",
        ),
    ],
    columns=("Split", "Number of Examples", "Hidden from", "Picked from", "Use"),
).set_index("Split")

Unnamed: 0_level_0,Number of Examples,Hidden from,Picked from,Use
Split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Training Set,10070,Train Graph,Test Graph,Train the Link Classifier
Test set,1490,Test Graph,Full Graph,Evaluate the best Link Classifier


# Model Definitions

## embeddings

In [59]:
from stellargraph.data import BiasedRandomWalk


def create_biased_random_walker(graph, walk_num, walk_length):
    # parameter settings for "p" and "q":
    p = 1.0
    q = 1.0
    return BiasedRandomWalk(graph, n=walk_num, length=walk_length, p=p, q=q)

In [60]:
walk_length=5
epochs=6
batch_size=50

In [61]:
from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator
from stellargraph.layer import Attri2Vec


def attri2vec_embedding(graph, name):

    # Set the embedding dimension and walk number:
    dimension = [128]
    walk_number = 4

    print(f"Training Attri2Vec for '{name}':")

    graph_node_list = list(graph.nodes())

    # Create the biased random walker to generate random walks
    walker = create_biased_random_walker(graph, walk_number, walk_length)

    # Create the unsupervised sampler to sample (target, context) pairs from random walks
    unsupervised_samples = UnsupervisedSampler(
        graph, nodes=graph_node_list, walker=walker
    )

    # Define an Attri2Vec training generator, which generates batches of training pairs
    generator = Attri2VecLinkGenerator(graph, batch_size)

    # Create the Attri2Vec model
    attri2vec = Attri2Vec(
        layer_sizes=dimension, generator=generator, bias=False, normalize=None
    )

    # Build the model and expose input and output sockets of Attri2Vec, for node pair inputs
    x_inp, x_out = attri2vec.in_out_tensors()

    # Use the link_classification function to generate the output of the Attri2Vec model
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    # Stack the Attri2Vec encoder and prediction layer into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Train the model
    model.fit(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=2,
        use_multiprocessing=False,
        workers=1,
        shuffle=True,
    )

    # Build the model to predict node representations from node features with the learned Attri2Vec model parameters
    x_inp_src = x_inp[0]
    x_out_src = x_out[0]
    embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    # Get representations for all nodes in ``graph``
    node_gen = Attri2VecNodeGenerator(graph, batch_size).flow(graph_node_list)
    node_embeddings = embedding_model.predict(node_gen, workers=1, verbose=0)

    def get_embedding(u):
        u_index = graph_node_list.index(u)
        return node_embeddings[u_index]

    return get_embedding

## Pipeline

In [62]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler


# 1. link embeddings
def link_examples_to_features(link_examples, transform_node, binary_operator):
    return [
        binary_operator(transform_node(src), transform_node(dst))
        for src, dst in link_examples
    ]


# 2. training classifier
def train_link_prediction_model(
    link_examples, link_labels, get_embedding, binary_operator
):
    clf = link_prediction_classifier()
    link_features = link_examples_to_features(
        link_examples, get_embedding, binary_operator
    )
    clf.fit(link_features, link_labels)
    return clf


def link_prediction_classifier(max_iter=5000):
    lr_clf = LogisticRegressionCV(Cs=10, cv=10, scoring="roc_auc", max_iter=max_iter)
    return Pipeline(steps=[("sc", StandardScaler()), ("clf", lr_clf)])


# 3. and 4. evaluate classifier
def evaluate_link_prediction_model(
    clf, link_examples_test, link_labels_test, get_embedding, binary_operator
):
    link_features_test = link_examples_to_features(
        link_examples_test, get_embedding, binary_operator
    )
    # print(link_features_test)
    score,preds = evaluate_roc_auc(clf, link_features_test, link_labels_test)
    
    return score,preds


def evaluate_roc_auc(clf, link_features, link_labels):
    predicted = clf.predict_proba(link_features)
    

    # check which class corresponds to positive links
    positive_column = list(clf.classes_).index(1)
    return roc_auc_score(link_labels, predicted[:, positive_column]),predicted

In [63]:
def operator_l2(u, v):
    return (u - v) ** 2



# def run_link_prediction(binary_operator, embedding_train):
#     clf = train_link_prediction_model(
#         examples_train, labels_train, embedding_train, binary_operator
#     )
#     # score, preds, pos_col = evaluate_link_prediction_model(
#     #     clf,
#     #     examples_model_selection,
#     #     labels_model_selection,
#     #     embedding_train,
#     #     binary_operator,
#     # )
#     return clf
#     return {
#         "classifier": clf,
#         # "binary_operator": binary_operator,
#         # "score": score,
#     }


# binary_operators = [ operator_l2]

In [69]:
def train_and_evaluate(embedding, graph=graph_test):
    
    embedding_train = embedding(graph, "Train Graph")

    # Train the link classification model with the learned embedding
    clf = train_link_prediction_model(
        examples, labels, embedding_train, operator_l2
    )
    
    test_score, preds, pos_col = evaluate_link_prediction_model(
        clf,
        examples_test,
        labels_test,
        embedding_train,
        operator_l2,
    )

    return embedding_train, clf, test_score, preds, pos_col

# Training

In [68]:
emb_fn, clf, attri2vec_result, preds, pos_col= train_and_evaluate(attri2vec_embedding)

Training Attri2Vec for 'Train Graph':
link_classification: using 'ip' method to combine node embeddings into edge embeddings
  ...
    to  
  ['...']
Train for 3241 steps
Epoch 1/6
3241/3241 - 25s - loss: 0.7094 - binary_accuracy: 0.5084
Epoch 2/6
3241/3241 - 25s - loss: 0.6961 - binary_accuracy: 0.5348
Epoch 3/6
3241/3241 - 25s - loss: 0.6899 - binary_accuracy: 0.5422
Epoch 4/6
3241/3241 - 25s - loss: 0.6878 - binary_accuracy: 0.5450
Epoch 5/6
3241/3241 - 25s - loss: 0.6873 - binary_accuracy: 0.5472
Epoch 6/6
3241/3241 - 25s - loss: 0.6872 - binary_accuracy: 0.5470


ValueError: not enough values to unpack (expected 3, got 2)

In [34]:
print(preds)

[[0.15486259 0.84513741]
 [0.15538601 0.84461399]
 [0.15480643 0.84519357]
 ...
 [0.96835181 0.03164819]
 [0.57492231 0.42507769]
 [0.97472837 0.02527163]]


In [37]:
print(pos_col)

1


# Test

In [52]:
from stellargraph.mapper import Attri2VecLinkGenerator, Attri2VecNodeGenerator
from stellargraph.layer import Attri2Vec


def attri2vec_embedding(graph, name):

    # Set the embedding dimension and walk number:
    dimension = [128]
    walk_number = 4

    print(f"Training Attri2Vec for '{name}':")

    graph_node_list = list(graph.nodes())

    # Create the biased random walker to generate random walks
    walker = create_biased_random_walker(graph, walk_number, walk_length)

    # Create the unsupervised sampler to sample (target, context) pairs from random walks
    unsupervised_samples = UnsupervisedSampler(
        graph, nodes=graph_node_list, walker=walker
    )

    # Define an Attri2Vec training generator, which generates batches of training pairs
    generator = Attri2VecLinkGenerator(graph, batch_size)

    # Create the Attri2Vec model
    attri2vec = Attri2Vec(
        layer_sizes=dimension, generator=generator, bias=False, normalize=None
    )

    # Build the model and expose input and output sockets of Attri2Vec, for node pair inputs
    x_inp, x_out = attri2vec.in_out_tensors()

    # Use the link_classification function to generate the output of the Attri2Vec model
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    # Stack the Attri2Vec encoder and prediction layer into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Train the model
    model.fit(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=2,
        use_multiprocessing=False,
        workers=1,
        shuffle=True,
    )

    # Build the model to predict node representations from node features with the learned Attri2Vec model parameters
    x_inp_src = x_inp[0]
    x_out_src = x_out[0]
    embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    # Get representations for all nodes in ``graph``
    node_gen = Attri2VecNodeGenerator(graph, batch_size).flow(graph_node_list)
    node_embeddings = embedding_model.predict(node_gen, workers=1, verbose=0)
    return node_embeddings

    def get_embedding(u):
        u_index = graph_node_list.index(u)
        return node_embeddings[u_index]

    return get_embedding

In [53]:
train_emb=attri2vec_embedding(graph_train, "train")

Training Attri2Vec for 'train':
link_classification: using 'ip' method to combine node embeddings into edge embeddings
  ...
    to  
  ['...']
Train for 3183 steps
Epoch 1/6
3183/3183 - 24s - loss: 0.7032 - binary_accuracy: 0.5112
Epoch 2/6
3183/3183 - 26s - loss: 0.6894 - binary_accuracy: 0.5423
Epoch 3/6
3183/3183 - 26s - loss: 0.6838 - binary_accuracy: 0.5505
Epoch 4/6
3183/3183 - 24s - loss: 0.6811 - binary_accuracy: 0.5542
Epoch 5/6
3183/3183 - 25s - loss: 0.6806 - binary_accuracy: 0.5570
Epoch 6/6
3183/3183 - 28s - loss: 0.6805 - binary_accuracy: 0.5574


In [54]:
print(train_emb)

[[5.67323468e-08 5.33161540e-07 1.17049311e-07 ... 7.55333716e-08
  1.01518672e-05 2.96439637e-08]
 [5.67576706e-08 5.33293246e-07 1.17110716e-07 ... 7.55705472e-08
  1.01510923e-05 2.96543128e-08]
 [5.67904834e-08 5.33270850e-07 1.17219315e-07 ... 7.56286624e-08
  1.01400919e-05 2.96590095e-08]
 ...
 [5.85465152e-08 5.48325033e-07 1.20560699e-07 ... 7.78833638e-08
  1.03970606e-05 3.06492041e-08]
 [5.85888422e-08 5.48482944e-07 1.20672397e-07 ... 7.79487479e-08
  1.03926186e-05 3.06637631e-08]
 [5.86011488e-08 5.48163484e-07 1.20760930e-07 ... 7.79871172e-08
  1.03723705e-05 3.06513641e-08]]


In [56]:
train_emb.shape

(5083, 128)

In [59]:
print(graph_train.info())

StellarDiGraph: Directed multigraph
 Nodes: 5083, Edges: 6043

 Node types:
  bus_stop: [5083]
    Features: float32 vector, length 2
    Edge types: bus_stop-bus_route->bus_stop

 Edge types:
    bus_stop-bus_route->bus_stop: [6043]
        Weights: range=[0, 37.8], mean=0.822836, std=2.51736
        Features: none


In [61]:
print(transport_sg.info())

StellarDiGraph: Directed multigraph
 Nodes: 5083, Edges: 7459

 Node types:
  bus_stop: [5083]
    Features: float32 vector, length 2
    Edge types: bus_stop-bus_route->bus_stop

 Edge types:
    bus_stop-bus_route->bus_stop: [7459]
        Weights: range=[0, 37.8], mean=0.797319, std=2.41502
        Features: none


In [62]:
g = StellarGraph.to_networkx(graph_train)

In [64]:
g

<networkx.classes.multidigraph.MultiDiGraph at 0x1b27be61748>

In [66]:
g.nodes()['75009']

{'label': 'bus_stop',
 'feature': array([  1.3540756, 103.94339  ], dtype=float32)}