### Import Libraries

In [1]:
import matplotlib.pyplot as plt
import import_ipynb
from sklearn.manifold import TSNE

import io, os, sys, types
from IPython import get_ipython
from nbformat import read
import networkx as nx
import numpy as np
import pandas as pd
from tensorflow import keras
import stellargraph as sg

from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
from stellargraph.data import UnsupervisedSampler
from stellargraph.data import BiasedRandomWalk
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.layer import Node2Vec, link_classification
import time
from sklearn.metrics.pairwise import cosine_similarity

# Wall-clock reference point; the last cell subtracts it from time.time() to report elapsed seconds.
# It is taken part-way through the imports, so the imports above this line are not timed.
start = time.time()
from stellargraph import datasets
from IPython.display import display, HTML

from stellargraph import datasets
from IPython.display import display, HTML

%matplotlib inline

###  Import input data

In [2]:
# Edge list of ingredient pairs (columns: source, target, weight).
# The relative path is resolved against the kernel's working directory.
recipe_csv = 'All_recipe.csv'
df = pd.read_csv(recipe_csv)

In [3]:
# Preview the first rows to confirm the source / target / weight columns loaded as expected.
df.head()

Unnamed: 0,source,target,weight
0,feta cheese crumbles,grape tomatoes,15
1,feta cheese crumbles,garbanzo beans,6
2,feta cheese crumbles,garlic,58
3,feta cheese crumbles,romaine lettuce,20
4,feta cheese crumbles,purple onion,83


In [4]:
# (rows, columns) of the raw edge list.
df.shape

(729862, 3)

### Convert dataframe to network graph

In [5]:
# Build the NetworkX graph from the edge list: endpoints come from the
# `source` / `target` columns and `weight` is stored as an edge attribute.
GR = nx.from_pandas_edgelist(
    df,
    source='source',
    target='target',
    edge_attr=["weight"],
)

In [6]:
# `nx.info` was deprecated and then removed from NetworkX (3.0), so the summary
# is built from Graph methods that every NetworkX release provides. The lines
# printed follow the layout of the classic `nx.info` report.
node_count = GR.number_of_nodes()
edge_count = GR.number_of_edges()
print(f"Name: {GR.name}")
print(f"Type: {type(GR).__name__}")
print(f"Number of nodes: {node_count}")
print(f"Number of edges: {edge_count}")
if node_count > 0:
    # Guarded so an empty graph does not divide by zero.
    mean_degree = sum(degree for _, degree in GR.degree()) / node_count
    print(f"Average degree: {mean_degree:8.4f}")

Name: 
Type: Graph
Number of nodes: 6192
Number of edges: 364435
Average degree: 117.7116


### Convert the Graph into a StellarGraph for Machine Learning

In [7]:
# Wrap the NetworkX graph in a StellarGraph so the StellarGraph walkers,
# generators and layers used below can consume it.
GS = StellarGraph.from_networkx(GR)

In [8]:
# Print StellarGraph's own summary of the converted graph (node/edge counts, types, edge-weight statistics).
print(GS.info())

StellarGraph: Undirected multigraph
 Nodes: 6192, Edges: 364435

 Node types:
  default: [6192]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [364435]
        Weights: range=[1, 2659], mean=4.32563, std=20.0651
        Features: none


In [9]:
# Check which container type StellarGraph returns for its node IDs.
type(GS.nodes())

pandas.core.indexes.base.Index

In [10]:
# Node IDs of the StellarGraph; later cells use them to request embeddings and to colour the plot.
# NOTE(review): despite the name, these are node identifiers, not class labels.
subjects = GS.nodes() # Unique nodes

In [11]:
# Node count according to the StellarGraph (the node index supports len() directly).
len(GS.nodes())

6192

In [12]:
# Node count according to the NetworkX graph; should agree with the StellarGraph count.
GR.number_of_nodes()

6192

In [13]:
# Confirm `subjects` is the same container type that GS.nodes() returns.
type(subjects)

pandas.core.indexes.base.Index

## The Node2Vec algorithm

Set the number of random walks to start from each node and the length of each walk.

In [14]:
# Random-walk budget: (walks started from each node, length of each walk).
walk_number, walk_length = 100, 5

Create the biased random walker to perform context node sampling, with the specified parameters.

In [15]:
# Walk settings gathered in one place so they are easy to tune.
walk_settings = dict(
    n=walk_number,  # walks started from each node
    length=walk_length,  # length of each walk
    p=0.5,  # defines probability, 1/p, of returning to source node
    q=2.0,  # defines probability, 1/q, for moving to a node away from the source node
)
# NOTE(review): GS.info() reports edge weights, but no `weighted` argument is
# passed here — presumably the walks treat all edges equally; confirm that
# ignoring the weights is intended.
walker = BiasedRandomWalk(GS, **walk_settings)

Create the UnsupervisedSampler instance with the biased random walker.

In [16]:
# Use every node of the graph as a starting point for the walker.
nodes = GS.nodes().tolist()
unsupervised_samples = UnsupervisedSampler(GS, walker=walker, nodes=nodes)

In [17]:
# Display the sampler object to confirm it was created.
unsupervised_samples

<stellargraph.data.unsupervised_sampler.UnsupervisedSampler at 0x28272e6bf40>

Set the batch size and the number of epochs

In [18]:
epochs = 2  # number of training epochs passed to model.fit
batch_size = 50  # batch size shared by the link generator and the node generator

Define a Node2Vec training generator, which generates a batch of (index of target node, index of context node, label of node pair) pairs per iteration.

In [19]:
# Link generator over the StellarGraph; `.flow(...)` on it supplies the training batches.
generator = Node2VecLinkGenerator(GS, batch_size=batch_size)

Build the Node2Vec model, with the dimension of learned node representations set to 128

In [20]:
# Size of each learned node representation.
emb_size = 128
# Node2Vec encoder built against the link generator defined above.
node2vec = Node2Vec(emb_size=emb_size, generator=generator)

In [21]:
# Input and output tensors of the Node2Vec encoder; they are used to assemble the Keras models below.
x_inp, x_out = node2vec.in_out_tensors()

Use the link_classification function to generate the prediction, with the 'dot' edge embedding generation method and the 'sigmoid' activation, which actually performs the dot product of the input embedding of the target node and the output embedding of the context node followed by a sigmoid activation.

In [22]:
# Prediction head: combine the two node embeddings with the 'dot' method,
# then apply a sigmoid to obtain a single score per node pair.
link_scorer = link_classification(
    edge_embedding_method="dot",
    output_act="sigmoid",
    output_dim=1,
)
prediction = link_scorer(x_out)

link_classification: using 'dot' method to combine node embeddings into edge embeddings


Stack the Node2Vec encoder and prediction layer into a Keras model. Our generator will produce batches of positive and negative context pairs as inputs to the model. Minimizing the binary crossentropy between the outputs and the provided ground truth is much like a regular binary classification task.

In [23]:
# Stack the Node2Vec encoder and the prediction head into one trainable model.
model = keras.Model(inputs=x_inp, outputs=prediction)

# Train as a binary classifier over (target, context) pairs.
model.compile(
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases reject.
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

Train the model.

In [24]:
# NOTE(review): the recorded run of this cell ended with
# "IndexError: too many indices for array: array is 1-dimensional, but 2 were
# indexed". The traceback is not preserved, so the root cause cannot be
# confirmed from this notebook — TODO: re-run with the full traceback and check
# the batches the sampler produces before relying on any cell below.
train_sequence = generator.flow(unsupervised_samples)
fit_options = dict(
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)
history = model.fit(train_sequence, **fit_options)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
# Plot the training history object returned by model.fit.
sg.utils.plot_history(history)

## Visualise Node Embeddings

Build the node based model for predicting node representations from node ids and the learned parameters. Below a Keras model is constructed, with x_inp[0] as input and x_out[0] as output. Note that this model's weights are the same as those of the corresponding node encoder in the previously trained node pair classifier.

In [None]:
# Take the first input/output tensor pair of the encoder and wrap it in its
# own Keras model that maps node ids to embeddings.
x_inp_src, x_out_src = x_inp[0], x_out[0]
embedding_model = keras.Model(outputs=x_out_src, inputs=x_inp_src)

Get the node embeddings from node ids.

In [None]:
# Feed every node id in `subjects` through the embedding model, in batches.
node_generator = Node2VecNodeGenerator(GS, batch_size)
node_gen = node_generator.flow(subjects)
node_embeddings = embedding_model.predict(node_gen, verbose=1, workers=4)

In [None]:
# Length of the first embedding vector (expected to equal emb_size — confirm after a successful run).
len(node_embeddings[0])

Transform the embeddings to 2d space for visualisation.

In [None]:
# Number of embedding rows (expected to equal the number of graph nodes — confirm after a successful run).
len(node_embeddings)

In [None]:
# Projection class used for the 2-D view. PCA is noted as an alternative, but
# it is not among the imports shown at the top of this notebook.
transform = TSNE  # PCA

# Reduce the embeddings to two components so they can be drawn on a plane.
n_plot_dims = 2
trans = transform(n_components=n_plot_dims)
node_embeddings_2d = trans.fit_transform(node_embeddings)

In [None]:
# Draw the 2-D embedding points. `subjects` holds the node ids themselves, so
# each node is coloured by its rank among the sorted unique ids rather than by
# a class label.
alpha = 0.7
unique_ids = np.unique(subjects)
label_map = {node_id: rank for rank, node_id in enumerate(unique_ids)}
node_colours = [label_map[node_id] for node_id in subjects]

# Explicit figure/axes pair instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set(aspect="equal")
ax.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    c=node_colours,
    cmap="jet",
    alpha=alpha,
)
ax.set_title("{} visualization of node embeddings".format(transform.__name__))
plt.show()

In [None]:
# Report the wall-clock time elapsed since `start` was recorded in the first cell.
elapsed = time.time() - start
print(f'{elapsed} seconds')