In [1]:
import multiprocessing
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import stellargraph as sg
import tensorflow.keras.backend as K
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from stellargraph import StellarDiGraph
from stellargraph import StellarGraph
from stellargraph.layer import HinSAGE, link_regression
from stellargraph.mapper import HinSAGELinkGenerator
from tensorflow.keras import Model, optimizers, losses, metrics

In [18]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code; only use it on
    # files produced by a trusted pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Edge tables (reviews, friends) and node tables (users, business) that the
# graph is built from.
reviews = _load_pickle('gnn_reviews_10.pkl')
friends = _load_pickle('gnn_friends_10.pkl')
users = _load_pickle('gnn_users_10.pkl')
business = _load_pickle('gnn_business_10.pkl')


In [19]:
# Heterogeneous graph: two node types ('users', 'business') joined by two
# edge types ('reviews', 'friends'), each supplied as its own table.
node_tables = {'users': users, 'business': business}
edge_tables = {'reviews': reviews, 'friends': friends}
graph = StellarGraph(node_tables, edges=edge_tables)

In [20]:
# graph.info() returns a plain string; print it so the newlines are rendered
# instead of showing the escaped repr ('...\n...') as the cell output.
print(graph.info())

'StellarGraph: Undirected multigraph\n Nodes: 1137450, Edges: 645618\n\n Node types:\n  users: [1105275]\n    Features: float32 vector, length 4\n    Edge types: users-friends->users, users-reviews->business\n  business: [32175]\n    Features: float32 vector, length 151\n    Edge types: business-reviews->users\n\n Edge types:\n    business-reviews->users: [420263]\n        Weights: all 1 (default)\n        Features: float32 vector, length 4\n    users-friends->users: [225355]\n        Weights: all 1 (default)\n        Features: none'

In [5]:
# Split the review edges 80/20 into train and test sets. A fixed random_state
# makes the split (and every metric reported below) reproducible across
# kernel restarts.
# NOTE(review): `graph` is built from the full `reviews` table, so the held-out
# test edges are still present in the graph used for neighbour sampling --
# consider building the graph from the training edges only.
edges_train, edges_test = model_selection.train_test_split(
    reviews, train_size=0.8, test_size=0.2, random_state=42
)

# (source, target) node-id pairs identifying each review link.
edgelist_train = list(edges_train[["source", "target"]].itertuples(index=False))
edgelist_test = list(edges_test[["source", "target"]].itertuples(index=False))

# Regression targets: the "stars" value attached to each review edge.
labels_train = edges_train["stars"]
labels_test = edges_test["stars"]

In [6]:
# Sample sizes handed to HinSAGELinkGenerator; the list length must equal the
# number of HinSAGE layers (asserted where the layer sizes are defined).
# Presumably 8 neighbours at hop 1 and 4 at hop 2 -- confirm against the
# StellarGraph docs.
num_samples = [8, 4]

In [7]:
# Link generator over (users, business) pairs, in batches of 200 links.
generator = HinSAGELinkGenerator(
    graph,
    batch_size=200,
    num_samples=num_samples,
    head_node_types=["users", "business"],
)

# Training batches are shuffled; the test flow keeps the default ordering.
train_gen = generator.flow(edgelist_train, targets=labels_train, shuffle=True)
test_gen = generator.flow(edgelist_test, targets=labels_test)

In [8]:
# Display the schema's type adjacency list for the generator's head node types,
# using len(num_samples) as the second argument (presumably the sampling depth).
generator.schema.type_adjacency_list(generator.head_node_types, len(num_samples))

[('users', [2, 3]),
 ('business', [4]),
 ('users', [5, 6]),
 ('business', [7]),
 ('users', [8, 9]),
 ('users', []),
 ('business', []),
 ('users', []),
 ('users', []),
 ('business', [])]

In [9]:
# Display the raw schema mapping: for each node type, the list of edge types
# (n1, rel, n2) that start from it.
generator.schema.schema

{'business': [EdgeType(n1='business', rel='reviews', n2='users')],
 'users': [EdgeType(n1='users', rel='friends', n2='users'),
  EdgeType(n1='users', rel='reviews', n2='business')]}

In [10]:
# Output width of each HinSAGE layer; there must be exactly one layer per
# entry in num_samples.
hinsage_layer_sizes = [32, 32]
assert len(num_samples) == len(hinsage_layer_sizes)

# Two-layer HinSAGE encoder with bias terms and dropout disabled.
hinsage = HinSAGE(
    layer_sizes=hinsage_layer_sizes,
    generator=generator,
    bias=True,
    dropout=0.0,
)

In [11]:
# Obtain the HinSAGE input tensors (x_inp) and output tensors (x_out); these
# are what the prediction head and the Keras Model are wired to.
x_inp, x_out = hinsage.in_out_tensors()

In [12]:
# Final estimator layer: link_regression with the "concat" method combines the
# node embeddings in x_out into edge embeddings and yields the predicted score.
score_prediction = link_regression(edge_embedding_method="concat")(x_out)

link_regression: using 'concat' method to combine node embeddings into edge embeddings


In [13]:
def root_mean_square_error(s_true, s_pred):
    """Return the square root of the mean squared difference of the inputs.

    Computed with Keras backend ops so it can be passed as a Keras metric.
    """
    squared_error = K.pow(s_true - s_pred, 2)
    mean_squared = K.mean(squared_error)
    return K.sqrt(mean_squared)


# Assemble the end-to-end Keras model: HinSAGE inputs -> predicted score.
model = Model(inputs=x_inp, outputs=score_prediction)

# Train on mean squared error; also report RMSE and MAE.
# `learning_rate` replaces the deprecated `lr` argument, which triggers a
# deprecation warning from the Adam constructor.
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-2),
    loss=losses.mean_squared_error,
    metrics=[root_mean_square_error, metrics.mae],
)

  super(Adam, self).__init__(name, **kwargs)


In [14]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 8, 4)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 8, 151)]     0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 32, 4)]      0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, 32, 151)]    0           []                               
                                                                                              

                                                                                                  
 reshape_10 (Reshape)           (None, 1, 8, 32)     0           ['mean_hin_aggregator[2][0]']    
                                                                                                  
 dropout_15 (Dropout)           (None, 1, 32)        0           ['mean_hin_aggregator[0][0]']    
                                                                                                  
 dropout_13 (Dropout)           (None, 1, 8, 32)     0           ['reshape_8[0][0]']              
                                                                                                  
 dropout_14 (Dropout)           (None, 1, 8, 32)     0           ['reshape_9[0][0]']              
                                                                                                  
 dropout_17 (Dropout)           (None, 1, 32)        0           ['mean_hin_aggregator_1[0][0]']  
          

In [15]:
# Number of workers passed as `workers=` to model.evaluate and model.fit
# (both are called with use_multiprocessing=False).
num_workers = 4

In [16]:
# Score the still-untrained model on the test links as a reference point.
test_metrics = model.evaluate(
    test_gen,
    workers=num_workers,
    use_multiprocessing=False,
    verbose=1,
)

print("Untrained model's Test Evaluation:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(metric_name, metric_value))

Untrained model's Test Evaluation:
	loss: 17.6059
	root_mean_square_error: 4.1954
	mean_absolute_error: 3.9609


In [17]:
# Training configuration.
# NOTE(review): the held-out test generator doubles as validation data here.
fit_options = dict(
    validation_data=test_gen,
    epochs=10,
    shuffle=False,  # train_gen was already created with shuffle=True
    workers=num_workers,
    use_multiprocessing=False,
    verbose=1,
)

history = model.fit(train_gen, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: 

In [None]:
# Plot the per-epoch training/validation curves recorded by model.fit.
# `sg` (stellargraph) is imported in the notebook's import cell rather than
# mid-notebook, so all dependencies are declared in one place.
sg.utils.plot_history(history)

In [None]:
# Score the trained model on the held-out test links.
test_metrics = model.evaluate(
    test_gen,
    verbose=1,
    workers=num_workers,
    use_multiprocessing=False,
)

print("Test Evaluation:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(metric_name, metric_value))

In [None]:
# Ground-truth star ratings of the held-out review edges.
y_true = labels_test
# Star ratings predicted by the model for the test links.
y_pred = model.predict(test_gen)
# Mean baseline: always predict the mean *training* rating. Using the training
# labels keeps the baseline from seeing the test labels it is scored against.
y_pred_baseline = np.full_like(y_pred, np.mean(labels_train))

# Baseline metrics get their own names so the model metrics below do not
# overwrite them.
rmse_baseline = np.sqrt(mean_squared_error(y_true, y_pred_baseline))
mae_baseline = mean_absolute_error(y_true, y_pred_baseline)
print("Mean Baseline Test set metrics:")
print("\troot_mean_square_error = ", rmse_baseline)
print("\tmean_absolute_error = ", mae_baseline)

# Model metrics; `rmse` and `mae` end up holding the model's scores, as before.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
print("\nModel Test set metrics:")
print("\troot_mean_square_error = ", rmse)
print("\tmean_absolute_error = ", mae)

In [None]:
# Overlay the distributions of true (green) and predicted (blue) values.
fig, ax = plt.subplots()
h_true = ax.hist(y_true, bins=30, facecolor="green", alpha=0.5)
h_pred = ax.hist(y_pred, bins=30, facecolor="blue", alpha=0.5)
ax.set_xlabel("ranking")
ax.set_ylabel("count")
ax.legend(("True", "Predicted"))
plt.show()

In [None]:
# Histogram of the true test labels on their own (30 bins).
h_true = plt.hist(y_true, bins=30, facecolor="green", alpha=0.5)

In [None]:
# Histogram of the model's predictions on their own (30 bins).
h_pred = plt.hist(y_pred, bins=30, facecolor="blue", alpha=0.5)