# Graph embedding using SkipGram

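This is an embedding of the graph built from all sources. Note that, as shown below, the model is trained on a connected holdout of the graph, while the whole graph is used only for the early stopping.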

In [1]:
import silence_tensorflow.auto # Import needed to avoid TensorFlow warnings and general useless infos.

In [2]:
# Directory where the graph archive is downloaded and extracted.
graph_data_dir = "graph"
# Label of this experiment; it becomes part of the names of the saved
# weights and embedding files.
exp_name = "98_2 training_test_epoch_500_delta_0.0001"

## Loading the graphs
We download the KG-COVID-19 graph from its Zenodo repository and load it as an undirected graph.

In [3]:
import os
import shutil
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so it is imported explicitly (this also binds the name `urllib`).
import urllib.request

archive_url = "https://zenodo.org/record/4011267/files/kg-covid-19-skipgram-aug-2020.tar.gz"
archive_path = graph_data_dir + "/kg-covid-19-skipgram-aug-2020.tar.gz"

os.makedirs(graph_data_dir, exist_ok=True)
# The download is skipped when the archive is already available locally.
if not os.path.exists(archive_path):
    # Download to a temporary ".part" file first: if the transfer is
    # interrupted, no truncated archive is left behind that a later run
    # would mistake for a complete download.
    partial_path = archive_path + ".part"
    with urllib.request.urlopen(archive_url) as response, \
            open(partial_path, 'wb') as out_file:
        # Stream the response to disk in chunks instead of loading the whole
        # archive in memory.
        shutil.copyfileobj(response, out_file)
    os.replace(partial_path, archive_path)

In [4]:
import os
import subprocess

archive_path = graph_data_dir + "/kg-covid-19-skipgram-aug-2020.tar.gz"

# The arguments are passed as a list (no shell involved), so paths containing
# spaces or shell metacharacters are handled correctly, and `check=True`
# raises CalledProcessError if tar fails instead of silently continuing.
# The return code is left as the last expression so that the cell still
# displays it.
subprocess.run(
    ["tar", "-xvzf", archive_path, "-C", graph_data_dir],
    check=True
).returncode

0

In [5]:
%%time
from ensmallen_graph import EnsmallenGraph

graph = EnsmallenGraph.from_csv(
    edge_path = graph_data_dir + "/merged-kg_edges.tsv",
    sources_column="subject",
    destinations_column="object",
    directed=False,
    node_path = graph_data_dir + "/merged-kg_nodes.tsv",
    nodes_column = 'id',
    node_types_column = 'category',
    default_node_type = 'biolink:NamedThing'
)

CPU times: user 1min 55s, sys: 8.18 s, total: 2min 4s
Wall time: 2min 3s


First, we print a short report showing all the available graph details, including the number of edges, nodes, trap nodes and both the connected components and the strongly connected components.

In [6]:
# Display the dictionary of summary statistics of the whole graph.
graph.report()

{'unique_node_types_number': '36',
 'selfloops_rate': '0.000015391581103247148',
 'is_multigraph': 'false',
 'mean_number_of_types_for_edge': '0',
 'edges_number': '30861027',
 'multigraph_edges_ratio': '0',
 'traps_rate': '0.021906677500566116',
 'selfloops_number': '475',
 'degrees_mean': '82.21604837957722',
 'singleton_nodes': '8223',
 'degrees_median': '6',
 'degrees_min': '0',
 'nodes_number': '375365',
 'bidirectional_rate': '1',
 'connected_components_number': '8976',
 'strongly_connected_components_number': '8976',
 'multigraph_edges_number': '0',
 'degrees_mode': '1',
 'degrees_max': '90378',
 'is_directed': 'false',
 'unique_edge_types_number': '0',
 'density': '0.00021902960686152735'}

The following checks are not strictly necessary, but are offered as sanity checks:

### Considered parameters
We are going to use the following parameters:

- **Walk lengths:** $100$ nodes.
- **Batch size:** $2^{9} = 512$ walks per batch.
- **Walk iterations:** $20$ iterations on the graph.
- **Window size:** $4$ nodes, meaning $4$ on the left and $4$ on the right of the center nodes. Consider that the first *window_size* values on the left and the right of the walks will be trimmed.
- **Return weight, inverse of $p$:** $1.0$.
- **Explore weight, inverse of $q$:** $1.0$.
- **Embedding size:** $100$.
- **Negative samples:** For the purpose of the [NCE function negative samples](https://www.tensorflow.org/api_docs/python/tf/nn/nce_loss), we are going to use $30$. These are the number of negative classes to randomly sample per batch. This single sample of negative classes is evaluated for each element in the batch.
- **Optimizer:** [Nadam](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Nadam).
- **Early stopping parameters:** We are going to use an Early Stopping criterion on the *validation loss*, with patience $5$ and delta $0.0001$.
- **Epochs:** The model will be trained up to $500$ epochs.
- **Learning rate:** default

In [7]:
# Split the graph into a training graph and a validation graph, with a fixed
# seed so that the split is reproducible.
# NOTE(review): 0.97626 is presumably the fraction of edges kept for training
# (the reports below show 30128387 of 30861027 edges in the training graph),
# while `exp_name` mentions "98_2" -- confirm that the two are meant to agree.
training, validation = graph.connected_holdout(0.97626, seed=42)

In [8]:
# Display the dictionary of summary statistics of the training graph.
training.report()

{'connected_components_number': '85456',
 'nodes_number': '375365',
 'mean_number_of_types_for_edge': '0',
 'bidirectional_rate': '1',
 'multigraph_edges_ratio': '0',
 'multigraph_edges_number': '0',
 'is_multigraph': 'false',
 'traps_rate': '0.22393403753679753',
 'degrees_min': '0',
 'degrees_mean': '80.26424147163428',
 'density': '0.00021382984953747492',
 'selfloops_number': '475',
 'strongly_connected_components_number': '85456',
 'singleton_nodes': '84057',
 'unique_edge_types_number': '0',
 'degrees_max': '90342',
 'unique_node_types_number': '36',
 'is_directed': 'false',
 'degrees_median': '5',
 'degrees_mode': '0',
 'edges_number': '30128387',
 'selfloops_rate': '0.000015765862274671392'}

In [9]:
# Display the dictionary of summary statistics of the validation graph.
validation.report()

{'degrees_median': '1',
 'bidirectional_rate': '1',
 'degrees_mode': '1',
 'singleton_nodes': '8312',
 'unique_edge_types_number': '0',
 'density': '0.000005199757324052417',
 'degrees_min': '0',
 'mean_number_of_types_for_edge': '0',
 'unique_node_types_number': '36',
 'multigraph_edges_ratio': '0',
 'traps_rate': '0.022143780054080693',
 'selfloops_number': '0',
 'nodes_number': '375365',
 'selfloops_rate': '0',
 'degrees_max': '13829',
 'is_directed': 'false',
 'connected_components_number': '9045',
 'strongly_connected_components_number': '9045',
 'degrees_mean': '1.9518069079429354',
 'is_multigraph': 'false',
 'edges_number': '732640',
 'multigraph_edges_number': '0'}

In [10]:
# The original graph must compare as greater than each of its holdouts.
for holdout in (training, validation):
    assert graph > holdout

# Putting the two holdouts back together must give the original graph.
# NOTE: the `+` operator will eventually fail here; replace it with `|` to fix.
assert (training + validation).contains(graph)
assert graph.contains(training + validation)

# The two holdouts must not overlap, whichever side the check starts from.
assert not (training.overlaps(validation) or validation.overlaps(training))

#### Setting up the parameters

In [11]:
# Random-walk sampling.
walk_length = 100
batch_size = 2 ** 9  # 512
iterations = 20
window_size = 4

# Node2Vec bias parameters: the sequences receive their inverses
# (return weight = 1/p, explore weight = 1/q).
p = 1.0
q = 1.0

# SkipGram model.
embedding_size = 100
negatives_samples = 30

# Training loop and early stopping.
epochs = 500
patience = 5
delta = 0.0001

#### Creating the training and validation Keras sequences

## Creating the SkipGram model
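We are going to set up the model so that it can use, if available, multiple GPUs. Note that the `MirroredStrategy` scope is currently commented out in the model cell, so the model is built without it.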

In [12]:
from embiggen import Node2VecSequence

# Both sequences are built with exactly the same walk settings: they only
# differ in the graph the walks are sampled from.
walk_settings = dict(
    walk_length=walk_length,
    batch_size=batch_size,
    iterations=iterations,
    window_size=window_size,
    return_weight=1/p,
    explore_weight=1/q,
)

training_sequence = Node2VecSequence(training, **walk_settings)

# Here we use the entire graph. This will only be used for the early stopping.
validation_sequence = Node2VecSequence(graph, **walk_settings)

In [13]:
from tensorflow.distribute import MirroredStrategy
from tensorflow.keras.optimizers import Nadam
from embiggen import SkipGram

# NOTE(review): the multi-GPU strategy below is commented out, so the model is
# built outside of any distribution scope; `MirroredStrategy` and `Nadam` are
# imported but not used in this cell. To enable the strategy, uncomment the
# two lines below and indent the model construction under the `with` block.
# strategy = MirroredStrategy()
#with strategy.scope():
model = SkipGram(
    # The vocabulary is the set of nodes of the training graph.
    vocabulary_size=training.get_nodes_number(),
    embedding_size=embedding_size,
    window_size=window_size,
    negatives_samples=negatives_samples,
)

# Display the layers of the model and their number of parameters.
model.summary()

Model: "SkipGram"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
words_embedding (InputLayer)    [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 100)       37536500    words_embedding[0][0]            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 100)          0           embedding[0][0]                  
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 8)]          0                                            
___________________________________________________________________________________________

## Tuning the SkipGram model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping on the validation loss, using the `delta` and `patience`
# values defined with the other parameters; the best weights are restored
# when the training stops.
early_stopping = EarlyStopping(
    monitor="val_loss",
    min_delta=delta,
    patience=patience,
    restore_best_weights=True,
)

# Train on the walks of the training graph and evaluate on the validation
# sequence at the end of every epoch.
history = model.fit(
    training_sequence,
    epochs=epochs,
    steps_per_epoch=training_sequence.steps_per_epoch,
    validation_data=validation_sequence,
    validation_steps=validation_sequence.steps_per_epoch,
    callbacks=[early_stopping],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
 50/569 [=>............................] - ETA: 9:38 - loss: 45.8378

In [None]:
### Saving the model weights
# We save the obtained model weights:

In [None]:
# File name pattern: "<model name>_<experiment name>_weights.h5".
weights_path = f"{model.name}_{exp_name}_weights.h5"
model.save_weights(weights_path)

### Visualizing the training history
We can visualize the performance of the model during the training process as follows:

In [None]:
from plot_keras_history import plot_history

# Plot the training history returned by `model.fit`.
plot_history(history)

There may be some hiccups in the plot of the history if the model is reloaded from stored weights: [this is a known Keras issue](https://github.com/keras-team/keras/issues/4875) and is not related to either the holdouts used or the model.

## Saving the obtained embeddings
Finally, we save our hard-earned model embeddings. In another notebook we will show how to do link prediction on the obtained embedding.

In [None]:
import numpy as np

# File name pattern: "<model name>_<experiment name>_embedding.npy".
embedding_path = f"{model.name}_{exp_name}_embedding.npy"
np.save(embedding_path, model.embedding)