# BUILDING TRAIN AND TEST DATASET + NODE EMBEDDINGS

In [1]:
# #  UNCOMMENT IF USING COLAB
# # install some package if running on Google Colab
# import sys
# if 'google.colab' in sys.modules:
#   %pip install stellargraph==1.2.1
#   %pip install gensim
#   %pip install fastnode2vec

#   from google.colab import drive
#   drive.mount('/content/drive')


In [2]:
import sys
import os
import multiprocessing
from collections import Counter

import numpy as np
import pandas as pd
import networkx as nx
from stellargraph import StellarGraph, datasets
from stellargraph.data import EdgeSplitter, BiasedRandomWalk

from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
from fastnode2vec import Graph, Node2Vec

In [3]:
# Locations where the datasets and trained models are saved
data_folder = (
    "/content/drive/MyDrive/Colab Notebooks/ALTeGraD/Projet/data"
    if 'google.colab' in sys.modules  # Google Drive location when running on Colab
    else "../data"
)

# Output directories, created if they do not exist yet
datasets_folder, models_folder = (
    os.path.join(data_folder, subfolder) for subfolder in ("datasets", "models")
)
os.makedirs(datasets_folder, exist_ok=True)
os.makedirs(models_folder, exist_ok=True)

# Files of the train split: graph edge list, node pairs and their targets
train_edges_path, train_pairs_path, train_target_path = (
    os.path.join(datasets_folder, filename)
    for filename in ('train_graph_edgelist.txt', 'train_pairs.csv', 'train_target.csv')
)

# Files of the test split: graph edge list, node pairs and their targets
test_edges_path, test_pairs_path, test_target_path = (
    os.path.join(datasets_folder, filename)
    for filename in ('test_graph_edgelist.txt', 'test_pairs.csv', 'test_target.csv')
)

# Edge list of the complete input graph
full_graph_edges_path = os.path.join(data_folder, "initial_data/edgelist.txt")

### Loading the network

We load our graph using **NetworkX**

In [None]:
# Parse the comma-separated edge list into a NetworkX graph whose node ids are ints
G = nx.read_edgelist(full_graph_edges_path, nodetype=int, delimiter=',')
nodes = list(G)  # iterating over a NetworkX graph yields its nodes
n, m = G.number_of_nodes(), G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Then we build a `StellarGraph` from our NetworkX graph. We use the **stellargraph** package because it provides many machine-learning utilities for graphs.

In [None]:
# Wrap the NetworkX graph in a StellarGraph. "paper" and "cites" are the
# default node / edge type labels passed to the conversion.
graph = StellarGraph.from_networkx(
    G, node_type_default="paper", edge_type_default="cites"
)

# Print a textual summary of the graph (node/edge counts and types)
print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 138499, Edges: 1091955

 Node types:
  paper: [138499]
    Features: none
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [1091955]
        Weights: all 1 (default)
        Features: none


## 1. Construct splits of the input network

We have to carefully split the data to avoid data leakage and evaluate the algorithms correctly:

- For computing node embeddings, a **Train Graph** (`graph_train`)

- For training classifiers, a classifier **Training Set** (`examples_train`) of positive and negative edges that weren’t used for computing node embeddings

- For choosing the best classifier, a **Model Selection Test Set** (`examples_model_selection`) of positive and negative edges that weren’t used for computing node embeddings or training the classifier

- For the final evaluation, a **Test Graph** (`graph_test`) to compute test node embeddings with more edges than the Train Graph, and a **Test Set** (`examples_test`) of positive and negative edges used neither for computing the test node embeddings nor for classifier training or model selection

### 1.1. Test graph

We begin with the full graph and use the `EdgeSplitter` class to produce:

- Test Graph
- Test set of positive/negative link examples

The Test Graph is the reduced graph we obtain from removing the test set of links from the full graph.

In [None]:
# Define an edge splitter on the original graph:
edge_splitter_test = EdgeSplitter(graph)

# Randomly sample a fraction p=0.2 of all positive links, and the same number of negative links, from graph, and obtain the
# reduced graph graph_test with the sampled links removed:
graph_test, examples_test, labels_test = edge_splitter_test.train_test_split(
    p=0.2, method="global", keep_connected=True, seed=12
)

# Print a summary of the reduced graph
print(graph_test.info())

** Sampled 218391 positive and 218391 negative edges. **
StellarGraph: Undirected multigraph
 Nodes: 138499, Edges: 873564

 Node types:
  paper: [138499]
    Features: none
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [873564]
        Weights: all 1 (default)
        Features: none


### 1.2. Train graph

This time, we use the `EdgeSplitter` on the Test Graph, and perform a train/test split on the examples to produce:

- Train Graph

- Training set of link examples

- Set of link examples for model selection

In [None]:
# Repeat the sampling, this time starting from the test graph; the full graph
# is handed to EdgeSplitter as its second argument.
edge_splitter_train = EdgeSplitter(graph_test, graph)
graph_train, examples, labels = edge_splitter_train.train_test_split(
    p=0.2, method="global", keep_connected=True, seed=12
)

# scikit-learn split of the sampled examples: 75% to train the classifier,
# 25% held out for model selection.
examples_train, examples_model_selection, labels_train, labels_model_selection = train_test_split(
    examples, labels, train_size=0.75, test_size=0.25, random_state=12
)

print(graph_train.info())

** Sampled 174712 positive and 174712 negative edges. **
StellarGraph: Undirected multigraph
 Nodes: 138499, Edges: 698852

 Node types:
  paper: [138499]
    Features: none
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [698852]
        Weights: all 1 (default)
        Features: none


Below is a summary of the different splits that have been created in this section

In [None]:
# Summary table of the three example sets, built column by column
pd.DataFrame(
    {
        "Split": ["Training Set", "Model Selection", "Test set"],
        "Number of Examples": [
            len(examples_train),
            len(examples_model_selection),
            len(examples_test),
        ],
        "Hidden from": ["Train Graph", "Train Graph", "Test Graph"],
        "Picked from": ["Test Graph", "Test Graph", "Full Graph"],
        "Use": [
            "Train the Link Classifier",
            "Select the best Link Classifier model",
            "Evaluate the best Link Classifier",
        ],
    }
).set_index("Split")

Unnamed: 0_level_0,Number of Examples,Hidden from,Picked from,Use
Split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Training Set,262068,Train Graph,Test Graph,Train the Link Classifier
Model Selection,87356,Train Graph,Test Graph,Select the best Link Classifier model
Test set,436782,Test Graph,Full Graph,Evaluate the best Link Classifier


### 1.3. Save the data

In [None]:
# Convert test and train graph from stellargraph to networkx
# Convert the test and train graphs from StellarGraph back to NetworkX objects
# so that their edge lists can be written with nx.write_edgelist
graph_test_nx = graph_test.to_networkx()
graph_train_nx  = graph_train.to_networkx()

In [None]:
# Save the edgelist of each graph
# Save the edge list of each graph as comma-separated node pairs (data=False: no edge attributes).
# NOTE(review): an existing file is never overwritten here, whereas the pair/target CSVs
# saved in the next cell are always rewritten — delete stale edge lists if the split
# parameters (p, seed, ...) are changed.
if not os.path.isfile(test_edges_path):
    nx.write_edgelist(
        graph_test_nx,
        test_edges_path,
        delimiter=',',
        data=False
    )

if not os.path.isfile(train_edges_path):
    nx.write_edgelist(
        graph_train_nx,
        train_edges_path,
        delimiter=',',
        data=False
    )

In [None]:
# Save the test and training dataset
# Save the node pairs of the train and test datasets as CSV files without header or index.
# Note: the "train" files hold the full `examples` / `labels` arrays, i.e. the examples
# before the train / model-selection split done with scikit-learn.
pd.DataFrame(examples).to_csv(train_pairs_path, index=False, header=False)
pd.DataFrame(examples_test).to_csv(test_pairs_path, index=False, header=False)

# Save the matching targets (labels), in the same order as the pairs
pd.DataFrame(labels).to_csv(train_target_path, index=False, header=False)
pd.DataFrame(labels_test).to_csv(test_target_path, index=False, header=False)

## 2. Node2Vec

**N.B.:** you can restart the kernel from this point, as long as you re-run the setup cells at the top of the notebook (imports and paths).

In [4]:
# Node2Vec hyperparameters, read as notebook-level variables by node2vec_embedding
p = 1.0  # passed as `p` to Node2Vec
q = 1.0  # passed as `q` to Node2Vec
dimensions = 128  # passed as `dim` to Node2Vec (embedding size)
num_walks = 10  # NOTE(review): not referenced by node2vec_embedding — confirm whether it is still needed
walk_length = 80  # passed as `walk_length` to Node2Vec
window_size = 10  # passed as `context` to Node2Vec
n_epochs = 5  # passed as `epochs` to Node2Vec.train
workers = multiprocessing.cpu_count()  # use as many workers as CPUs reported by the system

In [5]:
# Display the number of workers that will be used for training
workers

4

In [6]:
def node2vec_embedding(graph_edgelist_path):
    """Train Node2Vec embeddings for the graph stored in an edge-list file.

    The hyperparameters (``dimensions``, ``walk_length``, ``window_size``,
    ``p``, ``q``, ``workers`` and ``n_epochs``) are read from the
    notebook-level variables of the same names.

    Parameters
    ----------
    graph_edgelist_path : str
        Path of a comma-separated edge list file.

    Returns
    -------
    Node2Vec
        The fastnode2vec model after training.
    """
    # Read the edges with NetworkX, keeping node ids as strings
    edges = list(
        nx.read_edgelist(graph_edgelist_path, delimiter=',', nodetype=str).edges()
    )

    # Build the undirected, unweighted fastnode2vec graph and the model on top of it
    model = Node2Vec(
        Graph(edges, directed=False, weighted=False),
        dim=dimensions,
        walk_length=walk_length,
        context=window_size,
        p=p,
        q=q,
        workers=workers,
    )
    model.train(epochs=n_epochs)

    return model

In [None]:
# Train graph
# Train graph: learn the embeddings, then export the node vectors with
# `wv.save_word2vec_format` into the models folder
node2vec_train_graph = node2vec_embedding(train_edges_path)
node2vec_train_graph.wv.save_word2vec_format(
    os.path.join(models_folder, "node2vec_train_graph.model")
)

Reading graph: 100%|██████████| 698852/698852 [00:01<00:00, 658112.69it/s]
Training:   4%|▍         | 27251/692495 [00:41<15:07, 733.32it/s]

In [None]:
# Test graph
# Test graph: learn the embeddings, then export the node vectors with
# `wv.save_word2vec_format` into the models folder
node2vec_test_graph = node2vec_embedding(test_edges_path)
node2vec_test_graph.wv.save_word2vec_format(
    os.path.join(models_folder, "node2vec_test_graph.model")
)

In [7]:
# Full graph
# Full graph: learn the embeddings on the complete edge list, then export the node vectors.
# NOTE(review): this export uses the ".nodevectors" extension whereas the train / test
# exports use ".model" — confirm that downstream loaders expect these exact file names.
node2vec_full_graph = node2vec_embedding(full_graph_edges_path)
node2vec_full_graph.wv.save_word2vec_format(
    os.path.join(models_folder, "node2vec_full_graph.nodevectors")
)

Reading graph: 100%|██████████| 1091955/1091955 [00:01<00:00, 847375.41it/s]
Training: 100%|██████████| 692495/692495 [07:24<00:00, 1558.88it/s]
