# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

This notebook covers deep learning techniques, namely:
- Variational Graph Normalized Auto-Encoders (based on https://arxiv.org/abs/2108.08046) which allow us to learn graph embeddings in an unsupervised way (based on graph structure and node embeddings)

### 1. Import Packages

In [85]:
from importlib import reload
reload(analyseData)
reload(loadData)
reload(modeling)
reload(autoenc)

<module 'util.autoencoder' from '/Users/macbookpro/Documents/GitHub/Network-Science_Final-Project/util/autoencoder.py'>

In [86]:
# import own scripts
import util.analyse_Data as analyseData
#import util.preprocess_Data as prepData
import util.load_Data as loadData
import util.modeling as modeling
import util.autoencoder as autoenc

In [87]:
# parse & handle data
import os
import numpy as np
import pandas as pd

# modeling
import torch
from torch_geometric.nn import GAE, VGAE

# hyperparam optimization
from ray import tune, air

# evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


# visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [88]:
# set matplotlib and seaborn settings for nicer plots
%matplotlib inline

SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

### 2. Load Data for Modeling

In [89]:
# might take up to a minute
data, (G, G_train, G_trainval, node_info, train_tf, val_tf, trainval_tf, test_tf) = autoenc.load()

Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
Enriching node features...


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 13.74it/s]
  A = nx.adjacency_matrix(G, nodelist=list(G), dtype=float)


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 11.12it/s]


Create PyTorch Geometric dataset...


In [90]:
# where to save trial results to
ray_path = os.path.abspath("")+"/ray_results"
if not os.path.isdir(ray_path):
    os.mkdir(ray_path)

### 3. VGNAE Node Embeddings

https://github.com/SeongJinAhn/VGNAE/blob/main/main.py for Variational Graph Normalized Auto-Encoders

#### 3.1 Hyperparameter tuning

In [70]:
# tunable hyperparameter search space --> search using tune.choice([]), tune.uniform(lower, upper), tune.grid_search([])
config = {
    # log params in raytune
    "ray": True,
    
    # print results per epoch
    "verbose": False,
    
    # basic infos
    "data": data,
    "max_epochs": 50,
    "save": True, # if we want to save best model on validation set
    
    # model
    "model": "VGNAE",
    
    ## encoder
    "enc_channels": 64,
    "scaling": 1.8,
    "num_prop": tune.grid_search([4, 16, 32, 64, 128]),
    "teleport": 0, # tune.grid_search([0, 0.1, 0.2]),
    "dropout": 0, # tune.grid_search([0, 0.1, 0.2]),
    
    # optimizer
    "lr": tune.grid_search([1e-3, 1e-4, 1e-5]),
    "wd": 0,
}

In [71]:
# how many trials to run (if grid_search utilized, it will run this number per grid_search value)
num_samples = 1

# run experiment
result_grid = autoenc.run_ray_experiment(
    autoenc.train_validate, config, ray_path, num_samples,
    metric_columns = ["trn_auc", "val_auc", "max_val_auc", "training_iteration"],
    parameter_columns = ["scaling", "num_prop", "lr"]
)

0,1
Current time:,2023-04-23 11:48:36
Running for:,00:05:13.79
Memory:,15.1/16.0 GiB

Trial name,status,loc,scaling,num_prop,lr,trn_auc,val_auc,max_val_auc,training_iterat ion
VGNAE_0.001_0_4982a_00000,TERMINATED,127.0.0.1:8887,1.8,4,0.001,0.941873,0.817512,0.825806,50
VGNAE_0.0001_0_4982a_00001,TERMINATED,127.0.0.1:8903,1.8,4,0.0001,0.693582,0.661751,0.661751,50
VGNAE_1e-05_0_4982a_00002,TERMINATED,127.0.0.1:8887,1.8,4,1e-05,0.693319,0.661751,0.661751,50
VGNAE_0.001_0_4982a_00003,TERMINATED,127.0.0.1:8903,1.8,16,0.001,0.940558,0.814747,0.823963,50
VGNAE_0.0001_0_4982a_00004,TERMINATED,127.0.0.1:8887,1.8,16,0.0001,0.705155,0.658525,0.658986,50
VGNAE_1e-05_0_4982a_00005,TERMINATED,127.0.0.1:8903,1.8,16,1e-05,0.705155,0.654839,0.6553,50
VGNAE_0.001_0_4982a_00006,TERMINATED,127.0.0.1:8887,1.8,32,0.001,0.941084,0.814747,0.819355,50
VGNAE_0.0001_0_4982a_00007,TERMINATED,127.0.0.1:8903,1.8,32,0.0001,0.705155,0.659908,0.659908,50
VGNAE_1e-05_0_4982a_00008,TERMINATED,127.0.0.1:8887,1.8,32,1e-05,0.705024,0.657604,0.657604,50
VGNAE_0.001_0_4982a_00009,TERMINATED,127.0.0.1:8903,1.8,64,0.001,0.94161,0.813825,0.818433,50


2023-04-23 11:43:21,824	INFO worker.py:1553 -- Started a local Ray instance.
2023-04-23 11:48:36,496	INFO tune.py:798 -- Total run time: 313.82 seconds (304.79 seconds for the tuning loop).


#### 3.2 Result of Hyperparameter tuning

In [91]:
restored_tuner, result_grid = autoenc.open_validate_ray_experiment(
    "ray_results/train_validate_2023-04-23_11-43-18",
    autoenc.train_validate
)



Loading results from ray_results/train_validate_2023-04-23_11-43-18...


2023-04-23 12:18:13,365	INFO experiment_analysis.py:789 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.


Done!

No errors! Number of terminated trials: 15


In [92]:
# get best score per trial (highest validation accuracy)
N = 10
best_result_df = result_grid.get_dataframe(
    filter_metric="val_auc", filter_mode="max"
)
best_result_df = best_result_df[["trial_id", "training_iteration", "config/enc_channels",
                                 "config/scaling", "config/num_prop", "config/lr", "config/wd", 
                                 "trn_loss", "val_loss", "trn_auc", "val_auc"]]
best_result_df = best_result_df.sort_values(by=["val_auc"], ascending = False)

if len(result_grid) > N:
    best_result_df = best_result_df.head(N)

best_result_df

Unnamed: 0,trial_id,training_iteration,config/enc_channels,config/scaling,config/num_prop,config/lr,config/wd,trn_loss,val_loss,trn_auc,val_auc
0,4982a_00000,42,64,1.8,4,0.001,0,5.112204,1.409582,0.948185,0.825806
3,4982a_00003,41,64,1.8,16,0.001,0,5.324944,1.422033,0.950026,0.823963
6,4982a_00006,47,64,1.8,32,0.001,0,5.056614,1.652868,0.940163,0.819355
9,4982a_00009,48,64,1.8,64,0.001,0,4.720571,1.399702,0.944503,0.818433
12,4982a_00012,48,64,1.8,128,0.001,0,4.720572,1.399701,0.944503,0.818433
1,4982a_00001,1,64,1.8,4,0.0001,0,9.989462,6.556673,0.693056,0.661751
2,4982a_00002,1,64,1.8,4,1e-05,0,9.989462,6.556799,0.693056,0.661751
10,4982a_00010,35,64,1.8,64,0.0001,0,10.118744,7.35016,0.705155,0.660369
13,4982a_00013,35,64,1.8,128,0.0001,0,10.118744,7.35016,0.705155,0.660369
7,4982a_00007,35,64,1.8,32,0.0001,0,10.118716,7.350062,0.705024,0.659908


#### 3.3 Embeddings based on best model

In [102]:
# load best autoencoder
path = os.path.abspath("")+"/models/VGNAE_0.001_0_4982a_00000_autoencoder.pt"
#path = "models/autoencoder.pt"
model = VGAE(autoenc.Encoder(data.x.size()[1], 64, 1.5, 4, 0, 0))
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [103]:
# get embeddings of nodes
embedding = autoenc.get_embeddings(model, data.x, data.train_pos_edges)

In [104]:
print(embedding)

[[-3.1470058  -0.69348514  1.2015048  ...  0.46626264  0.5235968
   1.6211315 ]
 [ 0.43980443  0.723609    0.37427843 ...  0.1249349   1.095577
   1.0135643 ]
 [-0.9413802   0.17225897  0.26274672 ...  0.93197554 -0.90382165
   0.47807515]
 ...
 [-2.412036   -0.24900346 -0.5787547  ...  2.2004526  -0.18173164
   0.08079515]
 [-0.77156967 -0.38765618 -1.1806533  ... -1.5115606   0.12829044
   0.52693987]
 [ 1.2350285  -0.39330256 -0.08083931 ...  0.09639663 -0.40052375
  -0.05452605]]


In [77]:
# plot embedding
plt.scatter(
    embedding[:, 0],
    embedding[:, 1])
plt.gca().set_aspect('equal', 'datalim')
plt.title('VGNAE projection (first 2 dim) of nodes')
plt.savefig('scatter_plot')
plt.close()

Embeddings look like a sphere, we cannot do any meaningful clusering based on this.

Let us now take our original VGNAE embeddings and put them into a pandas dataframe.

In [105]:
node_emb = pd.DataFrame(embedding).rename(columns = {val: f"x{val+1}" for val in range(embedding.shape[1])})

node_emb

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x55,x56,x57,x58,x59,x60,x61,x62,x63,x64
0,-3.147006,-0.693485,1.201505,-0.049189,-2.477531,0.940949,1.030151,-1.317478,1.021093,1.930461,...,1.356891,0.585385,-0.296732,1.043022,0.670434,0.953403,2.153965,0.466263,0.523597,1.621132
1,0.439804,0.723609,0.374278,0.009205,-0.324646,2.173416,0.206423,-0.613854,0.980051,-0.320066,...,0.414434,0.075904,-1.574996,0.831630,-0.720067,0.735137,0.125447,0.124935,1.095577,1.013564
2,-0.941380,0.172259,0.262747,0.143806,0.144087,0.195176,-0.347765,0.060223,-0.553427,-0.361967,...,0.369703,-0.641352,0.031440,0.562272,-0.603931,0.262770,-1.057136,0.931976,-0.903822,0.478075
3,-0.683729,0.384113,1.066712,-0.784413,-0.817814,0.953324,0.118580,2.385186,-0.285979,0.746182,...,-0.689772,1.163549,0.054138,-0.116594,-0.518696,-0.012660,0.088384,-0.219844,-0.785585,0.113611
4,0.176874,0.846721,-0.172514,-1.459543,0.897850,1.151623,0.558093,-0.481528,-1.528028,0.809247,...,-0.295646,-0.033303,-1.364283,-0.437886,1.031550,-0.510202,-0.003027,-0.601052,-0.762295,0.539120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1.445216,-1.218790,-0.279565,-0.696438,0.420386,0.128165,-0.505334,-0.203056,1.467542,-1.238313,...,0.406204,1.324047,-0.481330,1.132483,-0.682260,0.788056,0.544333,-2.147440,-0.817016,0.878112
2704,0.725573,0.141230,0.708655,0.041915,0.324548,-1.125085,-1.442970,-0.667905,-0.146254,-1.541423,...,-1.191757,1.182551,0.508355,-0.843233,-0.116571,-0.093316,-0.181522,0.889183,-0.316258,-0.378547
2705,-2.412036,-0.249003,-0.578755,-0.186452,-1.881456,1.590825,-0.353436,1.747569,-1.506543,-0.557148,...,0.549199,1.412113,1.732373,-0.068925,-0.322579,-0.398206,-1.245025,2.200453,-0.181732,0.080795
2706,-0.771570,-0.387656,-1.180653,-1.297818,0.916602,0.793003,-1.803585,0.531975,1.052815,0.675198,...,-1.818894,1.298334,0.072770,-0.878730,2.461720,0.271092,-0.898784,-1.511561,0.128290,0.526940


In [52]:
print(y_train_hat.columns)

Index(['sim', 'y', 'pred'], dtype='object')


#### 3.4 Compute edge features based on best model

In [106]:
# predict train
y_train_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.train_pos_edges, data.train_edges))

y_train_hat = (y_train_hat
    .rename(columns = {0: "sim"})
    .assign(y = trainval_tf.loc[trainval_tf.train_mask == True].y.values)
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)

print("ROC: ", roc_auc_score(y_train_hat.y, y_train_hat.sim))
print("Acc: ", accuracy_score(y_train_hat.y, y_train_hat.pred))

ROC:  0.9764215364930167
Acc:  0.948185165702262


In [107]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y_train_hat.loc[y_train_hat['y'] == 0, 'sim'], y_train_hat.loc[y_train_hat['y'] == 0, 'y'], label='0')
ax.scatter(y_train_hat.loc[y_train_hat['y'] == 1, 'sim'], y_train_hat.loc[y_train_hat['y'] == 1, 'y'], label='1')
ax.legend()
plt.xlabel('sim')
plt.ylabel('y')
plt.savefig('scatter_plot.png')
plt.close()

In [24]:
y_train_hat_clean = y_train_hat[["sim", "y"]].apply(pd.to_numeric, errors='coerce').dropna()
sns.pairplot(y_train_hat_clean[["sim", "y"]], hue = "y")

KeyError: "None of [Index(['sim', 'y'], dtype='object')] are in the [columns]"

In [108]:
# predict val
y_val_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.train_pos_edges, data.val_edges))
print(len(y_val_hat))
y_val_hat = (y_val_hat
    .rename(columns = {0: "sim"})
    .assign(y = trainval_tf.loc[trainval_tf.val_mask == True].y.values)
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)
print("ROC: ", roc_auc_score(y_val_hat.y, y_val_hat.sim))
print("Acc: ", accuracy_score(y_val_hat.y, y_val_hat.pred))

2170
ROC:  0.889936503217312
Acc:  0.8258064516129032


In [109]:
tmp = (val_tf
    .assign(sim  = y_val_hat.pred.values)
    .assign(dist = lambda df_: [np.linalg.norm(node_emb.loc[u].values-node_emb.loc[v].values) for u, v in zip(df_.source, df_.target)])
)

In [84]:
sns.pairplot(tmp[["sim", "dist", "y"]], hue = "y")

NameError: name 'tmp' is not defined

In [110]:
tmp[['target', 'source', 'y', 'sim', 'dist']].corr()

Unnamed: 0,target,source,y,sim,dist
target,1.0,0.320768,-0.493566,-0.420214,0.226317
source,0.320768,1.0,-0.487168,-0.398406,0.295535
y,-0.493566,-0.487168,1.0,0.651613,-0.352756
sim,-0.420214,-0.398406,0.651613,1.0,-0.337812
dist,0.226317,0.295535,-0.352756,-0.337812,1.0


In [111]:
# predict test
y_test_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.trainval_pos_edges, data.test_edges))
y_test_hat = (y_test_hat
    .rename(columns = {0: "sim"})
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)
y_test_hat[["pred"]].value_counts()

pred
0       542
1       542
dtype: int64