# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

This notebook covers deep learning techniques, namely:
- Variational Graph Normalized Auto-Encoders (based on https://arxiv.org/abs/2108.08046) which allow us to learn graph embeddings in an unsupervised way (based on graph structure and node embeddings)

### 1. Import Packages

In [29]:
from importlib import reload
reload(analyseData)
reload(loadData)
reload(modeling)
reload(autoenc)

<module 'util.autoencoder' from '/Users/macbookpro/Documents/GitHub/Network-Science_Final-Project/util/autoencoder.py'>

In [1]:
# import own scripts
import util.analyse_Data as analyseData
#import util.preprocess_Data as prepData
import util.load_Data as loadData
import util.modeling as modeling
import util.autoencoder as autoenc

In [3]:
# parse & handle data
import os
import numpy as np
import pandas as pd

# modeling
import torch
from torch_geometric.nn import GAE, VGAE

# hyperparam optimization
from ray import tune, air

# evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


# visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# load raw data
node_info, edgelist, class_to_idx_dict, idx_to_class_dict = loadData.load_raw()
G = loadData.init_nx_graph(edgelist)
nodes = set(edgelist.source).union(edgelist.target)
print(nodes)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [5]:
# set matplotlib and seaborn settings for nicer plots
%matplotlib inline

SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

### 2. Load Data for Modeling

In [6]:
reload(loadData)
(G, G_train, G_trainval, node_info, train_tf, val_tf, trainval_tf, test_tf) = loadData.load()


Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542


In [30]:
# might take up to a minute
data, (G, G_train, G_trainval, node_info, train_tf, val_tf, trainval_tf, test_tf) = autoenc.load()

Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
Enriching node features...


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 12.33it/s]
  A = nx.adjacency_matrix(G, nodelist=list(G), dtype=float)


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:01<00:00,  9.86it/s]


Create PyTorch Geometric dataset...


In [9]:
# where to save trial results to
ray_path = os.path.abspath("")+"/ray_results"
if not os.path.isdir(ray_path):
    os.mkdir(ray_path)

### 3. VGNAE Node Embeddings

https://github.com/SeongJinAhn/VGNAE/blob/main/main.py for Variational Graph Normalized Auto-Encoders

#### 3.1 Hyperparameter tuning

In [163]:
# tunable hyperparameter search space --> search using tune.choice([]), tune.uniform(lower, upper), tune.grid_search([])
config = {
    # log params in raytune
    "ray": True,
    
    # print results per epoch
    "verbose": False,
    
    # basic infos
    "data": data,
    "max_epochs": 50,
    "save": True, # if we want to save best model on validation set
    
    # model
    "model": "VGNAE",
    
    ## encoder
    "enc_channels": 64,
    "scaling": 1.8,
    "num_prop": tune.grid_search([4, 16, 32, 64, 128]),
    "teleport": 0, # tune.grid_search([0, 0.1, 0.2]),
    "dropout": 0, # tune.grid_search([0, 0.1, 0.2]),
    
    # optimizer
    "lr": tune.grid_search([1e-3, 1e-4, 1e-5]),
    "wd": 0,
}

In [164]:
# how many trials to run (if grid_search utilized, it will run this number per grid_search value)
num_samples = 1

# run experiment
result_grid = autoenc.run_ray_experiment(
    autoenc.train_validate, config, ray_path, num_samples,
    metric_columns = ["trn_auc", "val_auc", "max_val_auc", "training_iteration"],
    parameter_columns = ["scaling", "num_prop", "lr"]
)

0,1
Current time:,2023-04-22 15:17:28
Running for:,00:05:13.99
Memory:,14.7/16.0 GiB

Trial name,status,loc,scaling,num_prop,lr,trn_auc,val_auc,max_val_auc,training_iterat ion
VGNAE_0.001_0_4c954_00000,TERMINATED,127.0.0.1:70601,1.8,4,0.001,0.941873,0.817512,0.825806,50
VGNAE_0.0001_0_4c954_00001,TERMINATED,127.0.0.1:70614,1.8,4,0.0001,0.693582,0.661751,0.661751,50
VGNAE_1e-05_0_4c954_00002,TERMINATED,127.0.0.1:70601,1.8,4,1e-05,0.693319,0.661751,0.661751,50
VGNAE_0.001_0_4c954_00003,TERMINATED,127.0.0.1:70614,1.8,16,0.001,0.940558,0.814747,0.823963,50
VGNAE_0.0001_0_4c954_00004,TERMINATED,127.0.0.1:70601,1.8,16,0.0001,0.705155,0.658525,0.658986,50
VGNAE_1e-05_0_4c954_00005,TERMINATED,127.0.0.1:70614,1.8,16,1e-05,0.705155,0.654839,0.6553,50
VGNAE_0.001_0_4c954_00006,TERMINATED,127.0.0.1:70601,1.8,32,0.001,0.941084,0.814747,0.819355,50
VGNAE_0.0001_0_4c954_00007,TERMINATED,127.0.0.1:70614,1.8,32,0.0001,0.705155,0.659908,0.659908,50
VGNAE_1e-05_0_4c954_00008,TERMINATED,127.0.0.1:70601,1.8,32,1e-05,0.705024,0.657604,0.657604,50
VGNAE_0.001_0_4c954_00009,TERMINATED,127.0.0.1:70614,1.8,64,0.001,0.94161,0.813825,0.818433,50


2023-04-22 15:12:13,361	INFO worker.py:1553 -- Started a local Ray instance.
2023-04-22 15:17:28,419	INFO tune.py:798 -- Total run time: 314.03 seconds (305.95 seconds for the tuning loop).


#### 3.2 Result of Hyperparameter tuning

In [32]:
restored_tuner, result_grid = autoenc.open_validate_ray_experiment(
    "ray_results/train_validate_2023-04-22_15-12-11",
    autoenc.train_validate
)


The trainable will be overwritten - this should be done with caution: it's possible to supply an incompatible trainable, and there are no guarantees that the resumed experiment will continue successfully. If you encounter errors during training, ensure that you are passing in the same trainable that was passed into the initial `Tuner` object.


Loading results from ray_results/train_validate_2023-04-22_15-12-11...
Done!

No errors! Number of terminated trials: 15


In [34]:
# get best score per trial (highest validation accuracy)
N = 10
best_result_df = result_grid.get_dataframe(
    filter_metric="val_auc", filter_mode="max"
)
best_result_df = best_result_df[["trial_id", "training_iteration", "config/enc_channels",
                                 "config/scaling", "config/num_prop", "config/lr", "config/wd", 
                                 "trn_loss", "val_loss", "trn_auc", "val_auc"]]
best_result_df = best_result_df.sort_values(by=["val_auc"], ascending = False)

if len(result_grid) > N:
    best_result_df = best_result_df.head(N)

best_result_df

Couldn't read config from 14 paths


Unnamed: 0,trial_id,training_iteration,config/enc_channels,config/scaling,config/num_prop,config/lr,config/wd,trn_loss,val_loss,trn_auc,val_auc
0,4c954_00000,42,64,1.8,4,0.001,0,5.112204,1.409582,0.948185,0.825806


#### 3.3 Embeddings based on best model

In [35]:
# load best autoencoder
path = os.path.abspath("")+"/models/VGNAE_0.001_0_4c954_00000_autoencoder.pt"
#path = "models/autoencoder.pt"
model = VGAE(autoenc.Encoder(data.x.size()[1], 64, 1.5, 4, 0, 0))
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [36]:
# get embeddings of nodes
embedding = autoenc.get_embeddings(model, data.x, data.train_pos_edges)

In [37]:
# plot embedding
plt.scatter(
    embedding[:, 0],
    embedding[:, 1])
plt.gca().set_aspect('equal', 'datalim')
plt.title('VGNAE projection (first 2 dim) of nodes')

Text(0.5, 1.0, 'VGNAE projection (first 2 dim) of nodes')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

Embeddings look like a sphere, we cannot do any meaningful clusering based on this.

Let us now take our original VGNAE embeddings and put them into a pandas dataframe.

In [38]:
node_emb = pd.DataFrame(embedding).rename(columns = {val: f"x{val+1}" for val in range(embedding.shape[1])})

node_emb

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x55,x56,x57,x58,x59,x60,x61,x62,x63,x64
0,-3.309478,0.206754,0.374594,0.084685,-2.413626,0.516780,0.564035,-1.217949,1.343269,0.983644,...,2.042603,0.784332,-0.047315,1.332707,1.380343,1.935409,2.039677,1.465458,-0.341519,0.599926
1,-1.155899,0.423404,-0.423475,-0.028813,-0.980220,-0.173735,0.160932,0.415496,-0.401508,1.241986,...,0.721122,0.411283,-1.477461,0.223495,-0.211122,-0.780023,1.086434,-0.280101,2.446642,-0.686368
2,-0.435892,-0.167979,-0.786887,-1.623288,-0.481958,0.982450,0.814728,0.595350,-0.469417,0.302892,...,-0.542122,0.474864,-0.181739,-0.441218,1.213146,0.794380,0.262811,0.346724,-1.336649,-0.542086
3,0.569048,0.464201,0.165344,-0.682300,-0.040294,-0.061181,0.258084,-0.587930,-0.685422,0.687297,...,-0.349226,-1.029475,2.056812,0.402505,-0.213800,0.001614,-0.455678,-0.458563,-1.049768,-1.002158
4,0.196160,0.512066,-1.508833,-0.082863,-0.550839,0.927825,0.210382,0.817241,0.815435,-0.088797,...,0.184519,0.461432,1.924741,0.901504,-0.422673,0.167884,-0.980999,0.010288,-0.402992,-0.766070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,-0.684204,0.462192,-0.438589,0.405070,0.329540,0.159429,-0.560250,0.388082,1.694141,-0.383235,...,-1.549708,-0.376530,-0.413481,0.620350,-1.105991,-1.065909,0.885642,-1.409608,-0.615764,-0.978133
2704,-0.688122,0.195674,0.152141,-1.133137,1.764305,1.701825,-0.680980,-0.055955,0.694889,0.950244,...,1.524095,1.774572,0.273069,-0.715192,0.325856,1.531773,-0.435225,-2.294260,1.093841,0.228341
2705,0.230485,-1.243988,0.452897,-0.828860,0.746502,0.400512,1.251848,-0.011066,-1.055300,-0.862269,...,-1.048554,-1.410208,-2.319679,-0.359559,-2.395675,0.484409,1.449576,-1.170174,-0.015283,1.290733
2706,1.396172,-0.040397,-0.843639,0.074305,0.538276,0.970200,-0.294978,-0.183704,0.363582,-1.349674,...,-0.558662,-0.372223,-0.126262,0.123231,-0.845528,-1.446459,0.992192,1.726884,-0.478976,1.577100


In [23]:
print(data.x.shape[1])

1441


#### 3.4 Compute edge features based on best model

In [39]:
# predict train
y_train_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.train_pos_edges, data.train_edges))

y_train_hat = (y_train_hat
    .rename(columns = {0: "sim"})
    .assign(y = trainval_tf.loc[trainval_tf.train_mask == True].y.values)
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)

print("ROC: ", roc_auc_score(y_train_hat.y, y_train_hat.sim))
print("Acc: ", accuracy_score(y_train_hat.y, y_train_hat.pred))

ROC:  0.9764215364930167
Acc:  0.948185165702262


In [40]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y_train_hat.loc[y_train_hat['y'] == 0, 'sim'], y_train_hat.loc[y_train_hat['y'] == 0, 'y'], label='0')
ax.scatter(y_train_hat.loc[y_train_hat['y'] == 1, 'sim'], y_train_hat.loc[y_train_hat['y'] == 1, 'y'], label='1')
ax.legend()
plt.xlabel('sim')
plt.ylabel('y')
plt.savefig('scatter_plot.png')
plt.close()

In [24]:
y_train_hat_clean = y_train_hat[["sim", "y"]].apply(pd.to_numeric, errors='coerce').dropna()
sns.pairplot(y_train_hat_clean[["sim", "y"]], hue = "y")

KeyError: "None of [Index(['sim', 'y'], dtype='object')] are in the [columns]"

In [42]:
# predict val
y_val_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.train_pos_edges, data.val_edges))
print(len(y_val_hat))
y_val_hat = (y_val_hat
    .rename(columns = {0: "sim"})
    .assign(y = trainval_tf.loc[trainval_tf.val_mask == True].y.values)
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)
print("ROC: ", roc_auc_score(y_val_hat.y, y_val_hat.sim))
print("Acc: ", accuracy_score(y_val_hat.y, y_val_hat.pred))

2170
ROC:  0.889936503217312
Acc:  0.8258064516129032


In [43]:
tmp = (val_tf
    .assign(sim  = y_val_hat.pred.values)
    .assign(dist = lambda df_: [np.linalg.norm(node_emb.loc[u].values-node_emb.loc[v].values) for u, v in zip(df_.source, df_.target)])
)

In [84]:
sns.pairplot(tmp[["sim", "dist", "y"]], hue = "y")

NameError: name 'tmp' is not defined

In [45]:
tmp[['target', 'source', 'y', 'sim', 'dist']].corr()

Unnamed: 0,target,source,y,sim,dist
target,1.0,0.320768,-0.493566,-0.420214,0.049604
source,0.320768,1.0,-0.487168,-0.398406,0.165143
y,-0.493566,-0.487168,1.0,0.651613,-0.017876
sim,-0.420214,-0.398406,0.651613,1.0,-0.111566
dist,0.049604,0.165143,-0.017876,-0.111566,1.0


In [46]:
# predict test
y_test_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.trainval_pos_edges, data.test_edges))
y_test_hat = (y_test_hat
    .rename(columns = {0: "sim"})
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)
y_test_hat[["pred"]].value_counts()

pred
0       542
1       542
dtype: int64