# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

This notebook covers classical unsupervised link prediction techniques.

### 1. Import Packages

In [42]:
from importlib import reload
reload(analyseData)
reload(prepData)
reload(loadData)
reload(modeling)
reload(autoenc)

<module 'util.autoencoder' from '/Users/macbookpro/Documents/GitHub/Network-Science_Final-Project/util/autoencoder.py'>

In [43]:
# import own scripts
import util.analyse_Data as analyseData
import util.preprocess_Data as prepData
import util.load_Data as loadData
import util.modeling as modeling
import util.autoencoder as autoenc

In [44]:
# basic stuff
from itertools import product, combinations
from collections import OrderedDict

# parse & handle data
import os
import csv
import numpy as np
import pandas as pd
import networkx as nx # graph data
import sknetwork

# modeling
import torch
from torch_geometric.nn import GAE, VGAE
from xgboost import XGBClassifier
import sknetwork

# evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [45]:
# set matplotlib and seaborn settings for nicer plots
%matplotlib inline

SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

### 2. Load Data for Modeling

In [17]:
# might take up to a few minutes
(G, G_train, G_trainval, node_info,
 train_tf, val_tf, trainval_tf, test_tf,
 X_train, y_train, X_val, y_val, X_trainval, y_trainval,
 X_test, y_test) = loadData.load_transform(val_ratio = 0.2, test_ratio = 0.1, n2v_train=False)

Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:01<00:00,  9.07it/s]


Enriching train data...


  adj = nx.adjacency_matrix(G, nodelist = nodelist)
  adj = nx.adjacency_matrix(G, nodelist = nodelist)
  adj = nx.adjacency_matrix(G, nodelist = nodelist).toarray()


Enriching validation data...
Enriching test data...


In [46]:
# might take up to a minute
data, _ = autoenc.load(val_ratio = 0.2, test_ratio = 0.1)

Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
Enriching node features...


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 13.72it/s]
  A = nx.adjacency_matrix(G, nodelist=list(G), dtype=float)


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:01<00:00,  9.75it/s]


Create PyTorch Geometric dataset...


In [47]:
# get node embeddings (IF YOU CHANGE SEED = 42, then you need to train new autoencoder as train-val split is different)

# load best autoencoder
path = os.path.abspath("")+"/models/VGNAE_0.001_0_4982a_00000_autoencoder.pt"
model = VGAE(autoenc.Encoder(data.x.size()[1], 64, 1.2, 5, 0.2, 0))
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

# get embeddings of nodes
embedding = autoenc.get_embeddings(model, data.x, data.trainval_edges)
node_emb = pd.DataFrame(embedding).rename(columns = {val: f"x{val+1}" for val in range(embedding.shape[1])})

# enrich train
train_sim = pd.DataFrame(autoenc.get_similarity(model, data.x, data.train_pos_edges, data.train_edges)).rename(columns = {0: "sim"})
train_tf = (train_tf
    .assign(sim = train_sim.sim.values)
    .assign(sim_scaled = (train_sim.sim.values - train_sim.sim.values.min()) / (train_sim.sim.values.max() - train_sim.sim.values.min()))
    .assign(dist = lambda df_: [np.linalg.norm(node_emb.loc[u].values-node_emb.loc[v].values) for u, v in zip(df_.source, df_.target)])
)

# enrich val
val_sim = pd.DataFrame(autoenc.get_similarity(model, data.x, data.train_pos_edges, data.val_edges)).rename(columns = {0: "sim"})
val_tf = (val_tf
    .assign(sim = val_sim.sim.values)
    .assign(sim_scaled = (val_sim.sim.values - val_sim.sim.values.min()) / (val_sim.sim.values.max() - val_sim.sim.values.min()))
    .assign(dist = lambda df_: [np.linalg.norm(node_emb.loc[u].values-node_emb.loc[v].values) for u, v in zip(df_.source, df_.target)])
)

# enrich test
test_sim = pd.DataFrame(autoenc.get_similarity(model, data.x, data.trainval_pos_edges, data.test_edges)).rename(columns = {0: "sim"})
test_tf = (test_tf
    .assign(sim = test_sim.sim.values)
    .assign(sim_scaled = (test_sim.sim.values - test_sim.sim.values.min()) / (test_sim.sim.values.max() - test_sim.sim.values.min()))
    .assign(dist = lambda df_: [np.linalg.norm(node_emb.loc[u].values-node_emb.loc[v].values) for u, v in zip(df_.source, df_.target)])
)

# split
X_train, y_train = loadData.split_frame(train_tf)
X_val, y_val     = loadData.split_frame(val_tf)
X_test, y_test   = loadData.split_frame(test_tf)

In [48]:
X_train

Unnamed: 0,train_mask,val_mask,nodeInfo_dupl,nodeInfo_diff,source_DCT,target_DCT,BCT_diff,SaI,SoI,HProm,...,node2vec_1,node2vec_2,node2vec_3,node2vec_4,friendLink,PR1,PR2,sim,sim_scaled,dist
0,True,False,0,40,0.001847,0.046177,1.342636e-04,0.040000,0.015385,0.200000,...,0.557283,0.779172,0.494198,0.917683,0.0,0.004162,5.753754e-05,0.998753,0.998675,9.415458
1,True,False,0,39,0.001108,0.046177,1.363114e-04,0.000000,0.000000,0.000000,...,0.614453,0.328992,0.216422,0.515967,0.0,0.004186,5.679651e-05,0.636056,0.612702,8.376766
2,True,False,0,37,0.002216,0.046177,1.359018e-04,0.109545,0.045802,0.500000,...,0.587833,0.129990,0.149956,0.365903,0.0,0.004183,5.688328e-05,0.999914,0.999911,9.076403
5,True,False,0,37,0.000369,0.046177,1.371304e-04,0.000000,0.000000,0.000000,...,0.428600,4.267884,3.676824,1.702036,0.0,0.004121,5.876902e-05,0.996487,0.996264,9.238278
6,True,False,0,43,0.001108,0.046177,1.371304e-04,0.051640,0.015625,0.333333,...,0.823316,1.817058,0.853819,1.267661,0.0,0.004121,5.876902e-05,0.994311,0.993949,9.695809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9227,True,False,0,27,0.001108,0.001108,0.000000e+00,0.000000,0.000000,0.000000,...,0.000190,0.016349,0.128112,0.001507,0.0,0.000288,0.000000e+00,0.597307,0.571466,11.521257
9228,True,False,0,32,0.001108,0.000369,0.000000e+00,0.000000,0.000000,0.000000,...,0.007720,0.001883,0.274263,0.004281,0.0,0.000299,4.676405e-10,0.559729,0.531476,10.325033
9229,True,False,0,27,0.001108,0.000739,2.730322e-07,0.000000,0.000000,0.000000,...,0.275338,1.656101,0.010896,1.768237,0.0,0.000314,2.707818e-09,0.616462,0.591850,9.938886
9230,True,False,0,31,0.001108,0.000739,0.000000e+00,0.000000,0.000000,0.000000,...,0.011136,0.847360,0.120730,0.241226,0.0,0.000288,0.000000e+00,0.579637,0.552662,11.032274


In [11]:
# plot correlation with target (first train, then validation)
for df in [train_tf, val_tf]:
    analyseData.plot_corr_matrix(df.iloc[:, 2:])

TypeError: plot() got an unexpected keyword argument 'text_kw'

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

### 3. Unsupervised Link Prediction

In [49]:
def compute_score(df, cols, method, thresh):
    # we assume that all metrics get better with increasing values!
    
    df_ = df[list(cols)]
    
    if method == "rank_avg":
        df_ = df_.rank(pct = True).mean(axis = 1)  
    elif method == "avg":
        df_ = df_.mean(axis = 1)
    elif method == "whitened_sigmoid_avg":
        df_ = pd.DataFrame({col: sknetwork.linkpred.whitened_sigmoid(df_[col].to_numpy()) for col in df_.columns})
        df_ = df_.mean(axis = 1)
        
    if thresh == "top50%":
        y_hat = (df_ > df_.median()).astype(int)
    elif thresh == "thresh":
        y_hat = (df_ > 0.5).astype(int)
    elif thresh == "return_probas":
        y_hat = df_.rank(pct = True)
        
    return y_hat

We only consider a subset of somewhat decorrelated metrics (otherwise this search will run too long). For example, Sorensen Index is highly correlated (.99) with Salton Index, thus it is enough to only include one of them in our below search (same for Adamic Adar and Resource Allocation, but not for Adamic Adar and SCF Resource Allocation).

In [50]:
# which cols we want to use for link prediction
res = OrderedDict()
#removed from columns "CNC", "SCF_RA", "JCC", "AA", "PA_log",
cols = ["SoI", "HProm", # local methods
        "katz_idx", "sim_rank", "root_pagerank", "node2vec_1", "node2vec_4", # global methods
        "friendLink"]                                           # quasi-local methods

methods = ["rank_avg", "avg", "whitened_sigmoid_avg"]
threshs = ["top50%", "thresh"]

# generate all combinations of columns in cols
sampled_cols = []
for n in range(2, len(cols) + 1):
    sampled_cols += list([c for c in combinations(cols, n)])

for s, m, t in tqdm(product(sampled_cols, methods, threshs)):
    y_train_hat = compute_score(X_train, s, m, t)
    y_val_hat   = compute_score(X_val, s, m, t)
    trn_acc     = accuracy_score(y_train, y_train_hat)
    val_acc     = accuracy_score(y_val, y_val_hat)

    res[(s, m, t)] = {"trn_acc": trn_acc, "val_acc": val_acc}

1482it [00:05, 262.86it/s]


In [51]:
ordered_res = (sorted(res.items(), key = lambda kv: kv[1]["val_acc"], reverse = True))

for (col, m, t), val_dict in ordered_res[0:10]:
    print(f"using {col}, {m}, {t}")
    print(f"Train Accuracy {round(val_dict['trn_acc'], 5)}, Val Accuracy {round(val_dict['val_acc'], 5)}\n")

using ('SoI', 'HProm', 'katz_idx', 'node2vec_4'), rank_avg, thresh
Train Accuracy 0.68898, Val Accuracy 0.72396

using ('SoI', 'HProm', 'node2vec_4', 'friendLink'), rank_avg, thresh
Train Accuracy 0.66399, Val Accuracy 0.72396

using ('SoI', 'HProm', 'katz_idx', 'sim_rank', 'root_pagerank', 'node2vec_4'), rank_avg, thresh
Train Accuracy 0.6924, Val Accuracy 0.7235

using ('SoI', 'HProm', 'sim_rank', 'root_pagerank', 'node2vec_4', 'friendLink'), rank_avg, thresh
Train Accuracy 0.67438, Val Accuracy 0.7235

using ('SoI', 'HProm', 'katz_idx', 'sim_rank', 'root_pagerank', 'node2vec_4', 'friendLink'), rank_avg, thresh
Train Accuracy 0.69398, Val Accuracy 0.7235

using ('SoI', 'HProm', 'node2vec_4'), rank_avg, thresh
Train Accuracy 0.65637, Val Accuracy 0.72304

using ('SoI', 'HProm', 'katz_idx', 'sim_rank', 'node2vec_4'), rank_avg, thresh
Train Accuracy 0.69214, Val Accuracy 0.72304

using ('SoI', 'HProm', 'katz_idx', 'root_pagerank', 'node2vec_4'), rank_avg, thresh
Train Accuracy 0.69214, 

In [52]:
# use best settings
s = ['SoI', 'HProm', 'katz_idx', 'node2vec_4']
m = "rank_avg"
t = "thresh"
y_train_hat = compute_score(X_train, s, m, t)
y_val_hat   = compute_score(X_val, s, m, t)
y_test_hat  = compute_score(X_test, s, m, t)

In [53]:
# detailed performance analysis
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Validation performance')
print('-------------------------------------------------------')
print(classification_report(y_val, y_val_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_val, y_val_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_val, y_val_hat))

Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.66      0.77      0.71      3802
           1       0.73      0.60      0.66      3802

    accuracy                           0.69      7604
   macro avg       0.69      0.69      0.69      7604
weighted avg       0.69      0.69      0.69      7604

Validation performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.69      0.81      0.75      1085
           1       0.77      0.64      0.70      1085

    accuracy                           0.72      2170
   macro avg       0.73      0.72      0.72      2170
weighted avg       0.73      0.72      0.72      2170

Roc_auc score
-------------------------------------------------------
0.723963133640553

Confusion matrix
-------------------------------------------------------
[[880 205]
 [394 691]]


In [33]:
# save test predictions
save_test = modeling.save_test_preds(test_tf[['source', 'target']], test_tf, y_test_hat)

In [34]:
# look at predicted labels
save_test.value_counts()

Predicted
0            648
1            436
dtype: int64

let us now combine our local ensemble model with the embedding similarities from our autoencoder

doesn't really work well...

In [54]:
# now let's try to average these predictions with the results from our autoencoder
#removed 'JCC', 'AA', 'PA_log',
s = ['root_pagerank']
m = "rank_avg"
t = "return_probas"
y_train_hat = compute_score(X_train, s, m, t)
y_val_hat   = compute_score(X_val, s, m, t)
y_test_hat  = compute_score(X_test, s, m, t)

In [55]:
# take max
#understand where is sim
y_train_hat = pd.concat([y_train_hat, train_tf.sim.rank(pct = True)], axis = 1).max(axis = 1)
y_train_hat = (y_train_hat > y_train_hat.median()).astype(int)

y_val_hat = pd.concat([y_val_hat, val_tf.sim.rank(pct = True)], axis = 1).max(axis = 1)
y_val_hat = (y_val_hat > y_val_hat.median()).astype(int)

y_test_hat = pd.concat([y_test_hat, test_tf.sim.rank(pct = True)], axis = 1).max(axis = 1)
y_test_hat = (y_test_hat > y_test_hat.median()).astype(int)

In [56]:
# add scaled similarities
alpha = 0.66
beta  = 0.34

y_train_hat = (alpha * y_train_hat + beta * train_tf.sim.rank(pct = True))
y_train_hat = (y_train_hat > y_train_hat.median()).astype(int)

y_val_hat = (alpha * y_val_hat + beta * val_tf.sim.rank(pct = True))
y_val_hat = (y_val_hat > y_val_hat.median()).astype(int)

y_test_hat = (alpha * y_test_hat + beta * test_tf.sim.rank(pct = True))
y_test_hat = (y_test_hat > y_test_hat.median()).astype(int)

In [57]:
# detailed performance analysis
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_hat))

print('Validation performance')
print('-------------------------------------------------------')
print(classification_report(y_val, y_val_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_val, y_val_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_val, y_val_hat))

Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      3802
           1       0.95      0.95      0.95      3802

    accuracy                           0.95      7604
   macro avg       0.95      0.95      0.95      7604
weighted avg       0.95      0.95      0.95      7604

Validation performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      1085
           1       0.84      0.84      0.84      1085

    accuracy                           0.84      2170
   macro avg       0.84      0.84      0.84      2170
weighted avg       0.84      0.84      0.84      2170

Roc_auc score
-------------------------------------------------------
0.8423963133640553

Confusion matrix
-------------------------------------------------------
[[914 171]
 [171 914]]
