# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

This notebook covers deep learning techniques, namely:
- Variational Graph Normalized Auto-Encoders (based on https://arxiv.org/abs/2108.08046) which allow us to learn graph embeddings in an unsupervised way (based on graph structure and node embeddings)

### 1. Import Packages

In [181]:
from importlib import reload
reload(analyseData)
reload(loadData)
reload(modeling)
reload(autoenc)

<module 'util.autoencoder' from '/Users/macbookpro/Desktop/LABS/Network-Science_Final-Project/util/autoencoder.py'>

In [3]:
# import own scripts
import util.analyse_Data as analyseData
#import util.preprocess_Data as prepData
import util.load_Data as loadData
import util.modeling as modeling
import util.autoencoder as autoenc

In [6]:
# parse & handle data
import os
import numpy as np
import pandas as pd

# modeling
import torch
from torch_geometric.nn import GAE, VGAE

# hyperparam optimization
from ray import tune, air

# evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


# visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# load raw data
node_info, edgelist, class_to_idx_dict, idx_to_class_dict = loadData.load_raw()
G = loadData.init_nx_graph(edgelist)
nodes = set(edgelist.source).union(edgelist.target)
print(nodes)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [7]:
# set matplotlib and seaborn settings for nicer plots
%matplotlib inline

SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

### 2. Load Data for Modeling

In [153]:
reload(loadData)
(G, G_train, G_trainval, node_info, train_tf, val_tf, trainval_tf, test_tf) = loadData.load()


Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542
3802
3802


In [154]:
print(f"sum of train pos edges: {((trainval_tf['y'] == 1) & (trainval_tf['train_mask'] == 1)).sum()}")
print(f"sum of train neg edges: {((trainval_tf['y'] == 0) & (trainval_tf['train_mask'] == 1)).sum()}")
print(f"sum of val pos edges: {((trainval_tf['y'] == 1) & (trainval_tf['val_mask'] == 1)).sum()}")
print(f"sum of val neg edges: {((trainval_tf['y'] == 0) & (trainval_tf['val_mask'] == 1)).sum()}")

sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085


In [158]:
# might take up to a minute
reload(loadData)
reload(modeling)
reload(autoenc)
data, (G, G_train, G_trainval, node_info, train_tf, val_tf, trainval_tf, test_tf) = autoenc.load()

Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
Enriching node features...


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 14.00it/s]
  A = nx.adjacency_matrix(G, nodelist=list(G), dtype=float)


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 10.84it/s]


Create PyTorch Geometric dataset...


In [141]:
print(train_tf.index.has_duplicates)
print(val_tf.index.has_duplicates)
print(trainval_tf.index.has_duplicates)

False
False
False


In [159]:
print((trainval_tf['y'] == 1).sum())
print((trainval_tf['y'] == 0).sum())
print((train_tf['y'] == 1).sum())
print((train_tf['y'] == 0).sum())
print((val_tf['y'] == 1).sum())
print((val_tf['y'] == 0).sum())
print(len(trainval_tf))
"""print(f"sum of train pos edges: {((trainval_tf['y'] == 1) & (trainval_tf['is_train'] == 1)).sum()}")
print(f"sum of train neg edges: {((trainval_tf['y'] == 0) & (trainval_tf['is_train'] == 1)).sum()}")
print(f"sum of val pos edges: {((trainval_tf['y'] == 1) & (trainval_tf['is_val'] == 1)).sum()}")
print(f"sum of val neg edges: {((trainval_tf['y'] == 0) & (trainval_tf['is_val'] == 1)).sum()}")"""
print(f"sum of train pos edges: {((trainval_tf['y'] == 1) & (trainval_tf['train_mask'] == 1)).sum()}")
print(f"sum of train neg edges: {((trainval_tf['y'] == 0) & (trainval_tf['train_mask'] == 1)).sum()}")
print(f"sum of val pos edges: {((trainval_tf['y'] == 1) & (trainval_tf['val_mask'] == 1)).sum()}")
print(f"sum of val neg edges: {((trainval_tf['y'] == 0) & (trainval_tf['val_mask'] == 1)).sum()}")
print(f"duplicates: {((trainval_tf['train_mask'] == 1) & (trainval_tf['val_mask'] == 1)).sum()}")
print(f"duplicates: {((trainval_tf['train_mask'] == 0) & (trainval_tf['val_mask'] == 0)).sum()}")
print(f"sum of val pos edges: {(trainval_tf['y'] == 1 & (trainval_tf['val_mask'] == 1)).sum()}")
print(f"sum of val edges: {((trainval_tf['val_mask'] == 1)).sum()}")
print(f"sum of train edges: {((trainval_tf['train_mask'] == 0)).sum()}")
print(f"sum of val edges: {((trainval_tf['val_mask'] == 0)).sum()}")
print(f"sum of train edges: {((trainval_tf['train_mask'] == 1)).sum()}")
print(f"train edges: {((trainval_tf['train_mask'] == 1) & (trainval_tf['val_mask'] == 0)).sum()}")
print(f"val edges: {((trainval_tf['train_mask'] == 0) & (trainval_tf['val_mask'] == 1)).sum()}")


4887
4887
3802
3802
1085
1085
9774
sum of train pos edges: 3802
sum of train neg edges: 3802
sum of val pos edges: 1085
sum of val neg edges: 1085
duplicates: 0
duplicates: 0
sum of val pos edges: 4887
sum of val edges: 2170
sum of train edges: 2170
sum of val edges: 7604
sum of train edges: 7604
train edges: 7604
val edges: 2170


In [161]:

print(data.val_pos_edges.shape[1])
print(data.val_neg_edges.shape[1])
print(data.val_edges.shape[1])

print(data.train_pos_edges.shape[1])
print(data.train_neg_edges.shape[1])
print(data.train_edges.shape[1])
"""Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542"""

1085
1085
2170
3802
3802
7604


'Number of positive edges for training: 3802\nNumber of positive edges for validation: 1085\nNumber of positive edges for test: 542\nNumber of edges in original graph: 5429\nNumber of edges in training graph: 3802\nNumber of non-existing edges generated: 29971\nNumber of negative edges for training: 3802\nNumber of negative edges for validation: 1085\nNumber of negative edges for test: 542'

In [162]:
# where to save trial results to
ray_path = os.path.abspath("")+"/ray_results"
if not os.path.isdir(ray_path):
    os.mkdir(ray_path)

### 3. VGNAE Node Embeddings

https://github.com/SeongJinAhn/VGNAE/blob/main/main.py for Variational Graph Normalized Auto-Encoders

#### 3.1 Hyperparameter tuning

In [163]:
# tunable hyperparameter search space --> search using tune.choice([]), tune.uniform(lower, upper), tune.grid_search([])
config = {
    # log params in raytune
    "ray": True,
    
    # print results per epoch
    "verbose": False,
    
    # basic infos
    "data": data,
    "max_epochs": 50,
    "save": True, # if we want to save best model on validation set
    
    # model
    "model": "VGNAE",
    
    ## encoder
    "enc_channels": 64,
    "scaling": 1.8,
    "num_prop": tune.grid_search([4, 16, 32, 64, 128]),
    "teleport": 0, # tune.grid_search([0, 0.1, 0.2]),
    "dropout": 0, # tune.grid_search([0, 0.1, 0.2]),
    
    # optimizer
    "lr": tune.grid_search([1e-3, 1e-4, 1e-5]),
    "wd": 0,
}

In [164]:
# how many trials to run (if grid_search utilized, it will run this number per grid_search value)
num_samples = 1

# run experiment
result_grid = autoenc.run_ray_experiment(
    autoenc.train_validate, config, ray_path, num_samples,
    metric_columns = ["trn_auc", "val_auc", "max_val_auc", "training_iteration"],
    parameter_columns = ["scaling", "num_prop", "lr"]
)

0,1
Current time:,2023-04-22 15:17:28
Running for:,00:05:13.99
Memory:,14.7/16.0 GiB

Trial name,status,loc,scaling,num_prop,lr,trn_auc,val_auc,max_val_auc,training_iterat ion
VGNAE_0.001_0_4c954_00000,TERMINATED,127.0.0.1:70601,1.8,4,0.001,0.941873,0.817512,0.825806,50
VGNAE_0.0001_0_4c954_00001,TERMINATED,127.0.0.1:70614,1.8,4,0.0001,0.693582,0.661751,0.661751,50
VGNAE_1e-05_0_4c954_00002,TERMINATED,127.0.0.1:70601,1.8,4,1e-05,0.693319,0.661751,0.661751,50
VGNAE_0.001_0_4c954_00003,TERMINATED,127.0.0.1:70614,1.8,16,0.001,0.940558,0.814747,0.823963,50
VGNAE_0.0001_0_4c954_00004,TERMINATED,127.0.0.1:70601,1.8,16,0.0001,0.705155,0.658525,0.658986,50
VGNAE_1e-05_0_4c954_00005,TERMINATED,127.0.0.1:70614,1.8,16,1e-05,0.705155,0.654839,0.6553,50
VGNAE_0.001_0_4c954_00006,TERMINATED,127.0.0.1:70601,1.8,32,0.001,0.941084,0.814747,0.819355,50
VGNAE_0.0001_0_4c954_00007,TERMINATED,127.0.0.1:70614,1.8,32,0.0001,0.705155,0.659908,0.659908,50
VGNAE_1e-05_0_4c954_00008,TERMINATED,127.0.0.1:70601,1.8,32,1e-05,0.705024,0.657604,0.657604,50
VGNAE_0.001_0_4c954_00009,TERMINATED,127.0.0.1:70614,1.8,64,0.001,0.94161,0.813825,0.818433,50


2023-04-22 15:12:13,361	INFO worker.py:1553 -- Started a local Ray instance.
2023-04-22 15:17:28,419	INFO tune.py:798 -- Total run time: 314.03 seconds (305.95 seconds for the tuning loop).


#### 3.2 Result of Hyperparameter tuning

In [165]:
restored_tuner, result_grid = autoenc.open_validate_ray_experiment(
    "ray_results/train_validate_2023-04-22_15-12-11",
    autoenc.train_validate
)




Loading results from ray_results/train_validate_2023-04-22_15-12-11...


2023-04-22 15:21:06,539	INFO experiment_analysis.py:789 -- No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.


Done!

No errors! Number of terminated trials: 15


In [166]:
# get best score per trial (highest validation accuracy)
N = 10
best_result_df = result_grid.get_dataframe(
    filter_metric="val_auc", filter_mode="max"
)
best_result_df = best_result_df[["trial_id", "training_iteration", "config/enc_channels",
                                 "config/scaling", "config/num_prop", "config/lr", "config/wd", 
                                 "trn_loss", "val_loss", "trn_auc", "val_auc"]]
best_result_df = best_result_df.sort_values(by=["val_auc"], ascending = False)

if len(result_grid) > N:
    best_result_df = best_result_df.head(N)

best_result_df

Unnamed: 0,trial_id,training_iteration,config/enc_channels,config/scaling,config/num_prop,config/lr,config/wd,trn_loss,val_loss,trn_auc,val_auc
0,4c954_00000,42,64,1.8,4,0.001,0,5.112204,1.409582,0.948185,0.825806
3,4c954_00003,41,64,1.8,16,0.001,0,5.324944,1.422033,0.950026,0.823963
6,4c954_00006,47,64,1.8,32,0.001,0,5.056614,1.652868,0.940163,0.819355
9,4c954_00009,48,64,1.8,64,0.001,0,4.720571,1.399702,0.944503,0.818433
12,4c954_00012,48,64,1.8,128,0.001,0,4.720572,1.399701,0.944503,0.818433
1,4c954_00001,1,64,1.8,4,0.0001,0,9.989462,6.556673,0.693056,0.661751
2,4c954_00002,1,64,1.8,4,1e-05,0,9.989462,6.556799,0.693056,0.661751
10,4c954_00010,35,64,1.8,64,0.0001,0,10.118744,7.35016,0.705155,0.660369
13,4c954_00013,35,64,1.8,128,0.0001,0,10.118744,7.35016,0.705155,0.660369
7,4c954_00007,35,64,1.8,32,0.0001,0,10.118716,7.350062,0.705024,0.659908


#### 3.3 Embeddings based on best model

In [167]:
# load best autoencoder
path = os.path.abspath("")+"/models/VGNAE_0.001_0_4c954_00000\\autoencoder.pt"
#path = "models/autoencoder.pt"
model = VGAE(autoenc.Encoder(data.x.size()[1], 64, 1.5, 4, 0, 0))
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [22]:
print(type(data))

<class 'torch_geometric.data.data.Data'>


In [169]:
# get embeddings of nodes
embedding = autoenc.get_embeddings(model, data.x, data.train_pos_edges)

In [24]:

print(len(np.unique(embedding[:, 0])))

# Count NaN values in the vector
print(np.isnan(embedding[:, 0]).sum())

2708
0


In [170]:
# plot embedding
plt.scatter(
    embedding[:, 0],
    embedding[:, 1])
plt.gca().set_aspect('equal', 'datalim')
plt.title('VGNAE projection (first 2 dim) of nodes')

Text(0.5, 1.0, 'VGNAE projection (first 2 dim) of nodes')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

Embeddings look like a sphere, we cannot do any meaningful clusering based on this.

Let us now take our original VGNAE embeddings and put them into a pandas dataframe.

In [171]:
node_emb = pd.DataFrame(embedding).rename(columns = {val: f"x{val+1}" for val in range(embedding.shape[1])})

node_emb

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x55,x56,x57,x58,x59,x60,x61,x62,x63,x64
0,-3.631031,0.028460,0.926335,0.548944,-2.193384,0.731222,1.397902,-1.545606,1.244934,1.000957,...,1.358602,-0.166094,-0.160326,1.100742,0.812491,0.704597,0.639013,0.758376,0.568736,0.215883
1,-0.628823,-0.912323,0.148418,0.129887,-1.000357,-0.196018,0.929288,-0.577871,0.262739,-0.001889,...,-1.556029,0.267822,1.300768,-0.427655,0.539979,0.189573,1.328683,-0.685855,-1.975274,0.329615
2,-0.532698,-1.655529,-0.401727,0.540130,-0.163291,0.445807,0.616954,0.578093,-1.522068,0.336746,...,-0.092530,-0.547607,0.559317,0.046573,-0.247027,1.177279,-0.246121,-0.709579,-0.604069,-0.077611
3,-0.868176,-0.862198,-2.047325,-0.001196,0.021318,1.498208,-0.649617,-0.621859,0.534795,-0.572370,...,1.019343,-0.060611,0.787599,-1.255767,-0.586231,0.898220,0.127469,1.042326,0.220740,-1.133598
4,-0.422011,0.741677,1.214974,-1.098017,0.588002,0.224144,-0.653673,0.718662,0.375990,0.592233,...,0.916731,0.274831,-0.720984,-0.199879,-0.208231,-0.757123,-2.359159,0.350891,-0.981901,1.005091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,-0.510620,-0.318713,0.370595,-1.302752,0.970978,-1.405791,-1.257336,-0.411335,-0.240713,-1.554785,...,-0.990376,-1.784252,-0.037024,-0.316709,-0.957318,0.321584,0.484114,0.216402,0.538928,-0.736905
2704,-2.070222,0.338095,1.204159,-0.458026,0.531755,1.257530,-0.139964,1.501166,0.246643,0.710152,...,1.511560,0.023357,1.944482,-1.088314,-1.085923,-0.271345,0.466946,-0.680112,0.460005,0.125971
2705,0.345489,-0.906130,0.703535,-0.096268,2.555202,0.376796,-1.372653,1.828496,-1.323692,-0.089441,...,-1.178744,0.424242,-0.661691,-1.280372,-0.989201,-0.125848,0.536621,-0.372583,1.535238,-1.350510
2706,-0.300246,-0.125582,0.530144,0.632183,2.185719,0.849456,0.142994,1.541359,-1.005819,0.318848,...,-0.610017,0.270635,-1.764078,0.850458,-1.076462,1.976640,0.914309,-0.265571,1.933712,-2.279320


In [75]:

trainval_tf2 = trainval_tf
trainval_tf2['train_mask'] = (trainval_tf2['y'] == 1)
print(trainval_tf.head())
print(trainval_tf2.head())
print(len(train_tf))
print(len(val_tf))

train_tf['train_mask'] = True

   y  source  target  train_mask
0  0       1      21       False
1  0    1698    2609       False
2  0       1     905       False
3  0     690    1698       False
4  0       1     906       False
   y  source  target  train_mask
0  0       1      21       False
1  0    1698    2609       False
2  0       1     905       False
3  0     690    1698       False
4  0       1     906       False
7604
2170
sim     0
y       0
pred    0
dtype: int64


#### 3.4 Compute edge features based on best model

In [172]:
# predict train
y_train_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.train_pos_edges, data.train_edges))

y_train_hat = (y_train_hat
    .rename(columns = {0: "sim"})
    .assign(y = trainval_tf.loc[trainval_tf.train_mask == True].y.values)
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)

print("ROC: ", roc_auc_score(y_train_hat.y, y_train_hat.sim))
print("Acc: ", accuracy_score(y_train_hat.y, y_train_hat.pred))

7604
7604
ROC:  0.9764215364930167
Acc:  0.948185165702262


In [15]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y_train_hat.loc[y_train_hat['y'] == 0, 'sim'], y_train_hat.loc[y_train_hat['y'] == 0, 'y'], label='0')
ax.scatter(y_train_hat.loc[y_train_hat['y'] == 1, 'sim'], y_train_hat.loc[y_train_hat['y'] == 1, 'y'], label='1')
ax.legend()
plt.xlabel('sim')
plt.ylabel('y')
plt.savefig('scatter_plot.png')
plt.close()

In [173]:
y_train_hat_clean = y_train_hat[["sim", "y"]].apply(pd.to_numeric, errors='coerce').dropna()
sns.pairplot(y_train_hat_clean[["sim", "y"]], hue = "y")

<seaborn.axisgrid.PairGrid at 0x7fdd18d41a00>

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

In [28]:
print(len(trainval_tf))
print(len(train_tf))
print(len(y_train_hat))
print(len(val_tf))
print(len(y_val_hat))
print(data.train_pos_edges.shape[1])
print(data.train_edges.shape[1])
print(data.val_edges.shape[1])
"""Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542"""

9774
7604
4942
2170
4832
2361
4942
4832


'Number of positive edges for training: 3802\nNumber of positive edges for validation: 1085\nNumber of positive edges for test: 542\nNumber of edges in original graph: 5429\nNumber of edges in training graph: 3802\nNumber of non-existing edges generated: 29971\nNumber of negative edges for training: 3802\nNumber of negative edges for validation: 1085\nNumber of negative edges for test: 542'

In [174]:
# predict val
y_val_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.train_pos_edges, data.val_edges))
print(len(y_val_hat))
y_val_hat = (y_val_hat
    .rename(columns = {0: "sim"})
    .assign(y = trainval_tf.loc[trainval_tf.val_mask == True].y.values)
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)
print("ROC: ", roc_auc_score(y_val_hat.y, y_val_hat.sim))
print("Acc: ", accuracy_score(y_val_hat.y, y_val_hat.pred))

2170
ROC:  0.889936503217312
Acc:  0.8258064516129032


In [41]:
len(val_tf)


2170

In [175]:
tmp = (val_tf
    .assign(sim  = y_val_hat.pred.values)
    .assign(dist = lambda df_: [np.linalg.norm(node_emb.loc[u].values-node_emb.loc[v].values) for u, v in zip(df_.source, df_.target)])
)

In [84]:
sns.pairplot(tmp[["sim", "dist", "y"]], hue = "y")

NameError: name 'tmp' is not defined

In [179]:
tmp[['target', 'source', 'y', 'sim', 'dist']].corr()

Unnamed: 0,target,source,y,sim,dist
target,1.0,0.320768,-0.493566,-0.420214,0.199539
source,0.320768,1.0,-0.487168,-0.398406,0.319183
y,-0.493566,-0.487168,1.0,0.651613,-0.285842
sim,-0.420214,-0.398406,0.651613,1.0,-0.27346
dist,0.199539,0.319183,-0.285842,-0.27346,1.0


In [180]:
# predict test
y_test_hat = pd.DataFrame(autoenc.get_similarity(model, data.x, data.trainval_pos_edges, data.test_edges))
y_test_hat = (y_test_hat
    .rename(columns = {0: "sim"})
    .assign(pred = lambda df_: (df_.sim > df_.sim.median()).astype(int))
)
y_test_hat[["pred"]].value_counts()

pred
0       542
1       542
dtype: int64