# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

This notebook covers global graph feature extraction such as Rooted Pagerank and SimRank.

### 1. Import Packages

In [5]:
from importlib import reload
reload(analyseData)
reload(prepData)
reload(loadData)
reload(modeling)
reload(autoenc)

<module 'util.autoencoder' from 'c:\\Users\\lasse\\OneDrive\\Dokumente\\2_Bildung\\2_MSc\\1_Classes\\Y2T2_Machine Learning in Network Science\\4_challenge (git)\\Network-Science_Challenge\\util\\autoencoder.py'>

In [1]:
# import own scripts
import util.analyse_Data as analyseData
import util.preprocess_Data as prepData
import util.load_Data as loadData
import util.modeling as modeling
import util.autoencoder as autoenc

In [26]:
# basic stuff
from itertools import product, combinations
from collections import OrderedDict

# parse & handle data
import os
import csv
import json
import numpy as np
import pandas as pd
import networkx as nx # graph data
import sknetwork

# evaluation
from sklearn.metrics import accuracy_score

from tqdm import tqdm

In [3]:
# might take up to a few minutes
(G, G_train, node_info,
 train_tf, val_tf, trainval_tf,
 test, test_tf,
 X_train, y_train, X_val, y_val, X_trainval, y_trainval,
 X_test) = loadData.load_transform(testing_ratio = 0.2)

Number of positive edges for training: 4174
Number of positive edges for validation: 1043
Number of edges in original graph: 5217
Number of edges in training graph: 4174
The graph is connected
Enriching train data...
Enriching validation data...
Enriching test data...


### 2. Rooted Pagerank

In [4]:
def compute_save_rooted_pagerank_json(G, df, damp, eps, trainval = False):
    # create dictionary to store result
    res = dict()

    # compute rooted pagerank
    pagerank = {root: prepData.rooted_pagerank(G, root, d = damp, epsilon = eps) for root in sorted(df.node1.unique())}

    # only store the edges we actually need in result dict
    for u, v in zip(df.node1, df.node2):
        res[str(u)+"_"+str(v)] = pagerank[u][v]

    # save in json file
    if trainval:
        fname = f"rooted_pagerank_trainval_d{str(int(damp*100))}_eps{str(eps)}.json"
    else:
        fname = f"rooted_pagerank_test_d{str(int(damp*100))}_eps{str(eps)}.json"

    with open("data/" + fname, "w") as file:
        json.dump(res, file)

In [5]:
# search space
dampening_facts = [0.5, 0.75, 0.9, 0.95, 0.99]
eps = [1e-4, 1e-6]

# create rooted page rank using different hyperparams
for damp in dampening_facts:
    for e in eps:
        print(f"Computing pagerank using damp {damp} and eps {e}...")

        # trainval edges
        compute_save_rooted_pagerank_json(G_train, trainval_tf, damp = damp, eps = e, trainval = True)

        # test edges
        compute_save_rooted_pagerank_json(G, test_tf, damp = damp, eps = e, trainval = False)

Computing pagerank using damp 0.5 and eps 0.0001...
Computing pagerank using damp 0.5 and eps 1e-06...
Computing pagerank using damp 0.75 and eps 0.0001...
Computing pagerank using damp 0.75 and eps 1e-06...
Computing pagerank using damp 0.9 and eps 0.0001...
Computing pagerank using damp 0.9 and eps 1e-06...
Computing pagerank using damp 0.95 and eps 0.0001...
Computing pagerank using damp 0.95 and eps 1e-06...
Computing pagerank using damp 0.99 and eps 0.0001...
Computing pagerank using damp 0.99 and eps 1e-06...


Let us now find the best hyperparameters of our rooted pagerank by validating each of them with our supervised model.

In [15]:
# get the names of the files
fnames_trainval, fnames_test = [], []

# used search space
dampening_facts = [0.5, 0.75, 0.9, 0.95, 0.99]
eps = [1e-4, 1e-6]

# get names
for damp in dampening_facts:
    for e in eps:
        fnames_trainval.append(f"data/rooted_pagerank_trainval_d{str(int(damp*100))}_eps{str(e)}.json")
        fnames_test.append(f"data/rooted_pagerank_test_d{str(int(damp*100))}_eps{str(e)}.json")

In [17]:
def compute_score(df, cols, method, thresh):
    # we assume that all metrics get better with increasing values!
    
    df_ = df[list(cols)]
    
    if method == "rank_avg":
        df_ = df_.rank(pct = True).mean(axis = 1)  
    elif method == "avg":
        df_ = df_.mean(axis = 1)
    elif method == "whitened_sigmoid_avg":
        df_ = pd.DataFrame({col: sknetwork.linkpred.whitened_sigmoid(df_[col].to_numpy()) for col in df_.columns})
        df_ = df_.mean(axis = 1)
        
    if thresh == "top50%":
        y_hat = (df_ > df_.median()).astype(int)
    elif thresh == "thresh":
        y_hat = (df_ > 0.5).astype(int)
    elif thresh == "return_probas":
        y_hat = df_.rank(pct = True)
        
    return y_hat

In [45]:
# where we will store result
res = OrderedDict()

# search space
dampening_facts = [0.5, 0.75, 0.9, 0.95, 0.99]
eps = [1e-4, 1e-6]

# create rooted page rank using different hyperparams
for (trainval, test, (damp, e)) in zip(fnames_trainval, fnames_test, product(dampening_facts, eps)):

    # read json files for rank algorithms
    with open(trainval, "r") as file:
        r_pgr_trainval = json.load(file)
    with open(test, "r") as file:
        r_pgr_test = json.load(file)

    def read_pagerank_json(json, u, v):
        key = str(u)+"_"+str(v)
        if key in json.keys():
            return json[key]

    # append to dataframes
    train_tf = train_tf.assign(root_pagerank = lambda df_: [read_pagerank_json(r_pgr_trainval, u, v) for u, v in zip(df_.node1, df_.node2)])
    val_tf =     val_tf.assign(root_pagerank = lambda df_: [read_pagerank_json(r_pgr_trainval, u, v) for u, v in zip(df_.node1, df_.node2)])
    test_tf  =  test_tf.assign(root_pagerank = lambda df_: [read_pagerank_json(r_pgr_test, u, v) for u, v in zip(df_.node1, df_.node2)])

    # which cols we want to use for link prediction
    cols = ["root_pagerank"]

    methods = ["rank_avg"]
    threshs = ["thresh"]

    # generate all combinations of columns in cols
    sampled_cols = []
    for n in range(1, len(cols) + 1):
        sampled_cols += list([c for c in combinations(cols, n)])

    for s, m, t in tqdm(product(sampled_cols, methods, threshs)):
        y_train_hat = compute_score(X_train, s, m, t)
        y_val_hat   = compute_score(X_val, s, m, t)
        trn_acc     = accuracy_score(y_train, y_train_hat)
        val_acc     = accuracy_score(y_val, y_val_hat)
        
        y_test_hat  = compute_score(X_test, s, m, t)
        tst_acc     = accuracy_score(enriched_test_tf.y, y_test_hat)
        res[(s, m, t, damp, e)] = {"trn_acc": trn_acc, "val_acc": val_acc, "test_acc": tst_acc}

1it [00:00, 71.44it/s]
1it [00:00, 119.50it/s]
1it [00:00, 123.20it/s]
1it [00:00, 143.11it/s]
1it [00:00, 145.05it/s]
1it [00:00, 140.89it/s]
1it [00:00, 142.60it/s]
1it [00:00, 167.36it/s]
1it [00:00, 166.69it/s]
1it [00:00, 166.27it/s]


In [46]:
ordered_res = (sorted(res.items(), key = lambda kv: kv[1]["val_acc"], reverse = True))

for (col, m, t, damp, e), val_dict in ordered_res[0:30]:
    print(f"using {damp}, {e}, {col}, {m}, {t}")
    print(f"Train Accuracy {round(val_dict['trn_acc'], 5)}, Val Accuracy {round(val_dict['val_acc'], 5)}, Test Accuracy {round(val_dict['test_acc'], 5)} \n")

using 0.5, 0.0001, ('root_pagerank',), rank_avg, thresh
Train Accuracy 0.99761, Val Accuracy 0.71811, Test Accuracy 0.74145 

using 0.5, 1e-06, ('root_pagerank',), rank_avg, thresh
Train Accuracy 0.99761, Val Accuracy 0.71811, Test Accuracy 0.74145 

using 0.75, 0.0001, ('root_pagerank',), rank_avg, thresh
Train Accuracy 0.99761, Val Accuracy 0.71811, Test Accuracy 0.74145 

using 0.75, 1e-06, ('root_pagerank',), rank_avg, thresh
Train Accuracy 0.99761, Val Accuracy 0.71811, Test Accuracy 0.74145 

using 0.9, 0.0001, ('root_pagerank',), rank_avg, thresh
Train Accuracy 0.99761, Val Accuracy 0.71811, Test Accuracy 0.74145 

using 0.9, 1e-06, ('root_pagerank',), rank_avg, thresh
Train Accuracy 0.99761, Val Accuracy 0.71811, Test Accuracy 0.74145 

using 0.95, 0.0001, ('root_pagerank',), rank_avg, thresh
Train Accuracy 0.99761, Val Accuracy 0.71811, Test Accuracy 0.74145 

using 0.95, 1e-06, ('root_pagerank',), rank_avg, thresh
Train Accuracy 0.99761, Val Accuracy 0.71811, Test Accuracy 0.

Apparently hyperparam tuning of pagerank has absolutely no influence on overall score. When investigating the files, it is clear that the pagerank values change quite a lot -- but the global ordering of the pagerank scores of the edges is kept.

### 3. SimRank

In [12]:
# run simrank on G and G_train for each node
simrank_test, simrank_trainval = prepData.get_simrank(G, G_train, test_tf, trainval_tf)

# save resulting dictionaries in json files
with open("data/simrank_trainval.json", "w") as file:
    json.save(simrank_trainval, file)
with open("data/simrank_test.json", "w") as file:
    json.save(simrank_test, file)