# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

This notebook covers global graph feature extraction such as Rooted Pagerank and SimRank.

### 1. Import Packages

In [20]:
from importlib import reload
reload(analyseData)
reload(prepData)
reload(loadData)
reload(modeling)
reload(autoenc)

<module 'util.autoencoder' from '/Users/macbookpro/Documents/GitHub/Network-Science_Final-Project/util/autoencoder.py'>

In [2]:
# import own scripts
import util.analyse_Data as analyseData
import util.preprocess_Data as prepData
import util.load_Data as loadData
import util.modeling as modeling
import util.autoencoder as autoenc

In [4]:
# basic stuff
from itertools import product, combinations
from collections import OrderedDict

# parse & handle data
import os
import csv
import json
import numpy as np
import pandas as pd
import networkx as nx # graph data
import sknetwork

# evaluation
from sklearn.metrics import accuracy_score

from tqdm import tqdm

In [5]:
(G, G_train, G_trainval, node_info, train_tf, val_tf, trainval_tf, test_tf) = loadData.load()

Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542


In [26]:
# might take up to a few minutes
reload(analyseData)
reload(prepData)
reload(loadData)
reload(modeling)
reload(autoenc)
(G, G_train, G_trainval, node_info,
 train_tf, val_tf, trainval_tf, test_tf,
 X_train, y_train, X_val, y_val, X_trainval, y_trainval,
 X_test, y_test) = loadData.load_transform(val_ratio = 0.2, test_ratio = 0.1)


Number of positive edges for training: 3802
Number of positive edges for validation: 1085
Number of positive edges for test: 542
Number of edges in original graph: 5429
Number of edges in training graph: 3802
Number of non-existing edges generated: 29971
Number of negative edges for training: 3802
Number of negative edges for validation: 1085
Number of negative edges for test: 542


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:01<00:00,  9.20it/s]


Enriching train data...


  adj = nx.adjacency_matrix(G, nodelist = nodelist)
  adj = nx.adjacency_matrix(G, nodelist = nodelist)
  adj = nx.adjacency_matrix(G, nodelist = nodelist).toarray()


Enriching validation data...
Enriching test data...


### 2. Rooted Pagerank

In [99]:
def compute_save_rooted_pagerank_json(G, df, damp, eps, trainval_tf = False):
    # create dictionary to store result
    res = dict()

    # compute rooted pagerank
    pagerank = {root: prepData.rooted_pagerank(G, root, d = damp, epsilon = eps) for root in sorted(df.source.unique())}

    # only store the edges we actually need in result dict
    for u, v in zip(df.source, df.target):
        res[str(u)+"_"+str(v)] = pagerank[u][v]

    # save in json file
    if trainval_tf:
        fname = f"rooted_pagerank_trainval_d{str(int(damp*100))}_eps{str(eps)}.json"
    else:
        fname = f"rooted_pagerank_test_d{str(int(damp*100))}_eps{str(eps)}.json"

    with open("data/" + fname, "w") as file:
        json.dump(res, file)

In [100]:
print(X_train.head())

   nodeInfo_dupl  nodeInfo_diff  source_DCT  target_DCT      BCT_diff  \
0              0             38    0.001108    0.001847  2.866839e-06   
0              0             26    0.001847    0.001108 -1.822490e-05   
1              0             37    0.001108    0.001108  8.190967e-07   
1              0             41    0.001108    0.001847  1.795187e-05   
2              0             33    0.001108    0.002216  1.228645e-06   

   katz_idx  sim_rank  root_pagerank  node2vec_1  node2vec_2  node2vec_3  \
0       0.0       0.0            0.0    0.000004    0.254515    0.070102   
0       0.0       0.0            0.0    1.296436    0.799854    0.164829   
1       0.0       0.0            0.0    0.001248    0.038168    0.000728   
1       0.0       0.0            0.0    2.122296    1.059141    0.082487   
2       0.0       0.0            0.0    0.000330    0.000312    0.002599   

   node2vec_4  friendLink       PR1           PR2  
0    0.142970         0.0  0.000407  5.812262e-09  


In [101]:
# search space
dampening_facts = [0.5, 0.75, 0.9, 0.95, 0.99]
eps = [1e-4, 1e-6]

# create rooted page rank using different hyperparams
for damp in dampening_facts:
    for e in eps:
        print(f"Computing pagerank using damp {damp} and eps {e}...")

        # trainval edges
        compute_save_rooted_pagerank_json(G_train, trainval_tf, damp = damp, eps = e, trainval_tf = True)

        # test edges
        compute_save_rooted_pagerank_json(G, test_tf, damp = damp, eps = e, trainval_tf = False)

Computing pagerank using damp 0.5 and eps 0.0001...
Computing pagerank using damp 0.5 and eps 1e-06...
Computing pagerank using damp 0.75 and eps 0.0001...
Computing pagerank using damp 0.75 and eps 1e-06...
Computing pagerank using damp 0.9 and eps 0.0001...
Computing pagerank using damp 0.9 and eps 1e-06...
Computing pagerank using damp 0.95 and eps 0.0001...
Computing pagerank using damp 0.95 and eps 1e-06...
Computing pagerank using damp 0.99 and eps 0.0001...
Computing pagerank using damp 0.99 and eps 1e-06...


Let us now find the best hyperparameters of our rooted pagerank by validating each of them with our supervised model.

In [102]:
# get the names of the files
fnames_trainval, fnames_test = [], []

# used search space
dampening_facts = [0.5, 0.75, 0.9, 0.95, 0.99]
eps = [1e-4, 1e-6]

# get names
for damp in dampening_facts:
    for e in eps:
        fnames_trainval.append(f"data/rooted_pagerank_trainval_d{str(int(damp*100))}_eps{str(e)}.json")
        fnames_test.append(f"data/rooted_pagerank_test_d{str(int(damp*100))}_eps{str(e)}.json")

In [103]:
def compute_score(df, cols, method, thresh):
    # we assume that all metrics get better with increasing values!
    df_ = df[list(cols)]
    if method == "rank_avg":
        df_ = df_.rank(pct = True).mean(axis = 1)  
    elif method == "avg":
        df_ = df_.mean(axis = 1)
    elif method == "whitened_sigmoid_avg":
        df_ = pd.DataFrame({col: sknetwork.linkpred.whitened_sigmoid(df_[col].to_numpy()) for col in df_.columns})
        df_ = df_.mean(axis = 1)
        
    if thresh == "top50%":
        y_hat = (df_ > df_.median()).astype(int)
    elif thresh == "thresh":
        y_hat = (df_ > 0.5).astype(int)
    elif thresh == "return_probas":
        y_hat = df_.rank(pct = True)
        
    return y_hat

In [87]:
print(sampled_cols)

[('root_pagerank',), ('sim_rank',), ('root_pagerank', 'sim_rank')]


In [104]:
# where we will store result
res = OrderedDict()

# search space
dampening_facts = [0.5, 0.75, 0.9, 0.95, 0.99]
eps = [1e-4, 1e-6]

# create rooted page rank using different hyperparams
for (trainval, test, (damp, e)) in zip(fnames_trainval, fnames_test, product(dampening_facts, eps)):

    # read json files for rank algorithms
    with open(trainval, "r") as file:
        r_pgr_trainval = json.load(file)
    with open(test, "r") as file:
        r_pgr_test = json.load(file)

    def read_pagerank_json(json, u, v):
        key = str(u)+"_"+str(v)
        if key in json.keys():
            return json[key]

    # append to dataframes
    train_tf = train_tf.assign(root_pagerank = lambda df_: [read_pagerank_json(r_pgr_trainval, u, v) for u, v in zip(df_.source, df_.target)])
    val_tf = val_tf.assign(root_pagerank = lambda df_: [read_pagerank_json(r_pgr_trainval, u, v) for u, v in zip(df_.source, df_.target)])
    test_tf  = test_tf.assign(root_pagerank = lambda df_: [read_pagerank_json(r_pgr_test, u, v) for u, v in zip(df_.source, df_.target)])

    # which cols we want to use for link prediction
    cols = ["root_pagerank"]

    methods = ["rank_avg"]
    threshs = ["thresh"]

    # generate all combinations of columns in cols
    sampled_cols = []
    for n in range(1, len(cols) + 1):
        sampled_cols += list([c for c in combinations(cols, n)])

    for s, m, t in tqdm(product(sampled_cols, methods, threshs)):
        y_train_hat = compute_score(X_train, s, m, t)
        y_val_hat   = compute_score(X_val, s, m, t)
        trn_acc     = accuracy_score(y_train, y_train_hat)
        val_acc     = accuracy_score(y_val, y_val_hat)
        
        y_test_hat  = compute_score(X_test, s, m, t)
        tst_acc     = accuracy_score(test_tf.y, y_test_hat)
        res[(s, m, t, damp, e)] = {"trn_acc": trn_acc, "val_acc": val_acc, "test_acc": tst_acc}

0it [00:00, ?it/s]


TypeError: tuple indices must be integers or slices, not list

In [105]:
ordered_res = (sorted(res.items(), key = lambda kv: kv[1]["val_acc"], reverse = True))

for (col, m, t, damp, e), val_dict in ordered_res[0:30]:
    print(f"using {damp}, {e}, {col}, {m}, {t}")
    print(f"Train Accuracy {round(val_dict['trn_acc'], 5)}, Val Accuracy {round(val_dict['val_acc'], 5)}, Test Accuracy {round(val_dict['test_acc'], 5)} \n")

Apparently hyperparam tuning of pagerank has absolutely no influence on overall score. When investigating the files, it is clear that the pagerank values change quite a lot -- but the global ordering of the pagerank scores of the edges is kept.

### 3. SimRank

In [49]:
# run simrank on G and G_train for each node
simrank_test, simrank_trainval = prepData.get_simrank(G, G_train, test_tf, trainval_tf)

# save resulting dictionaries in json files
with open("data/simrank_trainval.json", "w") as file:
    json.dump(simrank_trainval, file)
with open("data/simrank_test.json", "w") as file:
    json.dump(simrank_test, file)

In [48]:
pagerank_test, pagerank_trainval = prepData.get_simrank(G, G_train, test_tf, trainval_tf)

# save resulting dictionaries in json files
with open("data/pagerank_trainval.json", "w") as file:
    json.dump(pagerank_trainval, file)
with open("data/pagerank_test.json", "w") as file:
    json.dump(pagerank_test, file)

In [7]:
import util.official_Data as officialData

In [8]:
enriched_test = officialData.enrich_test()



In [9]:
enriched_test_tf = pd.merge(test_tf, enriched_test, how = "left", on = ["node1", "node2"])
y_col = enriched_test_tf.pop("y")
enriched_test_tf.insert(2, "y", y_col)

In [11]:
enriched_test_tf.y.value_counts()

0    1748
1    1733
Name: y, dtype: int64