# Machine Learning in Network Science
Group Challenge

***
by: Leonardo Basili, Paul Bédier, Lasse Schmidt

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

This notebook computes global graph features (Rooted PageRank and SimRank) for the train/validation and test node pairs and saves them as JSON files.

### 1. Import Packages

In [18]:
# Development helper: re-import the project's own scripts so that edits made
# to the files under util/ are picked up without restarting the kernel.
from importlib import reload

# Bind the module aliases in this cell so it also runs on a fresh kernel
# (Restart & Run All); otherwise the reload calls below raise NameError
# because the aliases are only created by a later import cell.
import util.analyse_Data as analyseData
import util.preprocess_Data as prepData
import util.load_Data as loadData
import util.modeling as modeling
import util.autoencoder as autoenc

reload(analyseData)
reload(prepData)
reload(loadData)
reload(modeling)
reload(autoenc)

<module 'util.autoencoder' from 'c:\\Users\\pbedi\\Documents\\GitHub\\Network-Science_Challenge\\util\\autoencoder.py'>

In [None]:
# import own scripts
import util.analyse_Data as analyseData
import util.preprocess_Data as prepData
import util.load_Data as loadData
import util.modeling as modeling
import util.autoencoder as autoenc

In [None]:
# basic stuff
from itertools import product, combinations
from collections import OrderedDict

# parse & handle data
import os
import csv
import json
import numpy as np
import pandas as pd
import networkx as nx # graph data
import sknetwork

### 2. Rooted Pagerank

In [4]:
# Load the graphs, node information and the train / validation / test tables.
# NOTE: this call might take up to a few minutes.
loaded = loadData.load_transform(testing_ratio=0.2)

# Unpack the results in the order in which load_transform returns them.
(
    G, G_train, node_info,
    train_tf, val_tf, trainval_tf,
    test, test_tf,
    X_train, y_train,
    X_val, y_val,
    X_trainval, y_trainval,
    X_test,
) = loaded

Number of positive edges for training: 4174
Number of positive edges for validation: 1043
Number of edges in original graph: 5217
Number of edges in training graph: 4174
The graph is connected
Enriching train data...
Enriching validation data...
Enriching test data...


In [None]:
# Run rooted pagerank once per distinct source node (root): on G_train for the
# train/validation pairs and on G for the test pairs. The results are stored in
# dicts keyed by str(root) so they can be looked up per pair below (a plain list
# cannot be indexed by node id). dict.fromkeys de-duplicates the roots while
# keeping their first-seen order, so each root is only computed once.
pagerank_G_train = {
    str(root): prepData.rooted_pagerank(G_train, root)
    for root in dict.fromkeys(trainval_tf.node1)
}
pagerank_G = {
    str(root): prepData.rooted_pagerank(G, root)
    for root in dict.fromkeys(test_tf.node1)
}

# Generate dictionaries mapping "<node1>_<node2>" to the score of node2 in the
# pagerank rooted at node1.
# NOTE(review): assumes rooted_pagerank returns a mapping keyed by str(node),
# as the original lookup did -- confirm against util/preprocess_Data.py.
pagerank_trainval = {
    str(u) + "_" + str(v): pagerank_G_train[str(u)][str(v)]
    for u, v in zip(trainval_tf.node1, trainval_tf.node2)
}
pagerank_test = {
    str(u) + "_" + str(v): pagerank_G[str(u)][str(v)]
    for u, v in zip(test_tf.node1, test_tf.node2)
}

# Save dictionaries in json files (json.dump is the stdlib call that writes to
# a file object; the json module has no `save` function).
with open("data/pagerank_trainval.json", "w") as file:
    json.dump(pagerank_trainval, file)
with open("data/pagerank_test.json", "w") as file:
    json.dump(pagerank_test, file)

### 3. SimRank

In [12]:
# Run simrank via the project helper, which is given both graphs (G, G_train)
# and both pair tables (test_tf, trainval_tf) and returns the test and
# train/validation results, in that order.
simrank_test, simrank_trainval = prepData.get_simrank(G, G_train, test_tf, trainval_tf)

# Save the resulting dictionaries in json files (json.dump is the stdlib call
# that writes to a file object; the json module has no `save` function).
with open("data/simrank_trainval.json", "w") as file:
    json.dump(simrank_trainval, file)
with open("data/simrank_test.json", "w") as file:
    json.dump(simrank_test, file)