In [1]:
import numpy as np
import pandas as pd
from collections import Counter

import networkx as nx
from graphrole import RecursiveFeatureExtractor, RoleExtractor

from pprint import pprint

import warnings
# Silence every warning for the rest of the kernel session so that library
# warnings do not clutter the cell outputs below.
# NOTE(review): this is a blanket filter and may also hide deprecation or
# numerical warnings raised during feature/role extraction — consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Weighted edge list file, given relative to this notebook's directory.
edgelist = '../graphs/erasmus.weighted.edgelist'

# Read the file into a directed graph; `G` is the graph used by every
# later cell (feature extraction and PageRank).
G = nx.read_weighted_edgelist(
    edgelist,
    create_using=nx.DiGraph,
)

In [3]:
# Recursive feature extraction over G, capped at 10 generations.
# This step may take some time.
feature_extractor = RecursiveFeatureExtractor(
    G,
    max_generations=10,
)
df_features = feature_extractor.extract_features()

# Report how many generations were actually run (the recorded run stopped
# at 6, below the cap), then show the node-by-feature table.
print(f'Features extracted from {feature_extractor.generation_count} recursive generations')
display(df_features)

Features extracted from 6 recursive generations


Unnamed: 0,external_edges(mean)(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean)(mean),in_degree(mean)(mean)(mean)(mean),external_edges(mean)(mean)(mean),external_edges(sum)(mean)(mean),in_degree(mean)(mean)(mean),internal_edges(mean)(mean)(mean),external_edges(mean)(mean),external_edges(sum)(mean),in_degree(mean)(mean),...,external_edges(sum),in_degree(mean),internal_edges(mean),out_degree(mean),total_degree(mean),external_edges,in_degree,internal_edges,out_degree,total_degree
A__BADEN01,145872.735983,147234.072295,1648.438477,150100.381828,2.393653e+07,1719.977510,95710.686151,147089.610383,2.735347e+07,1547.934582,...,2091414.0,1929.615385,95401.307692,1734.846154,3664.461538,22113.0,39.0,497.0,57.0,96.0
A__DORNBIR01,140796.345033,139357.315492,1561.016769,136178.848831,2.036462e+07,1519.730122,80424.080669,126590.103192,1.834780e+07,1381.258737,...,7148348.0,1368.935484,54767.532258,1187.483871,2556.419355,67019.0,319.0,7045.0,440.0,759.0
A__EISENST01,146186.842411,147283.474239,1640.960041,148249.994481,2.435319e+07,1648.415919,91963.204196,147461.397326,2.300403e+07,1501.270859,...,683215.0,1217.800000,70065.600000,1358.600000,2576.400000,6751.0,29.0,68.0,26.0,55.0
A__EISENST02,143570.042382,143573.515287,1615.259526,141482.052386,2.269528e+07,1573.059351,84877.269691,137721.623927,2.222955e+07,1562.420831,...,5970714.0,1621.214286,75585.857143,1399.690476,3020.904762,54806.0,147.0,4179.0,198.0,345.0
A__EISENST05,114744.920677,106447.105320,1193.635043,94942.417391,1.302815e+07,1040.738758,52021.453292,79459.623190,7.742854e+06,841.886294,...,324101.0,364.200000,12833.000000,362.000000,726.200000,1802.0,67.0,18.0,10.0,77.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Unknown_PT,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,39.0,0.0,0.0,39.0
Unknown_S,143673.982646,144216.937091,1624.705248,142741.090716,2.317479e+07,1592.639936,86850.885537,140933.715583,1.984955e+07,1576.270540,...,9689570.0,1416.578947,64126.776316,1232.000000,2648.578947,84018.0,531.0,9768.0,154.0,685.0
Unknown_SK,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,4.0,0.0,0.0,4.0
Unknown_TR,149752.078361,153436.749266,1739.489771,157657.419963,2.826685e+07,1744.385806,101158.031593,167362.391370,4.309888e+07,1888.827451,...,477746.0,1316.500000,202491.500000,1756.000000,3072.500000,3512.0,2.0,4.0,4.0,6.0


In [4]:
# Assign node roles from the extracted feature table.
# n_roles=None does not fix the number of roles up front; the recorded run
# ended up with 8 roles (role_0 .. role_7).
role_extractor = RoleExtractor(n_roles=None)
role_extractor.extract_role_factors(df_features)

# Role label per node, keyed by node id.
dic_node_role = role_extractor.roles

# Per-node membership share in each role, rounded to two decimals for display.
df_node_role_percentage = role_extractor.role_percentage.round(decimals=2)

print('Node role membership by percentage')
display(df_node_role_percentage)

Node role membership by percentage


Unnamed: 0,role_0,role_1,role_2,role_3,role_4,role_5,role_6,role_7
A__BADEN01,0.01,0.39,0.01,0.01,0.20,0.01,0.20,0.20
A__DORNBIR01,0.01,0.20,0.01,0.01,0.01,0.01,0.39,0.39
A__EISENST01,0.01,0.39,0.01,0.20,0.20,0.01,0.01,0.20
A__EISENST02,0.00,0.31,0.00,0.00,0.00,0.15,0.52,0.00
A__EISENST05,0.01,0.32,0.01,0.01,0.63,0.01,0.01,0.01
...,...,...,...,...,...,...,...,...
Unknown_PT,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12
Unknown_S,0.00,0.00,0.00,0.00,0.00,0.00,0.61,0.36
Unknown_SK,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12
Unknown_TR,0.00,0.45,0.13,0.00,0.13,0.00,0.27,0.00


In [5]:
# Count how many nodes were assigned to each role label.
role_count = Counter(role for role in dic_node_role.values())
pprint(role_count)

Counter({'role_1': 1213,
         'role_4': 570,
         'role_0': 513,
         'role_6': 321,
         'role_2': 252,
         'role_3': 168,
         'role_7': 149,
         'role_5': 106})


## Feature Grouping
Given a node-feature matrix $V_{n\times f}$ obtained from recursive feature extraction,\
approximate $$GF \approx V$$
$G_{n\times r}$: each node's membership in each role\
$F_{r\times f}$: each role's contribution to each feature

In [6]:
# TODO
# 1. Use Non-negative Matrix Factorization to decompose V.
#    graphrole/roles/factor.py > get_nmf_decomposition()
# Get G to further calculate NodeSense.

In [8]:
# One-column frame of role labels, indexed by node id.
df_node_role = pd.DataFrame.from_dict(
    data=dic_node_role,
    orient="index",
    columns=["roles"],
)

# Attach each node's role label to its feature row (index-aligned left join).
# `features` is always rebuilt from df_features, so this cell is safe to re-run.
features = df_features.join(df_node_role, how="left")

In [9]:
# PageRank score per node of G, returned as a dict keyed by node id.
# alpha is the damping factor (0.9 here; the networkx default is 0.85).
pr = nx.pagerank(G, alpha=0.9)

# The scores are attached to `features` in the following cell through an
# index-aligned join; assigning pr.values() positionally would rely on the
# dict order matching the frame's row order.

In [10]:
# Wrap the PageRank dict in a one-column frame indexed by node id so it can
# be aligned with `features` on the index.
prdf = pd.DataFrame.from_dict(data=pr, orient="index", columns=["pagerank"])

# Drop any `pagerank` column left over from a previous run of this cell
# before joining. Without this, re-running the cell raises
# "ValueError: columns overlap but no suffix specified", because `features`
# is rebound to its own joined result.
features = features.drop(columns="pagerank", errors="ignore").join(prdf)
print(features)

              external_edges(mean)(mean)(mean)(mean)(mean)  \
A__BADEN01                                   145872.735983   
A__DORNBIR01                                 140796.345033   
A__EISENST01                                 146186.842411   
A__EISENST02                                 143570.042382   
A__EISENST05                                 114744.920677   
...                                                    ...   
Unknown_PT                                        0.000000   
Unknown_S                                    143673.982646   
Unknown_SK                                        0.000000   
Unknown_TR                                   149752.078361   
Unknown_UK                                        0.000000   

              external_edges(mean)(mean)(mean)(mean)  \
A__BADEN01                             147234.072295   
A__DORNBIR01                           139357.315492   
A__EISENST01                           147283.474239   
A__EISENST02                   

In [11]:
# Persist the combined per-node table to CSV; the node ids are written as
# the (unnamed) index column.
features.to_csv(path_or_buf="../graphs/node_roles.csv")