# Compute Graph Centrality Measures

Takes DBpedia data and computes Graph Centrality Measures. In order to recreate the data, the "Instance-types" and "Mappingbased-objects" files listed below need to be downloaded first.
- Both are accessible via the following link: https://downloads.dbpedia.org/repo/dbpedia/mappings/
- Before executing, the files need to be merged into one, for instance by concatenating them in the shell like:
    - cat mappingbased-objects_lang=en.ttl instance-types_lang=en_specific.ttl > mappingbased-objects_instance-types.ttl

In [3]:
# Instance types
# Version: 2021.12.01/ 24-Jan-2022 00:21
# Specific: 24-Jan-2022 00:20            44905249
# Transitive: 24-Jan-2022 00:20           145896226

# Mapping based objects
# Version: 2021.12.01/ 21-Apr-2022 23:33
# 24-Jan-2022 00:34           184053203

# Category data
# https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2021-12
# Version: 2021.12.01

# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from networkx import Graph as NXGraph
import igraph as ig
from igraph import Graph as iGraph
from rdflib import Graph as RDFGraph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
import csv
import statistics
import collections

#import danker # https://github.com/athalhammer/danker
from scipy import stats
from urllib import parse

import mlnotify # https://github.com/aporia-ai/mlnotify
# Use %%notify at beginning of cell

import os.path, time
# Use %%time at beginning of cell

# Report the interpreter and library versions so that results can be tied to a
# concrete environment.
print('------------')
# NOTE(review): `!python` runs whichever `python` is first on PATH; presumably
# that is the kernel's interpreter -- confirm when using virtual environments.
!python --version
print("Pandas " + pd.__version__)
print("Numpy " + np.__version__)
print("Networkx " + nx.__version__)
print("iGraph " + ig.__version__)
print('------------')
print("All packages loaded and ready to roll :-)")

------------
Python 3.7.6
Pandas 1.3.4
Numpy 1.18.1
Networkx 2.4
iGraph 0.9.11
------------
All packages loaded and ready to roll :-)


In [4]:
# Function for displaying the first X rows of triples in a rdf graph

def showXGraph(graph, rows):
    """Print the first ``rows`` triples of an RDF graph.

    Parameters
    ----------
    graph : iterable of (subject, predicate, object) triples
        For example an rdflib ``Graph``.
    rows : int
        Maximum number of triples to print. Values <= 0 print nothing.

    Each triple is printed on one line, followed by a separator line.
    """
    for index, (sub, pred, obj) in enumerate(graph):
        # Check the limit *before* printing so that exactly `rows` triples
        # are shown (checking afterwards would print one extra triple).
        if index >= rows:
            break
        print(sub, pred, obj)
        print("------------------")
            
            
# Function for calculating the mean

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    An empty sequence yields 0.0, because the divisor is never allowed to
    drop below 1.
    """
    total = float(sum(numbers))
    count = len(numbers)
    divisor = count if count >= 1 else 1
    return total / divisor


# Function for calculating the number of pendants of the NetworkX graph
  # pendant = terminal node that ends the graph (i.e. a node with degree 1)
    
def number_of_pendants(g):
    """Count the nodes of ``g`` whose degree is exactly 1.

    ``g`` must be iterable over its nodes and expose a ``degree`` mapping
    that can be indexed by node (as a NetworkX graph does).
    """
    return sum(1 for node in g if g.degree[node] == 1)

In [5]:
%%time
#%%notify

# Load the merged "mapping based objects x instance types" Turtle file into an
# rdflib graph (`g1`); the later cells build on this graph.
# takes around 40-60 min to load on the DWS Server (tested on dws-01/dws-02)

path_ttl = "../data/DBpedia_databus/mappingbased-objects_instance-types.ttl" # insert the path to the concatenated DBpedia Turtle file
g1 = RDFGraph()
g1.parse(path_ttl, format='turtle')
print("RDFLib Graph loaded successfully with {} triples".format(len(g1)))

# Output: RDFLib Graph loaded successfully with 29489019 triples


# Header for the timing report that %%time prints after the cell output
print()
print("TIME")
print("============")

TIME
CPU times: user 442 µs, sys: 445 µs, total: 887 µs
Wall time: 805 µs


In [4]:
# Print a few triples as a sanity check that the Turtle file was parsed as expected
showXGraph(g1, 10)

http://dbpedia.org/resource/2021–22_A.S._Roma_season__Matías_Viña__1 http://dbpedia.org/ontology/position http://dbpedia.org/resource/Defender_(association_football)
------------------
http://dbpedia.org/resource/Mick_Lawlor_(footballer) http://dbpedia.org/ontology/team http://dbpedia.org/resource/League_of_Ireland_XI
------------------
http://dbpedia.org/resource/Maurice_Banide http://dbpedia.org/ontology/careerStation http://dbpedia.org/resource/Maurice_Banide__CareerStation__1
------------------
http://dbpedia.org/resource/Gulab_Mohanlal_Hiranandani http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://dbpedia.org/ontology/MilitaryPerson
------------------
http://dbpedia.org/resource/2021–22_AaB_season http://dbpedia.org/ontology/manager http://dbpedia.org/resource/Martí_Cifuentes
------------------
http://dbpedia.org/resource/Matthew_Otten__CareerStation__5 http://dbpedia.org/ontology/team http://dbpedia.org/resource/Mersey_Tigers
------------------
http://dbpedia.org/resource/Pet

### Network Analysis in NetworkX

In [5]:
%%time
#%%notify

# Transform the rdflib graph into a NetworkX graph
# takes around 10-20 min to transform
# The resulting graph is undirected and its edges carry `triples` and `weight`
# attributes (see the direction check and the iGraph summary further down).

G = rdflib_to_networkx_graph(g1)
print("NetworkX Graph loaded successfully with length {}".format(len(G)))

# Output: NetworkX Graph loaded successfully with length 8243333

# Header for the timing report that %%time prints after the cell output
print()
print("TIME")
print("============")

NetworkX Graph loaded successfully with length 8243333

TIME
CPU times: user 7min 12s, sys: 1min 2s, total: 8min 14s
Wall time: 8min 15s


In [6]:
%%time

# Analysis: basic structural properties of the NetworkX graph

# Network size
print("NETWORK SIZE")
print("============")
print("The network has {} nodes and {} edges".format(G.number_of_nodes(), G.number_of_edges()))
print()

# Pendants (nodes with degree 1, counted by the helper defined above)
print("PENDANTS")
print("============")
print("The network has {} pendants".format(number_of_pendants(G)))
print()

# Density
print("DENSITY")
print("============")
print("The network density is {}".format(nx.density(G)))
print()

# Network direction
print("DIRECTED")
print("============")
print("The network direction is {}".format(nx.is_directed(G)))
print() # --> The network direction is False (the graph is undirected)

# Connectivity
print("CONNECTIVITY")
print("============")
print("The network connectivity is {}".format(nx.node_connectivity(G)))
print() # --> The network connectivity is 0 (the graph is not connected)


# Output

#NETWORK SIZE
#============
#The network has 8243333 nodes and 28263172 edges

#PENDANTS
#============
#The network has 1693880 pendants

#DENSITY
#============
#The network density is 8.318504610951633e-07

#CONNECTIVITY
#============
#The network connectivity is 0

NETWORK SIZE
The network has 8243333 nodes and 28263172 edges

PENDANTS
The network has 1693880 pendants

DENSITY
The network density is 8.318504610951633e-07

DIRECTED
The network direction is False

CONNECTIVITY
The network connectivity is 0

CPU times: user 2min 1s, sys: 17.5 s, total: 2min 18s
Wall time: 2min 18s


In [7]:
%%time

# Compute NetworkX degree centrality
  # takes less than 1 min


#mean and stdev
dc = nx.degree_centrality(G)
degreesDC = []
for k,v in dc.items():
    degreesDC.append(v)
    
print("DEGREE CENTRALITY")
print("=================")
print("The mean degree centrality is {}, with stdev {}".format(mean(degreesDC), statistics.stdev(degreesDC)))
print("The maximum node is {}, with value {}".format(max(dc, key=dc.get), max(dc.values())))
print("The minimum node is {}, with value {}".format(min(dc, key=dc.get), min(dc.values())))
print()


# Output

#DEGREE CENTRALITY
#=================
#The mean degree centrality is 8.318504610137836e-07, with stdev 8.382745147113584e-05
#The maximum node is http://dbpedia.org/ontology/CareerStation, with value 0.19711446779045175
#The minimum node is http://dbpedia.org/resource/Royal_College_of_Dentists_of_Canada, with value 1.213101692373909e-07

DEGREE CENTRALITY
The mean degree centrality is 8.318504610137952e-07, with stdev 8.382745147113584e-05
The maximum node is http://dbpedia.org/ontology/CareerStation, with value 0.19711446779045175
The minimum node is http://dbpedia.org/resource/Presbyterian_Church_in_the_United_States, with value 1.213101692373909e-07

CPU times: user 42.6 s, sys: 3.98 s, total: 46.6 s
Wall time: 46.6 s


In [None]:
# Compute NetworkX connected components
# All components are materialised as a list so that they can be counted and
# then reported one by one with their node counts.

cc = list(nx.connected_components(G))
print("CONNECTED COMPONENTS")
print("====================")
print("The graph has {} connected components".format(len(cc)))
for i,c in enumerate(cc):
    print("Connected component {} has {} nodes".format(i,len(c)))
print()


# Output

#CONNECTED COMPONENTS
#====================
#The graph has 2 connected components
#Connected component 0 has 8242958 nodes
#Connected component 1 has 375 nodes

In [8]:
%%time

# Compute NetworkX PageRank
  # takes around 5-7 min

pr = nx.pagerank(G, alpha=0.85) #default value for alpha = 0.85
degreesPR = []
for k,v in pr.items():
    degreesPR.append(v)
print("PageRank CENTRALITY")
print("======================")
print("The mean PageRank is {}, with stdev {}".format(mean(degreesPR), statistics.stdev(degreesPR)))
print("The maximum node is {}, with value {}".format(max(pr, key=pr.get), max(pr.values())))
print("The minimum node is {}, with value {}".format(min(pr, key=pr.get), min(pr.values())))
#histogram(pagerank)
print()


# Output

#PageRank CENTRALITY
#======================
#The mean PageRank is 1.2131015452175617e-07, with stdev 2.8733452386547328e-05
#The maximum node is http://dbpedia.org/ontology/CareerStation, with value 0.056819360035487425
#The minimum node is http://dbpedia.org/resource/Old_Tom_(killer_whale), with value 1.8196678183366034e-08

PageRank CENTRALITY
The mean PageRank is 1.213101545217515e-07, with stdev 2.8733452386546505e-05
The maximum node is http://dbpedia.org/ontology/CareerStation, with value 0.05681936003548549
The minimum node is http://dbpedia.org/resource/Sept_haïkaï, with value 1.8196678183366034e-08

CPU times: user 6min 15s, sys: 53.1 s, total: 7min 8s
Wall time: 7min 8s


In [13]:
# Persist the NetworkX centrality measures as a .csv-file
# One row per node (index column "resource"), one column per measure.

df_nx_gcm = pd.DataFrame({
    'degree': dc,
    #'eigenvector': eigenvector_centrality,
    'pagerank': pr,
})

df_nx_gcm.index.name = 'resource'
df_nx_gcm.to_csv('../data/gcm_computed/nx-gcm.csv') # optionally change path to your folder structure
df_nx_gcm.head()

Unnamed: 0_level_0,degree,pagerank
resource,Unnamed: 1_level_1,Unnamed: 2_level_1
http://dbpedia.org/resource/2021–22_A.S._Roma_season__Matías_Viña__1,3.639305e-07,2.392686e-08
http://dbpedia.org/resource/Defender_(association_football),0.007996645,0.0007201337
http://dbpedia.org/resource/Mick_Lawlor_(footballer),3.154064e-06,4.681092e-07
http://dbpedia.org/resource/League_of_Ireland_XI,4.585524e-05,7.535209e-06
http://dbpedia.org/resource/Maurice_Banide,1.940963e-06,2.318505e-07


In [14]:
# Column dtypes, per-column memory footprint and total element count of the
# NetworkX result frame, emitted one item per line.
print(
    "Data types:",
    df_nx_gcm.dtypes,
    '----------------------',
    "Memory usage:",
    df_nx_gcm.memory_usage(deep=True),
    '----------------------',
    df_nx_gcm.size,
    sep="\n",
)

Data types:
degree      float64
pagerank    float64
dtype: object
----------------------
Memory usage:
Index       1201448499
degree        65946664
pagerank      65946664
dtype: int64
----------------------
16486666


In [15]:
# Shape of the NetworkX result frame, followed by kurtosis and skewness of
# every centrality column. The columns are handled in a loop rather than with
# copy-pasted statements; the printed text is unchanged.
print("Dataframe shape is {}".format(df_nx_gcm.shape))

for measure_label, measure_column in (
    ("Degree Centrality", "degree"),
    ("PageRank", "pagerank"),
):
    print("======================")
    print("%s Kurtosis: %s" % (measure_label, stats.kurtosis(df_nx_gcm[measure_column])))
    print("%s Skewness: %s" % (measure_label, stats.skew(df_nx_gcm[measure_column])))

Dataframe shape is (8243333, 2)
Degree Centrality Kurtosis: 3845700.69263639
Degree Centrality Skewness: 1758.647695035311
PageRank Kurtosis: 2944428.0604075748
PageRank Skewness: 1628.0911779681292


### Network Analysis iGraph

In [16]:
# Convert the NetworkX graph to an iGraph graph for faster computation of the GCMs
# takes around 5-10 min to transform

#https://igraph.org/python/tutorial/latest/generation.html#from-external-libraries
iGr = iGraph.from_networkx(G)
#iGr = iGraph._construct_graph_from_networkx(G, vertex_attr_hashable='_nx_name')


# Report node and edge counts as a quick check of the conversion
print(
    "iGraph loaded successfully with {} nodes ".format(iGr.vcount())
    + "and {} edges".format(iGr.ecount())
)

# Output: iGraph loaded successfully with 8243333 nodes and 28263172 edges

iGraph loaded successfully with 8243333 nodes and 28263172 edges


In [17]:
# Textual summary of the iGraph graph (counts and attribute names)
iGr.summary()

# Output: 'IGRAPH U-W- 8243333 28263172 -- \n+ attr: _nx_name (v), triples (e), weight (e)'

'IGRAPH U-W- 8243333 28263172 -- \n+ attr: _nx_name (v), triples (e), weight (e)'

In [18]:
# Connectivity and directedness of the iGraph graph
print("Graph is connected: ", iGr.is_connected())
print("Graph is directed: ", iGr.is_directed())

Graph is connected:  False
Graph is directed:  False


In [26]:
%%time

# Compute iGraph Degree Centrality
  # takes less than 1 min

degree_centrality = iGraph.degree(iGr)

CPU times: user 128 ms, sys: 16 ms, total: 144 ms
Wall time: 144 ms


In [24]:
%%time

# Compute iGraph Eigenvector Centrality
  # takes around 1 min

eigenvector_centrality = iGraph.eigenvector_centrality(iGr)

In [25]:
%%time

# Compute iGraph PageRank
  # takes less than 2 min

pagerank = iGraph.pagerank(iGr)

In [None]:
# Write the computed iGraph GCMs to a .csv-file
  # takes around 1 min
# One row per vertex: resource name, degree, eigenvector centrality, PageRank.
# The three result lists are in vertex-index order, as is iGr.vs['_nx_name'].

# newline='' is required by the csv module (otherwise extra "\r" characters are
# written on platforms that translate line endings); encoding='utf-8' is set
# explicitly because resource names contain non-ASCII characters and the
# platform default encoding may not be able to represent them.
with open('../data/gcm_computed/ig-gcm.csv', mode='w', newline='', encoding='utf-8') as gcm_export: # optionally change path your folder structure
    writer = csv.writer(gcm_export, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['resource', 'degree','eigenvector', 'pagerank'])
    # Fetch the name attribute for all vertices at once instead of doing one
    # attribute lookup per vertex.
    writer.writerows(zip(iGr.vs['_nx_name'], degree_centrality, eigenvector_centrality, pagerank))

In [28]:
# Read the exported iGraph measures back from disk and preview the first rows
df_ig_gcm = pd.read_csv("../data/gcm_computed/ig-gcm.csv") # optionally change path to correct file
df_ig_gcm.head()

Unnamed: 0,resource,degree,eigenvector,pagerank
0,http://dbpedia.org/resource/Billy_Coggins__Car...,3,0.0007845722,5.936257e-08
1,http://dbpedia.org/ontology/CareerStation,1624880,1.0,0.02791082
2,http://dbpedia.org/resource/News_Patrol__A2Z_N...,9,7.198654e-11,1.150764e-07
3,http://dbpedia.org/resource/ZOE_Broadcasting_N...,20,9.183437e-10,2.792414e-07
4,http://dbpedia.org/resource/Yu_Yongfu,6,2.790954e-09,1.047756e-07


In [29]:
# Column dtypes, per-column memory footprint and total element count of the
# iGraph result frame
print("Data types:")
print(df_ig_gcm.dtypes)
print('----------------------')
print("Memory usage:")
print(df_ig_gcm.memory_usage(deep=True))
print('----------------------')
print(df_ig_gcm.size)

# Shape, followed by kurtosis and skewness of every centrality column. The
# columns are handled in a loop rather than with copy-pasted statements; the
# printed text is unchanged.
print("Dataframe shape is {}".format(df_ig_gcm.shape))

for measure_label, measure_column in (
    ("Degree Centrality", "degree"),
    ("Eigenvector Centrality", "eigenvector"),
    ("PageRank", "pagerank"),
):
    print("======================")
    print("%s Kurtosis: %s" % (measure_label, stats.kurtosis(df_ig_gcm[measure_column])))
    print("%s Skewness: %s" % (measure_label, stats.skew(df_ig_gcm[measure_column])))


# Output

#Data types:
#resource        object
#degree           int64
#eigenvector    float64
#pagerank       float64
#dtype: object
#----------------------
#Memory usage:
#Index                128
#resource       964958363
#degree          65946664
#eigenvector     65946664
#pagerank        65946664
#dtype: int64
#----------------------
#32973332
#Dataframe shape is (8243333, 4)
#======================
#Degree Centrality Kurtosis: 3845700.692636409
#Degree Centrality Skewness: 1758.6476950353158
#======================
#Eigenvector Centrality Kurtosis: 2535495.292056733
#Eigenvector Centrality Skewness: 1186.2822293812544
#======================
#PageRank Kurtosis: 2865512.8079392132
#PageRank Skewness: 1578.3902507400448

Data types:
resource        object
degree           int64
eigenvector    float64
pagerank       float64
dtype: object
----------------------
Memory usage:
Index                128
resource       964958363
degree          65946664
eigenvector     65946664
pagerank        65946664
dtype: int64
----------------------
32973332
Dataframe shape is (8243333, 4)
Degree Centrality Kurtosis: 3845700.692636409
Degree Centrality Skewness: 1758.6476950353158
Eigenvector Centrality Kurtosis: 2535495.292056733
Eigenvector Centrality Skewness: 1186.2822293812544
PageRank Kurtosis: 2865512.8079392132
PageRank Skewness: 1578.3902507400448
