In [1]:
# Take dbpedia data and compute PageRank

#https://downloads.dbpedia.org/repo/dbpedia/mappings/


# Instance types
# Version: 2021.12.01/ 24-Jan-2022 00:21
# Specific: 24-Jan-2022 00:20            44905249
# Transitive: 24-Jan-2022 00:20           145896226

# Mapping based objects
# Version: 2021.12.01/ 21-Apr-2022 23:33
# 24-Jan-2022 00:34           184053203



# Category data
# https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2021-12
# Version: 2021.12.01

# Hypothesis: If I take the top 10 by PageRank within a category (e.g.: European capitals)
# the result are the categories prototypes. -> calculate Precision/Recall against the goldstandard (survey) 
# data to verify the prototypes detection

# Installing packages
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install networkx
!pip install rdflib
!pip install mlnotify

# Importing packages
import pandas as pd
#from SPARQLWrapper import SPARQLWrapper, JSON, N3
#import lxml
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
import networkx as nx
from networkx import Graph as NXGraph
from rdflib import Graph as RDFGraph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
import statistics
import collections

#import danker # https://github.com/athalhammer/danker
from scipy import stats
from urllib import parse

import mlnotify 
# https://github.com/aporia-ai/mlnotify
# Use %%notify at beginning of cell

import os.path, time
# Use %%time at beginning of cell

print('------------')
!python --version
print("Pandas " + pd.__version__)
print("Numpy " + np.__version__)
#print("Seaborn " + sns.__version__)
print("Networkx " + nx.__version__)
print('------------')
print("All packages loaded and ready to roll :-)")

------------
Python 3.9.7
Pandas 1.4.2
Numpy 1.22.3
Seaborn 0.11.2
Networkx 2.8
------------
All packages loaded and ready to roll :-)


In [2]:
%%time
#%%notify

# RDF graph loading for "mapping based objects x instance types"

#os.chdir("../data")

path1 = "keseitz/data/mappingbased-objects_instance-types.ttl"
g1 = RDFGraph()
g1.parse(path1, format='turtle')
print("RDFLib Graph loaded successfully with {} triples".format(len(g1)))

print()
print("TIME")
print("============")

rdflib Graph loaded successfully with 29489019 triples

TIME
CPU times: user 1h 9min 6s, sys: 2min 26s, total: 1h 11min 33s
Wall time: 1h 11min 33s


In [None]:
### Subgraph construction (optional)
entity = input("http://dbpedia.org/ontology/Person")
relation = input("http://dbpedia.org/ontology/starring")

# TODO: Use entity and relation as parameters of a CONSTRUCT query
query = """
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX schema: <http://schema.org/>

CONSTRUCT {{ ?u a {} . ?u {} ?v }} WHERE {{ ?u a {} . ?u {} ?v }}""".format(entity, relation, entity, relation)
# print(query)
subg = g1.query(query)

#gm = subg

In [None]:
query = """

CONSTRUCT {

?s ?p ?o.
?o ?op ?oo.
?oo ?oop ?ooo.
?ooo ?ooop ?oooo.
?oooo ?oooop ?ooooo.
}
WHERE {
BIND(city:_London as ?s)
?s ?p ?o.
OPTIONAL {
   ?o ?op ?oo.
   OPTIONAL {
      ?oo ?oop ?ooo.
       OPTIONAL {
          ?ooo ?ooop ?oooo.
          OPTIONAL {
              ?oooo ?oooop ?ooooo.
           }
        }
     }
  }
}
"""

subg = g1.query(query)

In [3]:
%%time
#%%notify

# Transform rdflib graph into a networkx graph
#G = rdflib_to_networkx_graph(gConcat)
G = rdflib_to_networkx_graph(g1)
print("NetworkX Graph loaded successfully with length {}".format(len(G)))
#print(nx.info(G))

print()
print("TIME")
print("============")

networkx Graph loaded successfully with length 8243333

TIME
CPU times: user 13min 4s, sys: 2min 9s, total: 15min 14s
Wall time: 15min 14s


In [5]:
def mean(numbers):
    return float(sum(numbers)) / max(len(numbers), 1)

def number_of_pendants(g):
    """
    Equals the number of nodes with degree 1
    """
    pendants = 0
    for u in g:
        if g.degree[u] == 1:
            pendants += 1
    return pendants


def histogram(l):
    degree_sequence = sorted([d for n, d in list(l.items())], reverse=True)
    degreeCount = collections.Counter(degree_sequence)
    deg, cnt = zip(*degreeCount.items())
    print(deg, cnt)
    
    fig, ax = plt.subplots()
    plt.bar(deg, cnt, width=0.80, color='b')

    plt.title("Histogram")
    plt.ylabel("Count")
    plt.xlabel("Value")
    ax.set_xticks([d + 0.4 for d in deg])
    ax.set_xticklabels(deg)

    plt.show()

In [6]:
%%time
# Analysis

# Network size
print("NETWORK SIZE")
print("============")
print("The network has {} nodes and {} edges".format(G.number_of_nodes(), G.number_of_edges()))
print()

# Network size
# = Terminal nodes that end the graph
print("PENDANTS")
print("============")
print("The network has {} pendants".format(number_of_pendants(G)))
print()

# Density
print("DENSITY")
print("============")
print("The network density is {}".format(nx.density(G)))
print()

NETWORK SIZE
The network has 8243333 nodes and 28263172 edges

PENDANTS
The network has 1693880 pendants

DENSITY
The network density is 8.318504610951633e-07

CPU times: user 43.7 s, sys: 4.86 s, total: 48.5 s
Wall time: 48.5 s


In [None]:
%%time
#%%notify

# Degree centrality -- mean and stdev
dc = nx.degree_centrality(G)
degrees = []
for k,v in dc.items():
    degrees.append(v)
    
print("DEGREE CENTRALITY")
print("=================")
print("The mean degree centrality is {}, with stdev {}".format(mean(degrees), statistics.stdev(degrees)))
print("The maximum node is {}, with value {}".format(max(dc, key=dc.get), max(dc.values())))
print("The minimum node is {}, with value {}".format(min(dc, key=dc.get), min(dc.values())))
histogram(dc)
print()

In [None]:
# Eigenvector centrality -- mean and stdev
ec = nx.eigenvector_centrality_numpy(G)
degrees = []
for k,v in ec.items():
    degrees.append(v)
    
print("EIGENVECTOR CENTRALIY")
print("======================")
print("The mean network eigenvector centrality is {}, with stdev {}".format(mean(degrees), statistics.stdev(degrees)))
print("The maximum node is {}, with value {}".format(max(ec, key=ec.get), max(ec.values())))
print("The minimum node is {}, with value {}".format(min(ec, key=ec.get), min(ec.values())))
histogram(ec)
print()

In [None]:
# Betweenness centrality -- mean and stdev
bc = nx.betweenness_centrality(G)
degrees = []
for k,v in bc.items():
    degrees.append(v)
print("BETWEENNESS CENTRALITY")
print("======================")
print("The mean betwenness centrality is {}, with stdev {}".format(mean(degrees), statistics.stdev(degrees)))
print("The maximum node is {}, with value {}".format(max(bc, key=bc.get), max(bc.values())))
print("The minimum node is {}, with value {}".format(min(bc, key=bc.get), min(bc.values())))
histogram(bc)
print()

In [None]:
# Connected components
cc = list(nx.connected_components(G))
print("CONNECTED COMPONENTS")
print("====================")
print("The graph has {} connected components".format(len(cc)))
for i,c in enumerate(cc):
    print("Connected component {} has {} nodes".format(i,len(c)))
print()

In [None]:
# Clusters
cl = nx.clustering(G)
print("CLUSTERS")
print("========")
print("The graph has {} clusters".format(len(cl)))
for i,c in enumerate(cl):
    print("Cluster {} has {} nodes".format(i,len(c)))
print()

In [5]:
# Plot
#print("Visualizing the graph:")
#plt.plot()
#nx.draw(G, with_labels=True, font_weight='bold')

In [7]:
%%time
# Compute PageRank for "mapping based objects x instance types"

prG = nx.pagerank(G)

CPU times: user 6min 56s, sys: 1min 10s, total: 8min 7s
Wall time: 8min 7s


In [15]:
#nx.write_edgelist(G,'data/PageRank/PageRanksV1.csv')
# filesize is around 9.8GB
prG

{rdflib.term.URIRef('http://dbpedia.org/resource/Klipheuvel'): 7.80575502826517e-08,
 rdflib.term.URIRef('http://dbpedia.org/resource/Lepelle-Nkumpi_Local_Municipality'): 1.4979715470394886e-07,
 rdflib.term.URIRef('http://dbpedia.org/resource/Battle_of_Châteaudun'): 1.5008148872809573e-07,
 rdflib.term.URIRef('http://dbpedia.org/resource/Châteaudun'): 4.688362445997308e-07,
 rdflib.term.URIRef('http://dbpedia.org/resource/Stanley_Dance'): 1.3641895359796242e-07,
 rdflib.term.URIRef('http://dbpedia.org/resource/Braintree,_Essex'): 9.322985845555261e-07,
 rdflib.term.URIRef('http://dbpedia.org/resource/Toabré,_Panama'): 2.635494297127074e-08,
 rdflib.term.URIRef('http://dbpedia.org/resource/Eastern_Time_Zone'): 0.0004551016629632455,
 rdflib.term.URIRef('http://dbpedia.org/resource/John_Ferguson_(footballer,_born_1891)'): 1.0042426283476968e-07,
 rdflib.term.URIRef('http://dbpedia.org/resource/Flers,_Somme'): 5.563272873975838e-08,
 rdflib.term.URIRef('http://dbpedia.org/resource/Mother

In [53]:
%%time

df_pr = pd.DataFrame(list(prG.items()),columns = ['resource','pagerank'])
ids = range(1, len(df_pr) + 1)
idx = 0
df_pr.insert(loc=idx, column='resource_id', value=ids)

df_pr.head()

CPU times: user 10.2 s, sys: 1.14 s, total: 11.3 s
Wall time: 11.2 s


Unnamed: 0,resource_id,resource,pagerank
0,1,http://dbpedia.org/resource/Klipheuvel,7.805755e-08
1,2,http://dbpedia.org/resource/Lepelle-Nkumpi_Loc...,1.497972e-07
2,3,http://dbpedia.org/resource/Battle_of_Châteaudun,1.500815e-07
3,4,http://dbpedia.org/resource/Châteaudun,4.688362e-07
4,5,http://dbpedia.org/resource/Stanley_Dance,1.36419e-07


In [33]:
print("Dataframe shape is {}".format(df_pr.shape))
print("Kurtosis: %s" % stats.kurtosis(df_pr['pagerank']))
print("Skewness: %s" % stats.skew(df_pr['pagerank']))

Dataframe shape is (8243333, 2)
Kurtosis: 2944428.060407545
Skewness: 1628.0911779681285


In [50]:
url = "http://www.w3.org/2002/07/owl#Thing"
parse.urlsplit(url).path.split('/', 1)[-1]

'2002/07/owl'

In [54]:
%%time

def node_path (row):
    return parse.urlsplit(row['resource']).path.split('/', 1)[-1]

#def node_subfolder (row):
#    return parse.urlsplit(row['resource']).path.split('/')[1]

df_pr_nl = df_pr.copy()

df_pr_nl['resource'] = df_pr_nl.apply(lambda row: node_path(row), axis=1)
#df_pr_nl['resource'] = df_pr_nl.apply (lambda row: node_subfolder(row), axis=1)

df_pr_nl.head()

CPU times: user 6min 52s, sys: 21 s, total: 7min 13s
Wall time: 7min 14s


Unnamed: 0,resource_id,resource,pagerank
0,1,resource/Klipheuvel,7.805755e-08
1,2,resource/Lepelle-Nkumpi_Local_Municipality,1.497972e-07
2,3,resource/Battle_of_Châteaudun,1.500815e-07
3,4,resource/Châteaudun,4.688362e-07
4,5,resource/Stanley_Dance,1.36419e-07


In [58]:
%%time

df_pr_nl.to_csv('data/PageRank/df_pr.csv', encoding='utf-8')#, sep='\t', encoding='utf-8')

CPU times: user 55.7 s, sys: 804 ms, total: 56.5 s
Wall time: 1min 27s
