In [None]:
from rdflib import Graph
from rdflib import RDF
from rdflib import URIRef
from rdflib import Literal
import networkx as nx
import matplotlib.pyplot as plt
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from queries import QUERY_AI, QUERY_FILM, QUERY_PERSON
from paths_to_examples import REPO_BOOKS, REPO_PEOPLE, WIKIDATA, GPT_EX, GPT_SUBCLASSES

DEBUG = True  # for debug prints
PRINTGRAPH = False # for showing the networkX graph
LOCAL = True # for choosing local ttl file path or SPARQL endpoint 

def debug_print(*args, **kwargs):
    if DEBUG:
        print(*args, **kwargs)


# printing all elements of a set
def print_set(set):
    if DEBUG:
        for i in set:
            print(str(i))

def show_graph(G):
    if PRINTGRAPH:
        # calculating positions for nodes
        pos = nx.spring_layout(G, k=0.5, iterations=50)

        # printing nodes and edges
        plt.figure(figsize=(12, 8))
        nx.draw(G, pos, with_labels=True, node_color="lightblue", node_size=1500, font_size=10, font_weight="bold", arrows=True)

        # printing labels for the edges
        edge_labels = nx.get_edge_attributes(G, 'label')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')

        plt.title("RDF Graph")
        plt.axis("off")
        plt.show()

        print("FINISHED PRINTING GRAPH\n\n")        

# Calculating all paths from one root node (start node)
def find_all_paths(G, start_node):
    # helper function for DFS
    def dfs(node, path):
        global num_paths
        global abs_depth
        global max_depth

        # skip node if it already is in path (avoiding cycles)
        if node in path:
            return
        
        # add current node to path
        path.append(node)
        
        neighbors = list(G.neighbors(node))
        # if node does not have any neighbors left, there is no other path left
        if not neighbors:
            paths.append(list(path))  # Store current path
            num_paths += 1
            abs_depth += (len(path) - 1) # cardinality of a path = number of EDGES in path (=> -1)
            if (len(path)-1) > max_depth:
                max_depth = len(path) - 1
        
        # recursively extend the path for each neighbor
        for neighbor in neighbors:
            dfs(neighbor, path)
        
        # remove node from the path to find next path
        path.pop()

    # list which stores all paths
    paths = []
    
    # starts DFS with start_node
    dfs(start_node, [])
    
    return paths

In [None]:
# path to ttl file
# can also be an online graph source
ttl_file_path = REPO_BOOKS
g = Graph()

print("STARTING PARSING")

# if it fails -> print some error
try:
    g.parse(ttl_file_path, format="turtle")
except Exception as e:
    print("ERROR WHILE PARSING GRAPH: {e}")    

print("Number of triples: " + str(len(g)))    
print("FINISHED PARSING\n")

#for s, p, o in g:
 #   print(s, p, o)

print("STARTING GETTING ROOT NODES AND LITERALS")

all_nodes = set(g.subjects())

# printing all subject nodes
# print_set(nodes)

# all objects
object_nodes = set(g.objects())

# all literals in a graph (literals can only appear as objects)
literals = set(o for o in object_nodes if isinstance(o, Literal))

# converting items of literals to strings
literals = {str(item) for item in literals}

# getting possible root nodes
root_nodes = all_nodes - object_nodes

debug_print("Root Nodes:")
print_set(root_nodes)
#debug_print("\n")

print("FINISHED GETTING ROOT NODES AND LITERALS\n")

print("STARTING CREATING GRAPH G")

# NetworkX-DiGraph for visualization
G = nx.DiGraph()

debug_print("Printing all triples:")
# add triples to GRaph G
for subj, pred, obj in g:
    G.add_edge(str(subj), str(obj), label=str(pred))
    #debug_print("(" , str(subj), ",", str(pred), ",", str(obj), ")")
    debug_print("(" , subj, ",", pred, ",", obj, ")")

print("FINISHED CREATING GRAPH G\n")

show_graph(G)

num_paths = 0
abs_depth = 0
max_depth = 0

print("STARTING CALCULATING PATHS")

# list of paths for all root nodes
all_paths = {}

# calculating all paths from root nodes
for root in list(root_nodes):
    str_root = str(root)
    debug_print("Starting searching for paths with root node: " + str_root)
    all_paths[str_root] = find_all_paths(G, str_root)

print("FINISHED CALCULATING PATHS\n")

# Output root node + found paths
# print("FOUND PATHS:")
# for root_node, paths in all_paths.items():
#     print(f"Root-Knoten: {root_node}")
#     for path in paths:
#         print(path)

# Output found paths
print("FOUND PATHS:")
for root_node, paths in all_paths.items():
    for path in paths:
        print(path)
        print("Path length = " + str(len(path)-1))
        

#num_paths = len(all_paths)

print("RESULTS:")

print("-Number of Paths: " + str(num_paths))
print("-Absolute depth: " + str(abs_depth))

avg_depth = abs_depth / num_paths

print("-Average depth: " + str(avg_depth))
print("-Maximal depth: " + str(max_depth))

STARTING PARSING
Number of triples: 952
FINISHED PARSING

STARTING GETTING ROOT NODES AND LITERALS
Root Nodes:
http://example.org/ns#Node161
http://example.org/ns#Node104
http://example.org/ns#Node143
http://example.org/ns#Node151
http://example.org/ns#Node172
http://example.org/ns#Node148
http://example.org/ns#Node183
http://example.org/ns#Node139
http://example.org/ns#Node130
http://example.org/ns#Node108
http://example.org/ns#Node146
http://example.org/ns#Node155
http://example.org/ns#Node153
http://example.org/ns#Node124
http://example.org/ns#Node167
http://example.org/ns#Node128
http://example.org/ns#Node187
http://example.org/ns#Node132
http://example.org/ns#Node100
http://example.org/ns#Node135
http://example.org/ns#Node118
http://example.org/ns#Node165
http://example.org/ns#Node180
http://example.org/ns#Node126
http://example.org/ns#Node170
http://example.org/ns#Node176
http://example.org/ns#Node121
http://example.org/ns#Node112
http://example.org/ns#Node158
http://example.org/

In [4]:
# Tangledness 
# source 73 - page 4
# tangledness = mean number of classes with more than 1 direct ancestor, so two primitive 
# measurements (number of classes and number of direct ancestors) are used for computing the metric 

# Select number of classes in graph 
query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT (COUNT(DISTINCT ?class) AS ?num_classes)
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}"""

res = g.query(query)
for row in res:
    print("Number of classes in graph: " + str(row['num_classes']))
    num_classes = int(row['num_classes'])

# Select number of classes with more than one ingoing isA arc (Dr. Jovanovik said I should use is-a)
query_var1 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(?class) AS ?tangledCount)
WHERE
{
  SELECT ?class (COUNT(?s) AS ?numIngoing)
  WHERE {
    { ?s rdf:type ?class . }
  }
  GROUP BY ?class
  HAVING (COUNT(?s) > 1) # problem here was: i used ?numIngoing instead of COUNT(?s)
}
"""

# Select number of classes with more than one superclass (source says I should use this query)
query_var2 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(?class) AS ?tangledCount)
WHERE {
  SELECT ?class (COUNT(?super) AS ?numSupers)
  WHERE {
    ?class rdfs:subClassOf ?super .
  }
  GROUP BY ?class
  HAVING (COUNT(?super) > 1)
}
"""
res = g.query(query_var2)

t = 0

for row in res:
  print("Number of classes with more than one superclass: " + str(row["tangledCount"]) )
  t = int(row["tangledCount"])
  #print("Class " + str(row["class"]) + ": " + str(row["numIngoing"]))

print("t: " + str(t))

debug_print("\nClasses with more than one superclass: ")

print("\nRESULT:")

if num_classes > 0:
  # source 37 says num_classes / t
  # source 73 says denominator and numerator should be switched -> t / num_classes
  tangledness = t / num_classes
  print("-Tangledness: " + str(tangledness))
else:
  # TODO
   print("-Tangledness is INF")


Number of classes in graph: 2
Number of classes with more than one superclass: 0
t: 0

Classes with more than one superclass: 

RESULT:
-Tangledness: 0.0


In [8]:
# Degree Distribution (Formula in: source 37, page 7)
# nG...number of nodes in gaph
# nE...number of edges in graph

print("STARTING CALCULATING DEGREE DISTRIBUTION")

# Calculating nE
query_nE = """
SELECT (COUNT(*) AS ?tripleCount)
WHERE {
  ?s ?p ?o .
}
"""

res = g.query(query_nE)
for row in res:
    print("Number of edges in graph: " + str(row['tripleCount']))
    nE = int(row['tripleCount'])

# Calculating nG
query_nG = """
SELECT (COUNT(DISTINCT ?node) AS ?nodeCount)
WHERE {
  {
    SELECT ?node WHERE {
      { ?node ?p1 ?o }       
      UNION
      { ?s ?p2 ?node }       
    }
  }
}
"""

res = g.query(query_nG)

for row in res:
    print("Number of nodes in graph: " + str(row['nodeCount']))
    nG = int(row['nodeCount'])

# Calculating degree for every node in graph
query_degrees = """
SELECT ?node (COUNT(?any) AS ?degree)
WHERE {
  {
    { ?node ?p1 ?any }    
    UNION
    { ?any ?p2 ?node }    
  }
}
GROUP BY ?node
"""

# sum_of_degress = 0
degrees = []

res = g.query(query_degrees)

for row in res:
    #sum_of_degress += int(row['degree'])
    node = row["node"]
    degree = int(row["degree"])
    degrees.append((node, degree))
    debug_print("Node: " + str(node) + " - Degree: " + str(degree))

# Summe aller Degrees
sum_of_degrees = sum(d for _, d in degrees)

print("\nSum of Degrees: " + str(sum_of_degrees))

if nG > 0:
    mean_degree = (2 * nE) / nG
    squared_diffs = [(deg_v - mean_degree) ** 2 for _,deg_v in degrees]
    degree_distribution = sum(squared_diffs) / (nG-1)
else:
    degree_distribution = 0 

print("FINISHED CALCULATING DEGREE DISTRIBUTION")

print("\nRESULT:")
print("-Degree Distribution: " + str(degree_distribution))

STARTING CALCULATING DEGREE DISTRIBUTION
Number of edges in graph: 952
Number of nodes in graph: 546
Node: http://example.org/ns#Node146 - Degree: 14
Node: http://example.org/ns#Node180 - Degree: 15
Node: http://example.org/ns#Node142 - Degree: 9
Node: http://example.org/ns#Node163 - Degree: 9
Node: http://example.org/ns#Node128 - Degree: 14
Node: http://example.org/ns#Node185 - Degree: 9
Node: http://example.org/ns#Node166 - Degree: 9
Node: http://example.org/ns#Node184 - Degree: 9
Node: http://example.org/ns#Node121 - Degree: 16
Node: http://example.org/ns#Node170 - Degree: 14
Node: http://example.org/ns#Node183 - Degree: 17
Node: http://example.org/ns#Node117 - Degree: 9
Node: http://example.org/ns#Node152 - Degree: 9
Node: http://example.org/ns#Node154 - Degree: 9
Node: http://example.org/ns#Node104 - Degree: 16
Node: http://example.org/ns#Node157 - Degree: 9
Node: http://example.org/ns#Node186 - Degree: 9
Node: http://example.org/ns#Node167 - Degree: 17
Node: http://example.org/ns

In [9]:
print("STARTING CALCULATING ENTITIES/CLASSES/PROPERTIES/...")

# TODO: Entities

# Number of instances per type
query_inst = """ SELECT ?type (COUNT(?s) AS ?count)
WHERE {
  ?s rdf:type ?type . 
}
GROUP BY ?type """

res = g.query(query_inst)

num_instances = 0

for row in res:
    rdf_type = row["type"]
    count = int(row["count"])
    debug_print("Number of instancees of" + rdf_type + ": " + str(count))
    num_instances += count

# Number of classes
# Defintion of Class: 
# source: 213, page: 5 - source: 250, page: 3
# TNOC (total number of classes/concepts) = classes, subclasses, superclasses, anonymous classes
# anonymous classes = equivalent/restriction/unionOf/intersectionOf/complementOf/oneOf/hasValue classes

query_classes = """ 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT DISTINCT ?class
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}
"""
res = g.query(query_classes)

num_classes = 0

debug_print("\nExisting classes: ")

for row in res:
  num_classes += 1
  debug_print("Class " + str(num_classes) + ": " + str(row["class"]))

# number of properties in T-Box
query_properties = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(DISTINCT ?property) AS ?NoP)
WHERE {
  VALUES ?type { owl:ObjectProperty owl:DatatypeProperty owl:AnnotationProperty }
  ?property rdf:type ?type .
}
"""

res = g.query(query_properties)

num_properties = 0

for row in res:
  num_properties = int(row["NoP"])

# Number of object properties
# Non-Inheritance -> excluding inheritance properties like rdfs:subPropertyOf or rdfs:subClassOf
query_object_properties = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (COUNT(DISTINCT ?property) AS ?numObjectProperties)
WHERE {
  ?property rdf:type owl:ObjectProperty .
}
"""

res = g.query(query_object_properties)

num_obj_properties = 0

for row in res:
  num_obj_properties = int(row["numObjectProperties"])

print("\nRESULTS:")
print("-Number of properties: " + str(num_properties))
print("-Number of classes: " + str(num_classes))
print("-Number of instances: " + str(num_instances))
print("-Number of object properties: " + str(num_obj_properties))

Number of instancees ofhttp://schema.org/Book does not look like a valid URI, trying to serialize this will break.
Number of instancees ofhttp://schema.org/Book:  does not look like a valid URI, trying to serialize this will break.
Number of instancees ofhttp://schema.org/Book: 30 does not look like a valid URI, trying to serialize this will break.
Number of instancees ofhttp://schema.org/Person does not look like a valid URI, trying to serialize this will break.
Number of instancees ofhttp://schema.org/Person:  does not look like a valid URI, trying to serialize this will break.
Number of instancees ofhttp://schema.org/Person: 59 does not look like a valid URI, trying to serialize this will break.


STARTING CALCULATING ENTITIES/CLASSES/PROPERTIES/...
Number of instancees ofhttp://schema.org/Book: 30
Number of instancees ofhttp://schema.org/Person: 59

Existing classes: 
Class 1: http://schema.org/Book
Class 2: http://schema.org/Person

RESULTS:
-Number of properties: 0
-Number of classes: 2
-Number of instances: 89
-Number of object properties: 0


In [None]:
# TODO: Depth of Inheritance Tree

In [10]:
# Property Class Ratio
if num_classes > 0:
    prop_class_ratio = num_properties / num_classes 
    print("Property Class Ratio: " + str(prop_class_ratio))
else:
    # TODO
    print("Property Class Ratio is INF!")

# Class Property Ratio
if num_properties > 0:
    class_prop_ratio = num_classes / num_properties 
    print("Class Property Ratio: " + str(class_prop_ratio))
else:
    # TODO
    print("Class Property Ratio is INF!")

# Inheritance Richness = average number of subclasses per clas (source 227 - page 9) 
query_subclasses = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(*) AS ?numInheritanceRelations)
WHERE {
  ?subclass rdfs:subClassOf ?superclass .
}
"""

num_subclasses = 0

res = g.query(query_subclasses)

for row in res:
  num_subclasses = int(row["numInheritanceRelations"])

if num_classes > 0:
    inheritance_richness = num_subclasses / num_classes 
    print("Inheritance Richness: " + str(inheritance_richness))
else:
    # TODO
    print("Inheritance Richness is INF!")

query_datatype_properties = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(DISTINCT ?property) AS ?numDatatypeProperties)
WHERE {
  ?property rdf:type owl:DatatypeProperty .
}
"""

num_datatype_properties = 0

res = g.query(query_datatype_properties)

for row in res:
  num_datatype_properties = int(row["numDatatypeProperties"])

debug_print("Number of Datatype properties: " + str(num_datatype_properties))

if num_classes > 0:
    attr_richness = num_datatype_properties / num_classes
    print("Attribute Richness: " + str(attr_richness))
else:
    print("Attribute Richness is INF!")

Property Class Ratio: 0.0
Class Property Ratio is INF!
Inheritance Richness: 0.0
Number of Datatype properties: 0
Attribute Richness: 0.0


In [11]:
# Average Connectivity
if num_classes > 0:
    avg_population = num_instances / num_classes
    print("Average Population: " + str(avg_population))
else:
    print("Average Population is INF!")


Average Population: 44.5


In [None]:
# TODO: Cohesion

In [None]:
# Average Class Connectivity
# Connectivity of a class is defined as the total number of relationships instances of 
# the class have with instances of other classes (source 227 - page 10)

# looking for number of triples (c1, p, c2) or (c3, p, c1) for each class with instances c1
# c1, c2 are instances of classes 
# c1 != c2
# property != rdf:type because we are not interested in the class relationships 
query_class_connectivity = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT ?class (COUNT(*) AS ?connectivity)
WHERE {
  {
    ?instance ?property ?target .

    ?instance rdf:type ?class .
    ?target rdf:type ?targetClass .

    FILTER(?property != rdf:type)
    FILTER(?class != ?targetClass)
  }
  UNION
  {
    ?instance ?property ?target .

    ?target rdf:type ?class .
    ?instance rdf:type ?targetClass .

    FILTER(?property != rdf:type)
    FILTER(?class != ?targetClass)
  }
}
GROUP BY ?class
"""

class_connectivity_list = []
sum_connectivities = 0

res = g.query(query_class_connectivity)

for row in res:
    class_name = row["class"]
    connectivity = int(row["connectivity"])
    class_connectivity_list.append((class_name, connectivity))
    sum_connectivities += connectivity


for class_name, connectivity in class_connectivity_list:
  debug_print("Connectivity of Class " + str(class_name) + ": " + str(connectivity))

if num_classes > 0:
  avg_class_connectivity = sum_connectivities / num_classes
  print("Average Class Connectivity: " + str(avg_class_connectivity))
else:
  print("Average Class Connectivity is INF!")



Connectivity of Class http://schema.org/Book does not look like a valid URI, trying to serialize this will break.
Connectivity of Class http://schema.org/Book:  does not look like a valid URI, trying to serialize this will break.
Connectivity of Class http://schema.org/Book: 59 does not look like a valid URI, trying to serialize this will break.
Connectivity of Class http://schema.org/Person does not look like a valid URI, trying to serialize this will break.
Connectivity of Class http://schema.org/Person:  does not look like a valid URI, trying to serialize this will break.
Connectivity of Class http://schema.org/Person: 59 does not look like a valid URI, trying to serialize this will break.


Connectivity of Class http://schema.org/Book: 59
Connectivity of Class http://schema.org/Person: 59
Average Class Connectivity: 59.0
