In [2]:
from rdflib import Graph
from rdflib import RDF
from rdflib import URIRef
from rdflib import Literal
import networkx as nx
import matplotlib.pyplot as plt
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from queries import QUERY_AI, QUERY_FILM, QUERY_PERSON
from paths_to_examples import REPO_BOOKS, REPO_PEOPLE, WIKIDATA, GPT_EX, GPT_EX2, GPT_SUBCLASSES, EP_DG_1_TTL, EP_DG_1_JSON

DEBUG = True  # for debug prints
PRINTGRAPH = False # for showing the networkX graph
LOCAL = True # for choosing local ttl file path or SPARQL endpoint 
DEC_PLACES = 2 # for rounding float values to x decimal places

def debug_print(*args, **kwargs):
    if DEBUG:
        print(*args, **kwargs)


# printing all elements of a set
def print_set(set):
    if DEBUG:
        for i in set:
            print(str(i))

def show_graph(G):
    if PRINTGRAPH:
        # calculating positions for nodes
        pos = nx.spring_layout(G, k=0.5, iterations=50)

        # printing nodes and edges
        plt.figure(figsize=(12, 8))
        nx.draw(G, pos, with_labels=True, node_color="lightblue", node_size=1500, font_size=10, font_weight="bold", arrows=True)

        # printing labels for the edges
        edge_labels = nx.get_edge_attributes(G, 'label')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')

        plt.title("RDF Graph")
        plt.axis("off")
        plt.show()

        print("FINISHED PRINTING GRAPH\n\n")        

# Calculating all paths from one root node (start node)
def find_all_paths(G, start_node):
    # helper function for DFS
    def dfs(node, path):
        global num_paths
        global abs_depth
        global max_depth

        # skip node if it already is in path (avoiding cycles)
        if node in path:
            return
        
        # add current node to path
        path.append(node)
        
        neighbors = list(G.neighbors(node))
        # if node does not have any neighbors left, there is no other path left
        if not neighbors:
            paths.append(list(path))  # Store current path
            num_paths += 1
            abs_depth += (len(path) - 1) # cardinality of a path = number of EDGES in path (=> -1)
            if (len(path)-1) > max_depth:
                max_depth = len(path) - 1
        
        # recursively extend the path for each neighbor
        for neighbor in neighbors:
            dfs(neighbor, path)
        
        # remove node from the path to find next path
        path.pop()

    # list which stores all paths
    paths = []
    
    # starts DFS with start_node
    dfs(start_node, [])
    
    return paths

In [1]:
# path to ttl file
# can also be an online graph source
ttl_file_path = EP_DG_1_TTL
g = Graph()

print("STARTING PARSING")

# if it fails -> print some error
try:
    # TODO: He said you can just drop the format parameter, then it automatically recognizes the format
    g.parse(ttl_file_path)
except Exception as e:
    print(f"ERROR WHILE PARSING GRAPH: {e}")    

print("Number of triples: " + str(len(g)))    
print("FINISHED PARSING\n")

#for s, p, o in g:
 #   print(s, p, o)

print("STARTING GETTING ROOT NODES AND LITERALS")

all_nodes = set(g.subjects())

# printing all subject nodes
# print_set(nodes)

# all objects
object_nodes = set(g.objects())

# all literals in a graph (literals can only appear as objects)
literals = set(o for o in object_nodes if isinstance(o, Literal))

# converting items of literals to strings
literals = {str(item) for item in literals}

# getting possible root nodes
root_nodes = all_nodes - object_nodes

debug_print("Root Nodes:")
print_set(root_nodes)
#debug_print("\n")

print("FINISHED GETTING ROOT NODES AND LITERALS\n")

NameError: name 'EP_DG_1_TTL' is not defined

In [16]:
import sys
# path to ttl file
# can also be an online graph source
ttl_file_path = EP_DG_1_TTL
g = Graph()

print("STARTING PARSING")

# if it fails -> print some error
try:
    # TODO: He said you can just drop the format parameter, then it automatically recognizes the format
    g.parse(ttl_file_path)
except Exception as e:
    print("ERROR WHILE PARSING GRAPH: {e}")    

print("Number of triples: " + str(len(g)))    
print("FINISHED PARSING\n")

#for s, p, o in g:
 #   print(s, p, o)

print("STARTING GETTING ROOT NODES AND LITERALS")

all_nodes = set(g.subjects())

# printing all subject nodes
# print_set(nodes)

# all objects
object_nodes = set(g.objects())

# all literals in a graph (literals can only appear as objects)
literals = set(o for o in object_nodes if isinstance(o, Literal))

for lit in literals:
    datatype = lit.datatype
    lang = lit.language
    print(str(lit) + "-" + str(datatype) + "-" + str(lang))

# converting items of literals to strings
literals = {str(item) for item in literals}


# getting possible root nodes
root_nodes = all_nodes - object_nodes

debug_print("Root Nodes:")
print_set(root_nodes)
#debug_print("\n")

print("FINISHED GETTING ROOT NODES AND LITERALS\n")

print("STARTING CREATING GRAPH G")

# NetworkX-DiGraph for visualization
G = nx.DiGraph()

debug_print("Printing all triples:")
# add triples to Graph G
for subj, pred, obj in g:
    if isinstance(obj, Literal):
        G.add_edge(str(subj), str(obj) + "_" + str(obj.datatype) + "_" + str(obj.language), label=str(pred))
        #debug_print("(" , str(subj), ",", str(pred), ",",  str(obj) + "_" + str(obj.datatype) + "_" + str(obj.language), ")")
    else:
        G.add_edge(str(subj), str(obj), label=str(pred))
    #debug_print("(" , str(subj), ",", str(pred), ",", str(obj), ")")
    
    debug_print("(" , str(subj), ",", str(pred), ",", str(obj), ")")

print("FINISHED CREATING GRAPH G\n")

show_graph(G)

num_paths = 0
abs_depth = 0
max_depth = 0

print("STARTING CALCULATING PATHS")

# list of paths for all root nodes
all_paths = {}

# calculating all paths from root nodes
for root in list(root_nodes):
    str_root = str(root)
    debug_print("Starting searching for paths with root node: " + str_root)
    all_paths[str_root] = find_all_paths(G, str_root)

print("FINISHED CALCULATING PATHS\n")

#sys.exit()

# Output root node + found paths
# print("FOUND PATHS:")
# for root_node, paths in all_paths.items():
#     print(f"Root-Knoten: {root_node}")
#     for path in paths:
#         print(path)

# Output found paths
print("FOUND PATHS:")
for root_node, paths in all_paths.items():
    for path in paths:
        print(path)
        print("Path length = " + str(len(path)-1))
        

#num_paths = len(all_paths)

print("RESULTS:")

print("-Number of Paths: " + str(num_paths))
print("-Absolute depth: " + str(abs_depth))

avg_depth = abs_depth / num_paths

print("-Average depth: " + str(avg_depth))
print("-Maximal depth: " + str(max_depth))

STARTING PARSING
Number of triples: 179
FINISHED PARSING

STARTING GETTING ROOT NODES AND LITERALS
Bebauungsplan-None-de
Υπηρεσία έμπνευσης για το σχέδιο ανάπτυξης της υπηρεσίας XPlanning για τις γραμμές κατασκευής σχεδίου Rindelbach Rattstadt Am Ortsweg No. 5 (XPlanGML 5.0.1) (INSPIRE GML)-None-el-t-de-t0-mtec
download-None-nl-t-en-t0-mtec
Služba INSPIRE za storitev razvojnega načrta (XPlanGML 5.0.1) za projektne linije Rindelbach Rattstadt Am Ortsweg št. 5 mesta Ellwangen (Jagst) (INSPIRE GML)-None-sl-t-de-t0-mtec
Siehe referenzierte Spezifikation-None-de
Služba Inspirace pro plán rozvoje (XPlanGML 5.0.1) pro plánovací linky Rindelbach Rattstadt Am Ortsweg č. 5 města Ellwangen (Jagst) (INSPIRE GML)-None-cs-t-de-t0-mtec
2023-02-25-http://www.w3.org/2001/XMLSchema#date-None
Descarcă-None-ro-t-en-t0-mtec
Bebauungspläne-None-de
Inspire Service for the Development Plan Service (XPlanGML 5.0.1) για τις γραμμές κατασκευής σχεδίων Rindelbach Rattstadt Am Ortsweg No. 5 της πόλης Ellwangen (Ja

In [3]:
# Tangledness 
# source 73 - page 4
# tangledness = mean number of classes with more than 1 direct ancestor, so two primitive 
# measurements (number of classes and number of direct ancestors) are used for computing the metric 

# Select number of classes in graph 
query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT (COUNT(DISTINCT ?class) AS ?num_classes)
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}"""

res = g.query(query)
for row in res:
    print("Number of classes in graph: " + str(row['num_classes']))
    num_classes = int(row['num_classes'])

# Select number of classes with more than one ingoing isA arc (Dr. Jovanovik said I should use is-a)
query_var1 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(?class) AS ?tangledCount)
WHERE
{
  SELECT ?class (COUNT(?s) AS ?numIngoing)
  WHERE {
    { ?s rdf:type ?class . }
  }
  GROUP BY ?class
  HAVING (COUNT(?s) > 1) # problem here was: i used ?numIngoing instead of COUNT(?s)
}
"""

# Select number of classes with more than one superclass (source says I should use this query)
query_var2 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(?class) AS ?tangledCount)
WHERE {
  SELECT ?class (COUNT(?super) AS ?numSupers)
  WHERE {
    ?class rdfs:subClassOf ?super .
  }
  GROUP BY ?class
  HAVING (COUNT(?super) > 1)
}
"""
res = g.query(query_var2)

t = 0

for row in res:
  print("Number of classes with more than one superclass: " + str(row["tangledCount"]) )
  t = int(row["tangledCount"])
  #print("Class " + str(row["class"]) + ": " + str(row["numIngoing"]))

#print("t: " + str(t))

print("\nRESULT:")

if t > 0:
  # source 37 says num_classes / t
  # source 73 says denominator and numerator should be switched -> t / num_classes
  ont_tangledness = round(num_classes / t, DEC_PLACES) 
else:
  ont_tangledness = 0.0

print("-Ontology Tangledness: " + str(ont_tangledness))


Number of classes in graph: 8
Number of classes with more than one superclass: 0

RESULT:
-Ontology Tangledness: 0.0


In [5]:
# Degree Distribution (Formula in: source 37, page 7)
# nG...number of nodes in gaph
# nE...number of edges in graph

print("STARTING CALCULATING DEGREE DISTRIBUTION")

# Calculating nE
query_nE = """
SELECT (COUNT(*) AS ?tripleCount)
WHERE {
  ?s ?p ?o .
}
"""

res = g.query(query_nE)
for row in res:
    print("Number of edges in graph: " + str(row['tripleCount']))
    nE = int(row['tripleCount'])

# Calculating nG
query_nG = """
SELECT (COUNT(DISTINCT ?node) AS ?nodeCount)
WHERE {
  {
    SELECT ?node WHERE {
      { ?node ?p1 ?o }       
      UNION
      { ?s ?p2 ?node }       
    }
  }
}
"""

res = g.query(query_nG)

for row in res:
    print("Number of nodes in graph: " + str(row['nodeCount']))
    nG = int(row['nodeCount'])

# Calculating degree for every node in graph
query_degrees = """
SELECT ?node (COUNT(?any) AS ?degree)
WHERE {
  {
    { ?node ?p1 ?any }    
    UNION
    { ?any ?p2 ?node }    
  }
}
GROUP BY ?node
"""

# sum_of_degress = 0
degrees = []

res = g.query(query_degrees)

for row in res:
    #sum_of_degress += int(row['degree'])
    node = row["node"]
    degree = int(row["degree"])
    degrees.append((node, degree))
    debug_print(str(node) + " - Degree: " + str(degree))

# Summe aller Degrees
sum_of_degrees = sum(d for _, d in degrees)

print("\nSum of Degrees: " + str(sum_of_degrees))

if nG > 1:
  mean_degree = (2 * nE) / nG
  squared_diffs = [(deg_v - mean_degree) ** 2 for _,deg_v in degrees]
  degree_distribution = round(sum(squared_diffs) / (nG-1), 2)
else:
  degree_distribution = 0.0 

print("FINISHED CALCULATING DEGREE DISTRIBUTION")

print("\nRESULT:")
print("-Degree Distribution: " + str(degree_distribution))

STARTING CALCULATING DEGREE DISTRIBUTION
Number of edges in graph: 38
Number of nodes in graph: 30
http://example.org/ontology#manages - Degree: 4
http://example.org/ontology#Ontology - Degree: 1
http://example.org/ontology#johnDoe - Degree: 4
http://example.org/ontology#deptIT - Degree: 5
http://example.org/ontology#hasName - Degree: 4
http://example.org/ontology#hasSalary - Degree: 4
http://example.org/ontology#Manager - Degree: 5
http://example.org/ontology#janeSmith - Degree: 5
http://example.org/ontology#Employee - Degree: 7
http://example.org/ontology#worksIn - Degree: 4
http://example.org/ontology#Person - Degree: 4
http://example.org/ontology#Department - Degree: 5
manages department - Degree: 1
http://www.w3.org/2002/07/owl#Ontology - Degree: 1
45000 - Degree: 1
IT Department - Degree: 1
http://www.w3.org/2002/07/owl#DatatypeProperty - Degree: 2
Manager - Degree: 1
http://www.w3.org/2000/01/rdf-schema#Literal - Degree: 2
Employee - Degree: 1
works in - Degree: 1
http://www.w3.

In [7]:
print("STARTING CALCULATING ENTITIES/CLASSES/PROPERTIES/INSTANCES/OBJECT PROPERTIES")

# TODO: Entities
query_entitities = """
SELECT (COUNT(DISTINCT ?entity) AS ?entityCount)
WHERE {
  {
    SELECT DISTINCT ?entity WHERE {
      ?entity ?p ?o .
    }
  }
  UNION
  {
    SELECT DISTINCT ?entity WHERE {
      ?s ?p ?entity .
      FILTER(!isLiteral(?entity))
    }
  }
}
"""

res = g.query(query_entitities)

num_entities = 0

for row in res:
  num_entities = int(row["entityCount"])


# Number of instances per type
query_inst = """ 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT ?type (COUNT(?s) AS ?count)
WHERE {
  ?s rdf:type ?type . 
}
GROUP BY ?type """

res = g.query(query_inst)

num_instances = 0

for row in res:
  rdf_type = row["type"]
  count = int(row["count"])
  debug_print("Number of instancees of " + str(rdf_type) + ": " + str(count))
  num_instances += count

# Number of classes
# Defintion of Class: 
# source: 213, page: 5 - source: 250, page: 3
# TNOC (total number of classes/concepts) = classes, subclasses, superclasses, anonymous classes
# anonymous classes = equivalent/restriction/unionOf/intersectionOf/complementOf/oneOf/hasValue classes

query_classes = """ 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT DISTINCT ?class
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}
"""
res = g.query(query_classes)

num_classes = 0

debug_print("\nExisting classes: ")

for row in res:
  num_classes += 1
  debug_print("Class " + str(num_classes) + ": " + str(row["class"]))

# number of properties in T-Box
# source says: property = explicitly defined property
query_properties_t = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(DISTINCT ?property) AS ?propertyCount)
WHERE {
  VALUES ?type { owl:ObjectProperty owl:DatatypeProperty owl:AnnotationProperty }
  ?property rdf:type ?type .
}
"""

res = g.query(query_properties_t)

num_properties_t = 0

for row in res:
  num_properties_t = int(row["propertyCount"])

# number of properties in A-Box
# source says: property = the unique ?p in ?s ?p ?o
query_properties_a = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(DISTINCT ?p) AS ?propertyCount)
WHERE {
  ?s ?p ?o .
}
"""
res = g.query(query_properties_a)

num_properties_a = 0

for row in res:
  num_properties_a = int(row["propertyCount"])

debug_print("Properties in T-Box: " + str(num_properties_t))
debug_print("Properties in A-Box: " + str(num_properties_a))

num_properties = num_properties_t + num_properties_a

# Number of object properties in T-Box
# Non-Inheritance -> excluding inheritance properties like rdfs:subPropertyOf or rdfs:subClassOf
query_object_properties_t = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (COUNT(DISTINCT ?property) AS ?numObjectProperties)
WHERE {
  ?property rdf:type owl:ObjectProperty .
}
"""

res = g.query(query_object_properties_t)

num_obj_properties_t = 0

for row in res:
  num_obj_properties_t = int(row["numObjectProperties"])

# Number of object properties in A-Box
# Non-Inheritance -> excluding inheritance properties like rdfs:subPropertyOf or rdfs:subClassOf
query_object_properties_a = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (COUNT(DISTINCT ?property) AS ?numObjectProperties)
WHERE {
  ?s ?property ?o 
  Filter(!isLiteral(?o))
}
"""

res = g.query(query_object_properties_a)

num_obj_properties_a = 0

for row in res:
  num_obj_properties_a = int(row["numObjectProperties"])

debug_print("Object Properties in T-Box: " + str(num_obj_properties_t))
debug_print("Object Properties in A-Box: " + str(num_obj_properties_a))
num_obj_properties = num_obj_properties_t + num_obj_properties_a

print("FINISHED CALCULATING ENTITIES/CLASSES/PROPERTIES/INSTANCES/OBJECT PROPERTIES")

print("\nRESULTS:")
print("-Number of entities: TODO")
print("-Number of properties: " + str(num_properties))
print("-Number of classes: " + str(num_classes))
print("-Number of instances: " + str(num_instances))
print("-Number of object properties: " + str(num_obj_properties))

STARTING CALCULATING ENTITIES/CLASSES/PROPERTIES/INSTANCES/OBJECT PROPERTIES
Number of instancees of http://www.w3.org/2002/07/owl#Ontology: 1
Number of instancees of http://www.w3.org/2002/07/owl#Class: 4
Number of instancees of http://www.w3.org/2002/07/owl#ObjectProperty: 2
Number of instancees of http://www.w3.org/2002/07/owl#DatatypeProperty: 2
Number of instancees of http://example.org/ontology#Department: 1
Number of instancees of http://example.org/ontology#Employee: 1
Number of instancees of http://example.org/ontology#Manager: 1

Existing classes: 
Class 1: http://www.w3.org/2002/07/owl#Ontology
Class 2: http://www.w3.org/2002/07/owl#Class
Class 3: http://www.w3.org/2002/07/owl#ObjectProperty
Class 4: http://www.w3.org/2002/07/owl#DatatypeProperty
Class 5: http://example.org/ontology#Department
Class 6: http://example.org/ontology#Employee
Class 7: http://example.org/ontology#Manager
Class 8: http://example.org/ontology#Person
Properties in T-Box: 4
Properties in A-Box: 9
Obj

In [None]:
# TODO: Depth of Inheritance Tree

print("STARTING CALCULATING DEPTH OF INHERITANCE TREE")

# Calculating all paths from one root node (start node) for a sparql endpoint
def find_all_paths_subclasses(endpoint_url, default_graph, start_node):
    neighbors_cache = {}  # global oder in der Funktion definiert

    # helperfunction for finding neighbors of a node (triple=(node, pred, neighbor))
    def get_neighbors(node):
        
        node_str = str(node)

        # 1. Prüfen, ob der Node bereits im Cache ist
        if node_str in neighbors_cache:
            # Debug-Ausgabe
            #print(f"Cache-Treffer für {node_str}")
            return neighbors_cache[node_str]
 
        if (len(str(node)) == 0): #or (not (node.startswith("http://") or node.startswith("https://"))):
            return [] 

        query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?child
        WHERE {{
          ?child rdfs:subClassOf <{node}> .
        }}
        """
        
        #sparql.setQuery(query)
        #sparql.setReturnFormat(JSON)
        #sparql.addDefaultGraph(default_graph)
        res = g.query(query)
       
        
        neighbors = []
        for row in res:
            next_obj = binding["next"]
            value = next_obj["value"]
            value_type = next_obj["type"]  # 'uri', 'literal', 'bnode'
        
            neighbors.append((value, value_type))

        neighbors_cache[node_str] = neighbors

        return neighbors
    
    def dfs(path, node, node_type="uri"):
        global num_paths_inh_tree
        global max_depth_inh_tree

        # skip node if it already is in path (avoiding cycles)
        if node in path:
            return
        
        # add current node to path
        path.append(node)
        
        neighbors = list(get_neighbors(node)) 

        if not neighbors:
            paths.append(list(path))
            num_paths_inh_tree += 1
            max_depth_inh_tree = max(max_depth_inh_tree, len(path) - 1)
        else:
            for neighbor, neighbor_type in neighbors:
              #print("\node: " + str(node) + " - neighbor: " + str(neighbor))
              
              dfs(path, neighbor, neighbor_type)

        # remove node from the path to find next path
        path.pop()

    sparql = SPARQLWrapper(endpoint_url)
    sparql.addDefaultGraph(default_graph)
    #sparql.setTimeout(60)  # 120 Sekunden Timeout

    # list which stores all paths
    paths = []

    # TODO: ich glaub dass node_type=uri nicht passt, weil bnodes auch vorkommen können (vllt passts aber auch mt uri weil gleich gesucht wird mit bnodes)
    dfs([], start_node, node_type="uri")

    return paths

In [11]:
print("STARTING CALCULATING PROPERTY CLASS RATIO/CLASS PROPERTY RATIO/INHERITANCE RICHNESS/ATTRIBUTE RICHNESS")

# Inheritance Richness = average number of subclasses per class (source 227 - page 9) 
# Getting number of all subclasses
query_subclasses = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(*) AS ?numInheritanceRelations)
WHERE {
  ?subclass rdfs:subClassOf ?superclass .
}
"""

num_subclasses = 0

res = g.query(query_subclasses)

for row in res:
  num_subclasses = int(row["numInheritanceRelations"])

# Getting number of datatype properties
query_datatype_properties_t = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(DISTINCT ?property) AS ?numDatatypeProperties)
WHERE {
  ?property rdf:type owl:DatatypeProperty .
}
"""

num_datatype_properties_t = 0

res = g.query(query_datatype_properties_t)

for row in res:
  num_datatype_properties_t = int(row["numDatatypeProperties"])

debug_print("Number of Datatype properties in T-Box: " + str(num_datatype_properties_t))

# Datatype Properties in A-Box
query_datatype_properties_a = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (COUNT(DISTINCT ?property) AS ?numDatatypeProperties)
WHERE {
  ?s ?property ?o 
  Filter(isLiteral(?o))
}
"""

num_datatype_properties_a = 0

res = g.query(query_datatype_properties_a)

for row in res:
  num_datatype_properties_a = int(row["numDatatypeProperties"])

debug_print("Number of Datatype properties in A-Box: " + str(num_datatype_properties_a))

num_datatype_properties = num_datatype_properties_t + num_datatype_properties_a

debug_print("Number of Datatype properties: " + str(num_datatype_properties))

if num_classes > 0:
  prop_class_ratio = round(num_properties / num_classes, DEC_PLACES) 
  #print("-Property Class Ratio: " + str(prop_class_ratio))
else:
  # source 172 - page: assumes that classes must exist for properties to exist (Number of Properties, Number of CLasses > 1)
  # I assume: no classes -> ratio = 0
  prop_class_ratio = 0.0
  #print("-Property Class Ratio: 0")

# Class Property Ratio
if num_properties > 0:
  class_prop_ratio = round(num_classes / num_properties, DEC_PLACES) 
 # print("-Class Property Ratio: " + str(class_prop_ratio))
else:
  # metric is not defined for num_properties = 0
  # I assume: no properties -> ratio = 0
  class_prop_ratio = 0
  #print("-Class Property Ratio is INF!")

if num_classes > 0:
  inheritance_richness = round(num_subclasses / num_classes, DEC_PLACES) 
  #print("-Inheritance Richness: " + str(inheritance_richness))
else:
  # metric is not defined for num_classes = 0
  inheritance_richness = 0
  #print("-Inheritance Richness is INF!")

if num_classes > 0:
  attr_richness = round(num_datatype_properties / num_classes, DEC_PLACES)
  #print("-Attribute Richness: " + str(attr_richness))
else:
  # metric is not defined for num_classes = 0
  attr_richness = 0
  #print("-Attribute Richness is INF!")

print("FINISHED CALCULATING PROPERTY CLASS RATIO/CLASS PROPERTY RATIO/INHERITANCE RICHNESS/ATTRIBUTE RICHNESS")
print("\nRESULTS:")

print("-Property Class Ratio: " + str(prop_class_ratio))
print("-Class Property Ratio: " + str(class_prop_ratio))
print("-Inheritance Richness: " + str(inheritance_richness))
print("-Attribute Richness: " + str(attr_richness))

STARTING CALCULATING PROPERTY CLASS RATIO/CLASS PROPERTY RATIO/INHERITANCE RICHNESS/ATTRIBUTE RICHNESS
Number of Datatype properties in T-Box: 2
Number of Datatype properties in A-Box: 3
Number of Datatype properties: 5
FINISHED CALCULATING PROPERTY CLASS RATIO/CLASS PROPERTY RATIO/INHERITANCE RICHNESS/ATTRIBUTE RICHNESS

RESULTS:
-Property Class Ratio: 1.62
-Class Property Ratio: 0.62
-Inheritance Richness: 0.25
-Attribute Richness: 0.62


In [14]:
print("STARTING CALCULATING AVERAGE CLASS CONNECTIVITY/AVERAGE POPULATION")

# Average Class Connectivity
# Connectivity of a class is defined as the total number of relationships instances of 
# the class have with instances of other classes (source 227 - page 10)

# looking for number of triples (c1, p, c2) or (c3, p, c1) for each class with instances c1
# c1, c2 are instances of classes 
# c1 != c2,c3
# property != rdf:type because we are not interested in the class relationships 
query_class_connectivity = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT ?class (COUNT(*) AS ?connectivity)
WHERE {
  {
    ?instance ?property ?target .

    ?instance rdf:type ?class .
    ?target rdf:type ?targetClass .

    FILTER(?property != rdf:type)
    FILTER(?class != ?targetClass)
  }
  UNION
  {
    ?instance ?property ?target .

    ?target rdf:type ?class .
    ?instance rdf:type ?targetClass .

    FILTER(?property != rdf:type)
    FILTER(?class != ?targetClass)
  }
}
GROUP BY ?class
"""

res = g.query(query_class_connectivity)

class_connectivity_list = []
sum_connectivities = 0

for row in res:
  class_name = row["class"]
  connectivity = int(row["connectivity"])
  class_connectivity_list.append((class_name, connectivity))
  sum_connectivities += connectivity

for class_name, connectivity in class_connectivity_list:
  debug_print("Connectivity of Class " + str(class_name) + ": " + str(connectivity))

if num_classes > 0:
  avg_class_connectivity = round(sum_connectivities / num_classes, DEC_PLACES)
  #print("-Average Class Connectivity: " + str(avg_class_connectivity))
else:
  # metric is not defined for num_classes = 0
  avg_class_connectivity = 0
  #print("-Average Class Connectivity is INF!")

# Average Population
if num_classes > 0:
  avg_population = round(num_instances / num_classes, DEC_PLACES)
  #print("-Average Population: " + str(avg_population))
else:
  # metric is not defined for num_classes = 0
  avg_population = 0
  #print("-Average Population is INF!")

print("FINISHED CALCULATING AVERAGE CLASS CONNECTIVITY/AVERAGE POPULATION")

print("\nRESULTS:")
print("-Average Class Connectivity: " + str(avg_class_connectivity))
print("-Average Population: " + str(avg_population))

STARTING CALCULATING AVERAGE CLASS CONNECTIVITY/AVERAGE POPULATION
Connectivity of Class http://www.w3.org/2002/07/owl#ObjectProperty: 4
Connectivity of Class http://www.w3.org/2002/07/owl#DatatypeProperty: 2
Connectivity of Class http://example.org/ontology#Employee: 1
Connectivity of Class http://example.org/ontology#Manager: 2
Connectivity of Class http://www.w3.org/2002/07/owl#Class: 6
Connectivity of Class http://example.org/ontology#Department: 3
FINISHED CALCULATING AVERAGE CLASS CONNECTIVITY/AVERAGE POPULATION

RESULTS:
-Average Class Connectivity: 2.25
-Average Population: 1.5


In [52]:
# TODO: Cohesion

In [18]:
# Average Class Connectivity
# Connectivity of a class is defined as the total number of relationships instances of 
# the class have with instances of other classes (source 227 - page 10)

# looking for number of triples (c1, p, c2) or (c3, p, c1) for each class with instances c1
# c1, c2, c3 are instances of classes 
# c1 != c2,c3
# property != rdf:type because we are not interested in the class relationships 
query_class_connectivity = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT ?class (COUNT(*) AS ?connectivity)
WHERE {
  {
    ?instance ?property ?target .

    ?instance rdf:type ?class .
    ?target rdf:type ?targetClass .

    FILTER(?property != rdf:type)
    FILTER(?class != ?targetClass)
  }
  UNION
  {
    ?instance ?property2 ?target .

    ?target rdf:type ?class .
    ?instance rdf:type ?targetClass .

    FILTER(?property != rdf:type)
    FILTER(?class2 != ?targetClass)
  }
}
GROUP BY ?class
"""

class_connectivity_list = []
sum_connectivities = 0

res = g.query(query_class_connectivity)

#for row in res:
 # print(str(row["class"]) + ": " + str(row["instance"]) + " - " + str(row["property"]) + " - " + str(row["target"]))

for row in res:
    class_name = row["class"]
    connectivity = int(row["connectivity"])
    class_connectivity_list.append((class_name, connectivity))
    sum_connectivities += connectivity


for class_name, connectivity in class_connectivity_list:
  debug_print("Connectivity of Class " + str(class_name) + ": " + str(connectivity))

if num_classes > 0:
  avg_class_connectivity = sum_connectivities / num_classes
  print("Average Class Connectivity: " + str(avg_class_connectivity))
else:
  print("Average Class Connectivity is INF!")



Connectivity of Class http://www.w3.org/2002/07/owl#ObjectProperty: 4
Connectivity of Class http://www.w3.org/2002/07/owl#DatatypeProperty: 2
Connectivity of Class http://example.org/ontology#Employee: 1
Connectivity of Class http://example.org/ontology#Manager: 2
Average Class Connectivity: 1.125
