In [14]:
"""import psutil
import time
from IPython.display import clear_output

def monitor_resources(interval=2):
    # Shows cpu and ram usage every "interval" seconds
    # Stop with STRG+C
    
    process = psutil.Process()
    while True:
        clear_output(wait=True)
        cpu = psutil.cpu_percent(interval=0.1)
        ram = psutil.virtual_memory()
        mem_used = ram.used / (1024**3)
        mem_total = ram.total / (1024**3)
        print(f"CPU Usage: {cpu}%")
        print(f"Memory Used: {mem_used:.2f} GB / {mem_total:.2f} GB")
        print(f"Process Memory: {process.memory_info().rss / (1024**3):.2f} GB")
        time.sleep(interval)

monitor_resources()
"""

'import psutil\nimport time\nfrom IPython.display import clear_output\n\ndef monitor_resources(interval=2):\n    # Shows cpu and ram usage every "interval" seconds\n    # Stop with STRG+C\n\n    process = psutil.Process()\n    while True:\n        clear_output(wait=True)\n        cpu = psutil.cpu_percent(interval=0.1)\n        ram = psutil.virtual_memory()\n        mem_used = ram.used / (1024**3)\n        mem_total = ram.total / (1024**3)\n        print(f"CPU Usage: {cpu}%")\n        print(f"Memory Used: {mem_used:.2f} GB / {mem_total:.2f} GB")\n        print(f"Process Memory: {process.memory_info().rss / (1024**3):.2f} GB")\n        time.sleep(interval)\n\nmonitor_resources()\n'

In [3]:
from rdflib import Graph
from rdflib import RDF
from rdflib import URIRef
from rdflib import Literal
import networkx as nx
import matplotlib.pyplot as plt
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from queries import QUERY_AI, QUERY_FILM, QUERY_PERSON
from paths_to_examples import REPO_BOOKS, REPO_PEOPLE, WIKIDATA, GPT_EX, GPT_SUBCLASSES
from SPARQLWrapper import SPARQLWrapper, RDF, JSON
import random
from concurrent.futures import ThreadPoolExecutor
import sys

DEBUG = True  # master switch for the debug_print()/print_set() helpers
PRINTGRAPH = False  # set to True to show the networkX graph
LOCAL = False  # True: read a local ttl file, False: query a SPARQL endpoint

def debug_print(*args, **kwargs):
    """Forward all arguments to ``print`` while ``DEBUG`` is truthy; otherwise do nothing."""
    if not DEBUG:
        return
    print(*args, **kwargs)


# Debug helper: dump every member of a collection, one per line
def print_set(set):
    """Print each element of ``set`` on its own line while ``DEBUG`` is truthy.

    The parameter name shadows the built-in ``set``; it is kept so that
    existing keyword callers continue to work.
    """
    if not DEBUG:
        return
    for element in set:
        print(element)

def send_query(query, sparql, format):
    """Run ``query`` on the given SPARQL client and return the converted result.

    Parameters
    ----------
    query : str
        SPARQL query text handed to ``sparql.setQuery``.
    sparql : object
        Client exposing ``setQuery``, ``setReturnFormat`` and ``query``
        (the notebook passes a ``SPARQLWrapper`` instance).
    format :
        Return-format constant handed to ``sparql.setReturnFormat``
        (the notebook passes ``JSON``). The name shadows the built-in
        ``format`` but is kept for backward compatibility.

    Returns
    -------
    The value of ``sparql.query().convert()``.

    Raises
    ------
    Exception
        Any error raised while sending or converting the query is logged and
        re-raised. Previously the error was swallowed and the function then
        failed with an unrelated ``UnboundLocalError`` on ``return results``,
        hiding the real cause.
    """
    sparql.setQuery(query)
    # Switching to POST (sparql.setMethod("POST")) was tried here and left disabled.
    try:
        sparql.setReturnFormat(format)
        return sparql.query().convert()
    except Exception as e:
        print("Sending query failed:", e)
        # Propagate the original error: callers index into the result and
        # cannot do anything sensible without one.
        raise

# Calculating all paths from one root node (start node) for a sparql endpoint
def find_all_paths_endpoint(endpoint_url, default_graph, literals, start_node):
    """Collect every cycle-free outgoing path that starts at ``start_node``.

    The graph is explored depth-first by asking the endpoint for the objects
    of all triples whose subject is the current node. A path is finished when
    it reaches a literal or a node for which no neighbors were returned.

    Parameters
    ----------
    endpoint_url : str
        URL of the SPARQL endpoint to query.
    default_graph : str or None
        IRI registered as default graph on the client; skipped when falsy.
    literals : container of str
        Literal strings in the ``value``, ``value@lang`` or
        ``value^^datatype`` form; membership marks a node as a path end.
    start_node : str
        IRI of the root node.

    Returns
    -------
    list of list of str
        One list of node strings per path, each beginning with ``start_node``.

    Side effects
    ------------
    Updates the module-level counters ``num_paths``, ``abs_depth`` and
    ``max_depth``; they must be defined before this function is called.

    Notes
    -----
    The SPARQL client is now built from ``endpoint_url`` and ``default_graph``.
    The previous version ignored both parameters and silently used whatever
    notebook-level ``sparql`` object happened to exist.
    The search is recursive, so very deep graphs may hit Python's recursion
    limit.
    """
    client = SPARQLWrapper(endpoint_url)
    if default_graph:
        client.addDefaultGraph(default_graph)

    literal_types = ("literal", "typed-literal")
    known_types = literal_types + ("uri", "iri", "bnode")

    # Per-call cache: node string -> list of (neighbor value, neighbor type)
    neighbors_cache = {}

    # list which stores all paths
    paths = []

    # helper function for finding neighbors of a node (triple=(node, pred, neighbor))
    def get_neighbors(node):
        node_str = str(node)

        # Answer from the cache when this node was already queried
        if node_str in neighbors_cache:
            return neighbors_cache[node_str]

        if len(node_str) == 0:
            return []

        query = f"""
        SELECT DISTINCT ?next WHERE {{
            <{node}> ?p ?next .
        }}
        """

        results = send_query(query, client, JSON)

        neighbors = []
        for binding in results["results"]["bindings"]:
            next_obj = binding["next"]
            value = next_obj["value"]
            value_type = next_obj["type"]  # 'uri', 'literal', 'typed-literal', 'bnode'

            if value_type in literal_types:
                # Use the same textual form as the entries of `literals`
                lang_tag = next_obj.get("xml:lang")
                datatype = next_obj.get("datatype")
                if lang_tag:  # add language tag
                    value = f"{value}@{lang_tag}"
                elif datatype:  # add datatype tag
                    value = f"{value}^^{datatype}"
            elif value_type not in known_types:
                print("OTHER VALUE TYPE: " + value_type)

            neighbors.append((value, value_type))

        neighbors_cache[node_str] = neighbors
        return neighbors

    def record_path(path):
        """Store a copy of ``path`` and update the global statistics."""
        global num_paths
        global abs_depth
        global max_depth

        depth = len(path) - 1
        paths.append(list(path))
        num_paths += 1
        abs_depth += depth
        max_depth = max(max_depth, depth)

    def dfs(path, node, node_type="uri"):
        # skip node if it already is in path (avoiding cycles)
        if node in path:
            return

        # add current node to path
        path.append(node)

        # literals never have outgoing edges, so they are not queried
        is_literal = node_type in literal_types or node in literals
        neighbors = [] if is_literal else list(get_neighbors(node))

        if not neighbors:
            record_path(path)
        else:
            for neighbor, neighbor_type in neighbors:
                debug_print("\nnode: " + str(node) + " - neighbor: " + str(neighbor))
                # a literal neighbor has no neighbors of its own -> path is finished
                if neighbor in literals:
                    path.append(neighbor)
                    debug_print("Found Path: ", path)
                    record_path(path)
                    # remove the literal again to continue with the next neighbor
                    path.pop()
                else:
                    dfs(path, neighbor, neighbor_type)

        # remove node from the path to find next path
        path.pop()

    dfs([], start_node, node_type="uri")

    return paths

In [4]:

# Global path statistics; find_all_paths_endpoint() updates them while it
# walks the graph, so they are reset every time this cell runs.
num_paths = 0
abs_depth = 0
max_depth = 0

# Endpoint and graph to analyse. Only the assignment below is effective; the
# combinations that were tried before are kept here for reference:
#   endpoint_url = "https://lov.linkeddata.es/dataset/lov/sparql" with
#       default_graph = "http://purl.org/cwmo/#"
#       default_graph = "https://w3id.org/ecfo"
#       default_graph = "http://vocab.deri.ie/csp"
#       default_graph = "http://purl.org/wf4ever/ro"
#   endpoint_url = "https://sparql.europeana.eu/"
#   endpoint_url = "https://nfdi4culture.de/sparql"
#   default_graph = "http://def.seegrid.csiro.au/isotc211/iso19156/2011/observation"
endpoint_url = "https://data.europa.eu/sparql"
default_graph = "http://data.europa.eu/88u/dataset/0800af55-8e56-49a0-8986-aa55151d0440"

sparql = SPARQLWrapper(endpoint_url)
sparql.addDefaultGraph(default_graph)

print("STARTING GETTING LITERALS")

query_literals = """
SELECT DISTINCT ?literal
WHERE {
?s ?p ?literal .
FILTER(isLiteral(?literal))
}
"""

results = send_query(query_literals, sparql, JSON)

# Literals are stored as "value", "value@lang" or "value^^datatype" so that
# they compare equal to the neighbor strings built in find_all_paths_endpoint().
literals = set()
debug_print("Literals:")
for res in results["results"]["bindings"]:
    literal_value = res["literal"]["value"]
    lang_tag = res["literal"].get("xml:lang")
    datatype = res["literal"].get("datatype")
    if lang_tag:
        literal_value = f"{literal_value}@{lang_tag}"
    elif datatype:
        literal_value = f"{literal_value}^^{datatype}"

    literals.add(literal_value)
    debug_print(literal_value)

print(str(len(literals)) + " Literals existing")

print("FINISHED GETTING LITERALS\n")

print("STARTING CALCULATING PATHS")

# Root nodes: subjects that never occur as the object of any triple
query_roots = """
SELECT DISTINCT ?root
WHERE {
    ?root ?p ?o .
    FILTER NOT EXISTS {
        ?s ?p2 ?root .
    }
}
"""

results = send_query(query_roots, sparql, JSON)

root_nodes = set()

for res in results["results"]["bindings"]:
    str_root = str(res["root"]["value"])
    root_nodes.add(str_root)

print(f"Number of root nodes: {len(root_nodes)}")

# root node -> list of paths starting at that root
all_paths = {}

# Calculating all paths
for str_root in root_nodes:
    debug_print("Calculating paths for root node: " + str_root)
    all_paths[str_root] = find_all_paths_endpoint(endpoint_url, default_graph, literals, str_root)


print("FINISHED CALCULATING PATHS\n")

# Output found paths
print("FOUND PATHS:")
for root_node, paths in all_paths.items():
    for path in paths:
        print(path)
        print("Path length = " + str(len(path)-1))

print("RESULTS:")
print("-Number of Paths: " + str(num_paths))
print("-Absolute depth: " + str(abs_depth))

# Avoid a division by zero when no path was found
if num_paths > 0:
    avg_depth = abs_depth / num_paths
else:
    avg_depth = 0

print("-Average depth: " + str(avg_depth))
print("-Maximal depth: " + str(max_depth))

STARTING GETTING LITERALS
Literals:
2021-05-17^^http://www.w3.org/2001/XMLSchema#date
2023-02-25^^http://www.w3.org/2001/XMLSchema#date
2025-02-08T17:27:04Z^^http://www.w3.org/2001/XMLSchema#dateTime
INSPIRE-Dienst für den Bebauungsplan XPlanung-Dienst für den Plan Baulinien Rindelbach Rattstadt Am Ortsweg Nr. 5 (XPlanGML 5.0.1) (INSPIRE GML)@de
Įkvėpimo paslauga plėtros planui XPlanning paslauga plano statybos linijoms Rindelbach Rattstadt Am Ortsweg Nr. 5 (XPlanGML 5.0.1) (INSPIRE GML)@lt-t-de-t0-mtec
Serviciu de inspirație pentru planul de dezvoltare XPlanning Service pentru Plan Construction Lines Rindelbach Rattstadt Am Ortsweg No. 5 (XPlanGML 5.0.1) (INSPIRE GML)@ro-t-de-t0-mtec
Usługa inspiracyjna dla planu rozwoju XPlanning dla Plan Construction Lines Rindelbach Rattstadt Am Ortsweg nr 5 (XPlanGML 5.0.1) (INSPIRE GML)@pl-t-de-t0-mtec
Seirbhís inspioráide don phlean forbartha XPlanning seirbhíse do Phlean Tógála Línte Rindelbach Rattstadt Am Ortsweg Uimh. 5 (XPlanGML 5.0.1) (INS

In [30]:
# Code just for testing
# TODO: the second query takes a long time on some endpoints -> should I run it once and store the neighbors of all bnodes, so that the query does not have to be repeated for every lookup?
# -> Or should I just fetch all triples with a single query and filter out the bnodes in code?
"""endpoint_url = "https://lov.linkeddata.es/dataset/lov/sparql"
default_graph = "http://purl.org/cwmo/#" # 986 triples

endpoint_url = "https://data.europa.eu/sparql"
default_graph = "http://data.europa.eu/88u/dataset/0800af55-8e56-49a0-8986-aa55151d0440" # 179 triples

sparql = SPARQLWrapper(endpoint_url)
sparql.addDefaultGraph(default_graph)

query = """
#SELECT DISTINCT ?neighbor ?o WHERE {
#    ?neighbor ?p ?o . 
#    FILTER(isBlank(?o))
#}
"""

results = send_query(query, sparql, JSON)

for binding in results["results"]["bindings"]:
    #print(binding)
    lang_tag = results["results"].get("xml:lang") 
    datatype = results["results"].get("datatype") 
    print("lang_tag: " + str(lang_tag))
    print("datatype: " + str(datatype))

query = """
        #  SELECT DISTINCT ?neighbor ?o WHERE {
        #      ?o ?p ?neighbor . 
        #      FILTER(isBlank(?o))
        #  }
"""

results = send_query(query, sparql, JSON)

for binding in results["results"]["bindings"]:
    #print(binding)
    lang_tag = results["results"].get("xml:lang") 
    datatype = results["results"].get("datatype") 
    print("lang_tag: " + str(lang_tag))
    print("datatype: " + str(datatype))

"""

'\n\nresults = send_query(query, sparql, JSON)\n\nfor binding in results["results"]["bindings"]:\n    #print(binding)\n    lang_tag = results["results"].get("xml:lang") \n    datatype = results["results"].get("datatype") \n    print("lang_tag: " + str(lang_tag))\n    print("datatype: " + str(datatype))\n\n'

In [None]:
# Cohesion
# The value reported at the end of this cell as "Cohesion" is the number of
# connected components counted by the component search further down.
# TODO: Optimization: query the neighbors of literals only once and store the neighbors of all literals directly
print("STARTING CALCULATING COHESION")

# NOTE(review): these assignments rebind the notebook-wide names
# `endpoint_url`, `default_graph` and `sparql`; cells that use `sparql`
# without creating it themselves will query whatever was assigned last.
endpoint_url = "https://lov.linkeddata.es/dataset/lov/sparql"
default_graph = "http://purl.org/wf4ever/ro"
#default_graph = "http://purl.org/cwmo/#"

# SPARQL client restricted to the selected default graph
sparql = SPARQLWrapper(endpoint_url)
sparql.addDefaultGraph(default_graph)

def format_literal_for_sparql(node_value, node_type, lang_tag, datatype):
    """Render a node value as a double-quoted SPARQL literal token.

    Parameters
    ----------
    node_value : str
        Lexical value of the node.
    node_type : str
        Accepted for interface compatibility; it does not influence the
        result, so non-literal values are quoted the same way.
    lang_tag : str or None
        Language tag; when truthy the result is ``"value"@lang``.
    datatype : str or None
        Datatype IRI; used only when ``lang_tag`` is falsy, giving
        ``"value"^^<datatype>``.

    Returns
    -------
    str
        The escaped, quoted value with its language or datatype suffix, or
        just the quoted value when neither is given.

    The same escaping is applied in all three branches. The previous version
    re-derived the value in the language and datatype branches without
    escaping backslashes, and never escaped carriage returns.
    """
    # Backslashes first, otherwise the escapes added afterwards would be
    # escaped a second time.
    escaped = (
        node_value.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    if lang_tag:
        return f'"{escaped}"@{lang_tag}'
    if datatype:
        return f'"{escaped}"^^<{datatype}>'
    return f'"{escaped}"'
    
def get_new_neighbors(node_value, node_type, lang_tag, datatype, sparql, all_nodes, visited):
    """Return the not-yet-visited neighbors of one node.

    Nodes are identified by the tuple ``(value, type, lang, datatype)`` taken
    from the SPARQL JSON result terms.

    Parameters
    ----------
    node_value, node_type, lang_tag, datatype :
        The four components of the node whose neighbors are wanted.
    sparql :
        Client object passed on to ``send_query``.
    all_nodes : set of tuple
        Every known node; candidates outside this set are only reported via
        ``debug_print`` and not returned.
    visited : set of tuple
        Nodes already seen. It is read but not modified here.

    Returns
    -------
    set of tuple
        Neighbors that are in ``all_nodes`` and not in ``visited``.

    Lookup strategy by node type:
    * literal / typed-literal: fetch all (subject, literal) pairs and keep the
      subjects whose literal equals the requested one.
    * bnode: fetch all pairs that involve a blank node and keep the partners
      of the requested blank node.
    * anything else: query the triples that have the node as subject or object.

    Fixes compared to the previous version:
    * A literal now matches only when value, language tag AND datatype are all
      equal. The old ``or`` matched every literal with the same text whenever
      one of the two tags happened to agree (e.g. both datatypes ``None``).
    * The "not in all_nodes" debug message no longer concatenates ``None``
      with a string, which raised ``TypeError``.

    TODO: the literal and bnode branches download the complete pair list on
    every call; caching that result once would avoid repeated heavy queries.
    """
    node = (node_value, node_type, lang_tag, datatype)
    neighbors = set()

    def as_node(term):
        """Turn one SPARQL JSON term into the node tuple."""
        return (term["value"], term["type"], term.get("xml:lang"), term.get("datatype"))

    def consider(candidate, found_prefix, missing_prefix):
        """Collect ``candidate`` if it is known and unvisited; otherwise log it."""
        if candidate in visited:
            return
        if candidate in all_nodes:
            neighbors.add(candidate)
            debug_print(found_prefix + format_literal_for_sparql(*candidate))
        else:
            debug_print(missing_prefix + " - ".join(str(part) for part in candidate) + " #####")

    if node_type in ["literal", "typed-literal"]:
        query = """
      SELECT DISTINCT ?neighbor ?o WHERE {
          ?neighbor ?p ?o .
          FILTER(isLiteral(?o))
      }
        """

        all_nghs = send_query(query, sparql, JSON)

        # compare every returned literal with the literal that is needed
        wanted = (node_value, lang_tag, datatype)
        for binding in all_nghs["results"]["bindings"]:
            literal_term = binding["o"]
            found = (literal_term["value"], literal_term.get("xml:lang"), literal_term.get("datatype"))
            if found == wanted:
                consider(
                    as_node(binding["neighbor"]),
                    "-Found unvisited neighbor of a literal node: ",
                    "##### NEIGHBOR NOT IN ALL_NODES: ",
                )

    elif node_type == "bnode":
        query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT DISTINCT ?neighbor ?o WHERE {
        { ?neighbor ?p ?o .
          FILTER(isBlank(?o)) }
        UNION
        { ?o ?p ?neighbor .
          FILTER(isBlank(?o)) }
    }
        """

        all_nghs = send_query(query, sparql, JSON)

        # compare every returned blank node with the blank node that is needed
        for binding in all_nghs["results"]["bindings"]:
            if as_node(binding["o"]) == node:
                consider(
                    as_node(binding["neighbor"]),
                    "-Found unvisited neighbor of the blank node " + node_value + ": ",
                    "##### NEIGHBOR NOT IN ALL_NODES: ",
                )

    else:
        query = f"""
    SELECT DISTINCT ?neighbor WHERE {{
        {{ <{node_value}> ?p1 ?neighbor . }}
        UNION
        {{ ?neighbor ?p2 <{node_value}> . }}
    }}
    """

        pot_new_nghs = send_query(query, sparql, JSON)
        for binding in pot_new_nghs["results"]["bindings"]:
            # .get() yields None for missing tags, matching the tuples stored in all_nodes
            consider(
                as_node(binding["neighbor"]),
                "-Found unvisited neighbor: ",
                "##### 1 NEIGHBOR NOT IN ALL_NODES: ",
            )

    return neighbors

def check_literal_tags(node_obj, node_value, node_type):
    """Return ``node_value`` with its language or datatype suffix appended.

    Parameters
    ----------
    node_obj : dict
        SPARQL JSON term; ``"xml:lang"`` and ``"datatype"`` are read from it.
    node_value : str
        Lexical value of the node.
    node_type : str
        Only ``"literal"`` and ``"typed-literal"`` nodes get a suffix.

    Returns
    -------
    str
        ``value@lang`` when a language tag exists, otherwise
        ``value^^datatype`` when a datatype exists, otherwise ``node_value``
        unchanged (also for every non-literal node).

    The previous version read and returned a local ``node`` that was never
    assigned, so every call raised ``UnboundLocalError``.
    """
    node = node_value

    if node_type == "literal" or node_type == "typed-literal":
        lang_tag = node_obj.get("xml:lang")
        datatype = node_obj.get("datatype")
        debug_print("[check_literal_tags] " + node_value + " - lang_tag: " + str(lang_tag) + " - datatype: " + str(datatype))

        if lang_tag:  # add language tag
            node = f"{node_value}@{lang_tag}"
        elif datatype:  # add datatype tag
            node = f"{node_value}^^{datatype}"

        debug_print("[check_literal_tags] " + node)

    return node

# Load every node of the graph (subjects and objects)

debug_print("Getting all nodes")

query_all_nodes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?node WHERE {
    { ?node ?p ?o } UNION { ?s ?p ?node }
}
"""
results = send_query(query_all_nodes, sparql, JSON)

# A node is identified by (value, type, language tag, datatype)
all_nodes = {
    (term["value"], term["type"], term.get("xml:lang"), term.get("datatype"))
    for term in (row["node"] for row in results["results"]["bindings"])
}

debug_print("Got all nodes")

debug_print("Getting all literals and their neighbors")

query_literal_neighbors = """
SELECT DISTINCT ?neighbor ?literal WHERE {
          ?neighbor ?p ?literal .
          FILTER(isLiteral(?literal))
      }
"""
results = send_query(query_literal_neighbors, sparql, JSON)

# (literal node, subject node) pairs, both in the node-tuple form used above
literal_neighbors = [
    (
        (row["literal"]["value"], row["literal"]["type"], row["literal"].get("xml:lang"), row["literal"].get("datatype")),
        (row["neighbor"]["value"], row["neighbor"]["type"], row["neighbor"].get("xml:lang"), row["neighbor"].get("datatype")),
    )
    for row in results["results"]["bindings"]
]

debug_print("Got all literals and their neighbors")

# NOTE(review): this stops the cell here (SystemExit), so the component search
# below never runs; remove it once `literal_neighbors` is wired into the search.
sys.exit()

# Number of independent components found so far (sub-graphs that are connected internally)
components = 0

# Nodes that were already discovered
visited = set()

#DEBUG = False

## Component search: pick any unvisited node and expand from it one frontier
## level at a time until no new neighbor is found; each expansion is one component.
while visited != all_nodes:

    # Any unvisited node can serve as the seed of the next component
    start_node = next(iter(all_nodes - visited))
    debug_print("NEXT RANDOM NODE TO SEARCH FOR: " + format_literal_for_sparql(*start_node))
    frontier = {start_node}
    visited.add(start_node)

    while frontier:

        debug_print("starting new frontier")
        new_frontier = set()

        for current in frontier:
            current_label = format_literal_for_sparql(*current)

            if current != start_node:
                debug_print("next state to search for: " + current_label)

            # get_new_neighbors() only returns nodes that were not visited before
            new_visited_neighbors = get_new_neighbors(*current, sparql, all_nodes, visited)
            visited |= new_visited_neighbors

            # Everything discovered: no need to expand the rest of the level
            if visited == all_nodes:
                break

            new_frontier |= new_visited_neighbors

            debug_print("New Frontier after getting neighbors of " + str(current_label + ":"))
            for queued in new_frontier:
                debug_print("-- " + format_literal_for_sparql(*queued))

        frontier = new_frontier

        debug_print("Frontier done. New Frontier:")
        for queued in frontier:
            debug_print("-- " + format_literal_for_sparql(*queued))

        if visited == all_nodes:
            break

    components += 1

    print(f"Component {components} completed. \nVISITED: {len(visited)}/{len(all_nodes)}")

print("FINISHED CALCULATING COHESION")

print("\nRESULT: ")
print("-Cohesion: " + str(components))

# TODO: round the values to 2 decimal places

# TODO: Solve the following problems:
# [SOLVED] LITERAL QUEUE:  SELECT DISTINCT ?neighbor WHERE { ?neighbor ?p2 ""download""@en .} does not work
# [SOLVED] addressing bnodes with <bnode-id> does not work for every endpoint
# [SOLVED] Literals with \n and empty spaces do not work in the neighbor search! (I think this is solved by format_literal_for_sparql)
# [SOLVED] "download"@en does not work either

STARTING CALCULATING COHESION
Getting all nodes
Got all nodes
Getting all literals and their neighbors
neigh: http://purl.org/wf4ever/ro- lit: This ontology shows how AO and ORE ontologies can be used together to define a ResearchObject. This ontology is further customized by the wf4ever ontology.
neigh: http://purl.org/wf4ever/ro- lit: 0.1.1
neigh: http://purl.org/wf4ever/ro- lit: The Research Object Ontology
neigh: http://purl.org/wf4ever/ro#ResearchObject- lit: A research object aggregates a number of resources. A resource can be a workflow, web service, document, data item, data set, workflow run, software or a research object.
neigh: http://purl.org/wf4ever/ro#Manifest- lit: The ro:Manifest is used to describe an ro:ResearchObject. This identifies the resource for the manifest which lists all the aggregations of the research object, typically called ".ro/manifest.rdf" relative to the research object this manifest ore:describes.
neigh: http://purl.org/wf4ever/ro#SemanticAnnotation-

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [16]:
# Calculate Tangledness
# source 73 - page 4
# tangledness = mean number of classes with more than 1 direct ancestor, so two primitive
# measurements (number of classes and number of direct ancestors) are used for computing the metric
# NOTE: this cell uses the notebook-level `sparql` client created in an earlier cell.

print("STARTING CALCULATING TANGLEDNESS")

# Select number of classes in graph
query_classes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT (COUNT(DISTINCT ?class) AS ?num_classes)
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}"""

results = send_query(query_classes, sparql, JSON)

# Stays 0 when the endpoint returns no row
num_classes = 0

for binding in results["results"]["bindings"]:
    num_classes = int(binding["num_classes"]["value"])
    print("Number of classes in graph: " + str(num_classes))

# TODO Select number of classes with more than one ingoing isA arc (Dr. Jovanovik said I should use is-a)
# source 37 - page 3 says the same, but look at query_var2
# (query_var1 is currently not sent; it is kept as the documented alternative)
query_var1 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(?class) AS ?tangledCount)
WHERE
{
  SELECT ?class (COUNT(?s) AS ?numIngoing)
  WHERE {
    { ?s rdf:type ?class . }
  }
  GROUP BY ?class
  HAVING (COUNT(?s) > 1) # problem here was: i used ?numIngoing instead of COUNT(?s)
}
"""

# TODO Select number of classes with more than one superclass (source 73 says I should use this query)
query_var2 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(?class) AS ?tangledCount)
WHERE {
  SELECT ?class (COUNT(?super) AS ?numSupers)
  WHERE {
    ?class rdfs:subClassOf ?super .
  }
  GROUP BY ?class
  HAVING (COUNT(?super) > 1)
}
"""

results = send_query(query_var2, sparql, JSON)

# t = number of classes with more than one superclass; stays 0 without a result row
t = 0

for binding in results["results"]["bindings"]:
    t = int(binding["tangledCount"]["value"])
    print("Number of classes with more than one superclass: " + str(t))

# Closing banner (previously printed "STARTING ..." a second time)
print("FINISHED CALCULATING TANGLEDNESS")
print("\nRESULT:")

if t > 0:
    # source 37 says num_classes / t
    # source 73 says denominator and numerator should be switched -> t / num_classes
    ont_tangledness = num_classes / t
else:
    # No tangled class: report 0 instead of dividing by zero
    ont_tangledness = 0

print("-Ontology Tangledness: " + str(ont_tangledness))


STARTING CALCULATING TANGLEDNESS
Number of classes in graph: 20
Number of classes with more than one superclass: 0
STARTING CALCULATING TANGLEDNESS

RESULT:
-Ontology Tangledness: 0


In [17]:
# Degree distribution (formula in: source 37, page 7)
# nG ... number of nodes in the graph
# nE ... number of edges in the graph
# Uses the notebook-level `sparql` client.

print("STARTING CALCULATING DEGREE DISTRIBUTION")

# --- nE: one edge per triple ---
query_nE = """
SELECT (COUNT(*) AS ?tripleCount)
WHERE {
  ?s ?p ?o .
}
"""

results = send_query(query_nE, sparql, JSON)

edge_row = results["results"]["bindings"][0]
nE = int(edge_row["tripleCount"]["value"])
print("Number of edges in graph: ", str(nE))

# --- nG: distinct terms that occur as subject or object ---
query_nG = """
SELECT (COUNT(DISTINCT ?node) AS ?nodeCount)
WHERE {
  {
    SELECT ?node WHERE {
      { ?node ?p1 ?o }       
      UNION
      { ?s ?p2 ?node }       
    }
  }
}
"""

results = send_query(query_nG, sparql, JSON)

node_row = results["results"]["bindings"][0]
nG = int(node_row["nodeCount"]["value"])
print("Number of nodes in graph: ", str(nG))

# --- degree of every node: triples in which it is subject plus triples in which it is object ---
query_degrees = """
SELECT ?node (COUNT(?any) AS ?degree)
WHERE {
  {
    { ?node ?p1 ?any }     
    UNION
    { ?any ?p2 ?node }   
  }
}
GROUP BY ?node
"""

results = send_query(query_degrees, sparql, JSON)

# list of (node value, degree)
degrees = []
for binding in results["results"]["bindings"]:
    entry = (binding["node"]["value"], int(binding["degree"]["value"]))
    degrees.append(entry)
    debug_print(entry[0] + ": " + str(entry[1]))

sum_of_degrees = sum(node_degree for _node, node_degree in degrees)

# sum of degrees should be equal to 2 * nE
print("Sum of Degrees: " + str(sum_of_degrees))

if nG <= 1:
    # The formula divides by (nG - 1), so it is undefined for fewer than two nodes
    degree_distribution = 0
else:
    mean_degree = (2 * nE) / nG
    squared_diffs = [(node_degree - mean_degree) ** 2 for _node, node_degree in degrees]
    degree_distribution = sum(squared_diffs) / (nG - 1)

print("FINISHED CALCULATING DEGREE DISTRIBUTION")

print("\nRESULT:")
print("-Degree Distribution: " + str(degree_distribution))

STARTING CALCULATING DEGREE DISTRIBUTION
Number of edges in graph:  179
Number of nodes in graph:  163
prenos: 1
http://purl.org/dc/terms/MediaType: 1
Serviço INSPIRE para o Serviço do Plano de Desenvolvimento (XPlanGML 5.0.1) para as linhas de construção do plano Rindelbach Rattstadt Am Ortsweg N.º 5 da cidade de Ellwangen (Jagst) (INSPIRE GML): 1
lataa: 1
http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset: 1
INSPIRE-Dienst für den Bebauungsplan XPlanung-Dienst für den Plan Baulinien Rindelbach Rattstadt Am Ortsweg Nr. 5 (XPlanGML 5.0.1) (INSPIRE GML): 1
Usługa inspiracyjna dla planu rozwoju XPlanning dla Plan Construction Lines Rindelbach Rattstadt Am Ortsweg nr 5 (XPlanGML 5.0.1) (INSPIRE GML): 1
nodeID://b3793463703: 4
nodeID://b3793463696: 4
PlannedLandUse: 1
Seirbhís INSPIRE don tSeirbhís um Phlean Forbartha (XPlanGML 5.0.1) do Línte Tógála Phlean Rindelbach Rattstadt Am Ortsweg Uimh. 5 de chathair Ellwangen (Jagst) (INSPIRE GML): 1
nodeID://b3793463699: 7
http://

In [23]:
print("STARTING CALCULATING ENTITIES/CLASSES/PROPERTIES/INSTANCES/OBJECT PROPERTIES")

query_entitities = """
SELECT (COUNT(DISTINCT ?entity) AS ?entityCount)
WHERE {
  {
    SELECT DISTINCT ?entity WHERE {
      ?entity ?p ?o .
    }
  }
  UNION
  {
    SELECT DISTINCT ?entity WHERE {
      ?s ?p ?entity .
      FILTER(!isLiteral(?entity))
    }
  }
}
"""

results = send_query(query_entitities, sparql, JSON)

num_entities = 0

for binding in results["results"]["bindings"]:
  num_entities = int(binding["entityCount"]["value"])
        

# Number of instances: one result row per rdf:type object, each carrying the
# number of subjects typed with it; the rows are added up below.
query_inst = """ 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT ?type (COUNT(?s) AS ?count)
WHERE {
  ?s rdf:type ?type . 
}
GROUP BY ?type """

results = send_query(query_inst, sparql, JSON)
#print(results)
num_instances = 0

for binding in results["results"]["bindings"]:
    # rdf_type is only read out for inspection; the total uses the count alone.
    rdf_type, count = binding["type"]["value"], int(binding["count"]["value"])
    num_instances = num_instances + count
        
# Number of classes
# Definition of class (source: 213, page: 5 - source 250, page 3):
# TNOC (total number of classes/concepts) covers classes, subclasses,
# superclasses and anonymous classes, where anonymous classes are the
# equivalent/restriction/unionOf/intersectionOf/complementOf/oneOf/hasValue ones.

query_classes = """ 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT DISTINCT ?class
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}
"""

results = send_query(query_classes, sparql, JSON)

# Each binding row is one distinct class; the running index doubles as the
# final class count (it stays 0 when no row is returned).
num_classes = 0

for num_classes, binding in enumerate(results["results"]["bindings"], start=1):
    rdf_class = binding["class"]["value"]
    debug_print(f"Class {num_classes}: {rdf_class}")

# Number of properties in the T-Box.
# The source defines a property here as an explicitly declared one, i.e. a
# resource typed as owl:ObjectProperty / owl:DatatypeProperty / owl:AnnotationProperty.
query_properties_t = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(DISTINCT ?property) AS ?propertyCount)
WHERE {
  VALUES ?type { owl:ObjectProperty owl:DatatypeProperty owl:AnnotationProperty }
  ?property rdf:type ?type .
}
"""

results = send_query(query_properties_t, sparql, JSON)

num_properties_t = 0

for binding in results["results"]["bindings"]:
    declared_count = binding["propertyCount"]["value"]
    num_properties_t = int(declared_count)

# Number of properties in the A-Box.
# The source defines a property here as a unique ?p in ?s ?p ?o.
query_properties_a = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(DISTINCT ?p) AS ?propertyCount)
WHERE {
  ?s ?p ?o .
}
"""

results = send_query(query_properties_a, sparql, JSON)

num_properties_a = 0

for binding in results["results"]["bindings"]:
    used_count = binding["propertyCount"]["value"]
    num_properties_a = int(used_count)

debug_print(f"Properties in T-Box: {num_properties_t}")
debug_print(f"Properties in A-Box: {num_properties_a}")

# NOTE(review): a property that is both declared (T-Box query) and used as a
# predicate (A-Box query) contributes to both terms of this sum - confirm that
# this double counting is intended.
num_properties = num_properties_t + num_properties_a

# Number of object properties in the T-Box: resources declared as owl:ObjectProperty.
# Non-Inheritance -> excluding inheritance properties like rdfs:subPropertyOf or rdfs:subClassOf
query_object_properties_t = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (COUNT(DISTINCT ?property) AS ?numObjectProperties)
WHERE {
  ?property rdf:type owl:ObjectProperty .
}
"""

num_obj_properties_t = 0

results = send_query(query_object_properties_t, sparql, JSON)

for binding in results["results"]["bindings"]:
    declared_obj_count = binding["numObjectProperties"]["value"]
    num_obj_properties_t = int(declared_obj_count)

# TODO
# Number of object properties in the A-Box: distinct predicates whose object is
# not a literal.
# Non-Inheritance -> excluding inheritance properties like rdfs:subPropertyOf or rdfs:subClassOf
# NOTE(review): the query below only filters out literal objects; it does not
# exclude rdfs:subClassOf / rdfs:subPropertyOf, so the exclusion described
# above is not applied yet.
query_object_properties_a = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (COUNT(DISTINCT ?property) AS ?numObjectProperties)
WHERE {
  ?s ?property ?o 
  Filter(!isLiteral(?o))
}
"""

num_obj_properties_a = 0

results = send_query(query_object_properties_a, sparql, JSON)

for binding in results["results"]["bindings"]:
    used_obj_count = binding["numObjectProperties"]["value"]
    num_obj_properties_a = int(used_obj_count)

debug_print(f"Object Properties in T-Box: {num_obj_properties_t}")
debug_print(f"Object Properties in A-Box: {num_obj_properties_a}")
num_obj_properties = num_obj_properties_t + num_obj_properties_a

print("FINISHED CALCULATING ENTITIES/CLASSES/PROPERTIES/INSTANCES/OBJECT PROPERTIES")

# Summary of the counters computed in this cell.
print("\nRESULTS: ")
print(f"-Number of entities: {num_entities}")
print(f"-Number of properties: {num_properties}")
print(f"-Number of classes: {num_classes}")
print(f"-Number of instances: {num_instances}")
print(f"-Number of object properties: {num_obj_properties}")
        

STARTING CALCULATING ENTITIES/CLASSES/PROPERTIES/INSTANCES/OBJECT PROPERTIES
Class 1: http://xmlns.com/foaf/0.1/Organization
Class 2: http://www.w3.org/2006/vcard/ns#Individual
Class 3: http://www.w3.org/ns/dcat#CatalogRecord
Class 4: http://www.w3.org/ns/dcat#Distribution
Class 5: http://www.w3.org/ns/prov#Plan
Class 6: http://www.w3.org/2006/vcard/ns#Address
Class 7: http://purl.org/dc/terms/LicenseDocument
Class 8: http://purl.org/dc/terms/Standard
Class 9: http://purl.org/dc/terms/Location
Class 10: http://www.w3.org/ns/dcat#DataService
Class 11: http://www.w3.org/ns/dcat#Dataset
Class 12: http://xmlns.com/foaf/0.1/Person
Class 13: http://spdx.org/rdf/terms#Checksum
Class 14: http://www.w3.org/ns/locn#Address
Class 15: http://www.w3.org/ns/prov#Activity
Class 16: http://purl.org/dc/terms/MediaType
Class 17: http://purl.org/dc/terms/RightsStatement
Class 18: http://www.w3.org/ns/prov#Attribution
Class 19: http://www.w3.org/ns/prov#Entity
Class 20: http://www.w3.org/ns/prov#Associati

In [24]:
# TODO: Depth of Inheritance Tree

# Progress banner printed before the inheritance-tree queries of this cell run.
print("STARTING CALCULATING DEPTH OF INHERITANCE TREE")

# Calculating all paths from one root node (start node) for a sparql endpoint
def find_all_paths_subclasses(endpoint_url, default_graph, start_node):
    """Collect every subclass path that starts at ``start_node``.

    The hierarchy is walked depth-first: for the current node the endpoint is
    asked for its direct subclasses (``?child rdfs:subClassOf <node>``). A path
    is recorded once a node without subclasses is reached. A node that already
    is on the current path is skipped, so cycles terminate without recording
    an additional path.

    Side effects:
        Uses the module-level counters ``num_paths_inh_tree`` (incremented per
        recorded path) and ``max_depth_inh_tree`` (raised to the largest path
        length in edges). Both must be defined before this function is called.

    Args:
        endpoint_url: URL handed to ``SPARQLWrapper``.
        default_graph: Graph IRI registered through ``addDefaultGraph``.
        start_node: Value of the node the traversal starts from; it is
            inserted into the query as ``<start_node>``.

    Returns:
        A list of paths; each path is a list of node values running from
        ``start_node`` down to a node without subclasses.
    """
    # Cache of already fetched subclass lists, keyed by the node's string form
    neighbors_cache = {}

    # Helper for finding the direct subclasses of a node.
    # Returns a list of (value, value_type) tuples.
    def get_neighbors(node):
        node_str = str(node)

        # 1. Check whether the node has already been looked up
        if node_str in neighbors_cache:
            return neighbors_cache[node_str]

        # An empty node cannot be placed into the query below
        if len(node_str) == 0:
            return []

        query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?child
        WHERE {{
          ?child rdfs:subClassOf <{node}> .
        }}
        """

        results = send_query(query, sparql, JSON)

        neighbors = []
        for binding in results["results"]["bindings"]:
            # The query projects ?child, so the binding is keyed by "child".
            # (Reading a "next" key here raised KeyError on the first result.)
            child = binding["child"]
            # child["type"] is 'uri', 'literal' or 'bnode'
            neighbors.append((child["value"], child["type"]))

        neighbors_cache[node_str] = neighbors

        return neighbors

    def dfs(path, node, node_type="uri"):
        global num_paths_inh_tree
        global max_depth_inh_tree

        # skip node if it already is in path (avoiding cycles)
        if node in path:
            return

        # add current node to path
        path.append(node)

        neighbors = get_neighbors(node)

        if not neighbors:
            # No subclasses: the current path is complete, store a copy of it
            paths.append(list(path))
            num_paths_inh_tree += 1
            # Depth is counted in edges, hence one less than the node count
            max_depth_inh_tree = max(max_depth_inh_tree, len(path) - 1)
        else:
            for neighbor, neighbor_type in neighbors:
                dfs(path, neighbor, neighbor_type)

        # remove node from the path to find next path
        path.pop()

    sparql = SPARQLWrapper(endpoint_url)
    sparql.addDefaultGraph(default_graph)
    #sparql.setTimeout(60)  # timeout in seconds

    # list which stores all paths
    paths = []

    # TODO: node_type="uri" may not be accurate because blank nodes can occur
    # as well (it may still be fine, since blank nodes are queried the same way)
    dfs([], start_node, node_type="uri")

    return paths

# Root selection:
#   ?root a owl:Class / rdfs:Class                      -> makes sure the root is a class
#   FILTER NOT EXISTS {?root rdfs:subClassOf ?anyClass} -> keeps only classes without a superclass
query_root = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?root
WHERE {
  {
    ?root rdf:type owl:Class .
  }
  UNION
  {
    ?root rdf:type rdfs:Class .
  }
  
  FILTER NOT EXISTS {
    ?root rdfs:subClassOf ?anyClass .
  }
}
"""

# node types that can be in results: uri or blank node
results = send_query(query_root, sparql, JSON)

tree_roots = set()

for binding in results["results"]["bindings"]:
    node = binding["root"]["value"]
    debug_print(f"Found Tree node: {node}")
    tree_roots.add(node)

debug_print("Number of root nodes: " + str(len(tree_roots)))

# Counters updated by find_all_paths_subclasses while it traverses
max_depth_inh_tree = 0
num_paths_inh_tree = 0

# Dictionary holding all paths, keyed by root node
all_paths = {}

# calculating all subclass paths from the tree roots
for root in tree_roots:
    debug_print(f"Calculating subclass paths for tree node: {root}")
    root_paths = find_all_paths_subclasses(endpoint_url, default_graph, root)
    all_paths[root] = root_paths

# Output found paths
print("FOUND PATHS:")
for root_node, paths in all_paths.items():
    for path in paths:
        debug_print(path)
        # path length is counted in edges
        edge_count = len(path) - 1
        debug_print(f"Path length = {edge_count}")

debug_print(f"-Number of Paths: {num_paths_inh_tree}")

print("FINISHED CALCULATING DEPTH OF INHERITANCE TREE")

print("\nRESULTS:")
print(f"-Depth of Inheritance Tree: {max_depth_inh_tree}")

STARTING CALCULATING DEPTH OF INHERITANCE TREE
Number of root nodes: 0
FOUND PATHS:
-Number of Paths: 0
FINISHED CALCULATING DEPTH OF INHERITANCE TREE

RESULTS:
-Depth of Inheritance Tree: 0


In [27]:
print("STARTING CALCULATING PROPERTY CLASS RATIO/CLASS PROPERTY RATIO/INHERITANCE RICHNESS/ATTRIBUTE RICHNESS")

# Inheritance Richness = average number of subclasses per class (source 227 - page 9)
# The query counts all rdfs:subClassOf triples.
query_subclasses = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(*) AS ?numInheritanceRelations)
WHERE {
  ?subclass rdfs:subClassOf ?superclass .
}
"""

num_subclasses = 0

results = send_query(query_subclasses, sparql, JSON)

for binding in results["results"]["bindings"]:
    relation_count = binding["numInheritanceRelations"]["value"]
    num_subclasses = int(relation_count)

# TODO the same with pointing to literal in A-box
# Number of datatype properties: resources declared as owl:DatatypeProperty
query_datatype_properties = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(DISTINCT ?property) AS ?numDatatypeProperties)
WHERE {
  ?property rdf:type owl:DatatypeProperty .
}
"""

num_datatype_properties = 0

results = send_query(query_datatype_properties, sparql, JSON)

for binding in results["results"]["bindings"]:
    datatype_count = binding["numDatatypeProperties"]["value"]
    num_datatype_properties = int(datatype_count)

debug_print(f"Number of Datatype properties: {num_datatype_properties}")

print("FINISHED CALCULATING PROPERTY CLASS RATIO/CLASS PROPERTY RATIO/INHERITANCE RICHNESS/ATTRIBUTE RICHNESS")
print("\nRESULTS:")

# Property Class Ratio
# TODO
# source 172 assumes that classes must exist for properties to exist
# (Number of Properties, Number of Classes > 1).
# Assumption made here: no classes -> ratio = 0
prop_class_ratio = num_properties / num_classes if num_classes > 0 else 0
print(f"-Property Class Ratio: {prop_class_ratio}")

# Class Property Ratio
# TODO
# The metric is not defined for num_properties = 0.
# Assumption made here: no properties -> ratio = 0
class_prop_ratio = num_classes / num_properties if num_properties > 0 else 0
print(f"-Class Property Ratio: {class_prop_ratio}")

# Inheritance Richness
# TODO
# The metric is not defined for num_classes = 0; 0 is reported in that case.
inheritance_richness = num_subclasses / num_classes if num_classes > 0 else 0
print(f"-Inheritance Richness: {inheritance_richness}")

# Attribute Richness
# TODO
# The metric is not defined for num_classes = 0; 0 is reported in that case.
attr_richness = num_datatype_properties / num_classes if num_classes > 0 else 0
print(f"-Attribute Richness: {attr_richness}")


STARTING CALCULATING PROPERTY CLASS RATIO/CLASS PROPERTY RATIO/INHERITANCE RICHNESS/ATTRIBUTE RICHNESS
Number of Datatype properties: 0
FINISHED CALCULATING PROPERTY CLASS RATIO/CLASS PROPERTY RATIO/INHERITANCE RICHNESS/ATTRIBUTE RICHNESS

RESULTS:
-Property Class Ratio: 3.35
-Class Property Ratio: 0.29850746268656714
-Inheritance Richness: 0.0
-Attribute Richness: 0.0


In [28]:
print("STARTING CALCULATING AVERAGE CLASS CONNECTIVITY/AVERAGE POPULATION")

# Average Class Connectivity
# The connectivity of a class is the total number of relationships that
# instances of the class have with instances of other classes (source 227 - page 10).
#
# For each class the query counts the matches where one of its instances is
# the subject or the object of a triple whose other end is typed with a
# different class.
# rdf:type itself is excluded as a property because the class relationships
# are not of interest here.
query_class_connectivity = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT ?class (COUNT(*) AS ?connectivity)
WHERE {
  {
    ?instance ?property ?target .

    ?instance rdf:type ?class .
    ?target rdf:type ?targetClass .

    FILTER(?property != rdf:type)
    FILTER(?class != ?targetClass)
  }
  UNION
  {
    ?instance ?property ?target .

    ?target rdf:type ?class .
    ?instance rdf:type ?targetClass .

    FILTER(?property != rdf:type)
    FILTER(?class != ?targetClass)
  }
}
GROUP BY ?class

"""
results = send_query(query_class_connectivity, sparql, JSON)

# One (class IRI, connectivity) tuple per result row
class_connectivity_list = [
    (row["class"]["value"], int(row["connectivity"]["value"]))
    for row in results["results"]["bindings"]
]
sum_connectivities = sum(value for _, value in class_connectivity_list)

for class_name, connectivity in class_connectivity_list:
    debug_print(f"Connectivity of Class {class_name}: {connectivity}")

# Average Class Connectivity
# TODO
# The metric is not defined for num_classes = 0; 0 is reported in that case.
# num_classes is not computed here, it is taken over from an earlier cell.
avg_class_connectivity = sum_connectivities / num_classes if num_classes > 0 else 0

# Average Population
# TODO
# The metric is not defined for num_classes = 0; 0 is reported in that case.
avg_population = num_instances / num_classes if num_classes > 0 else 0

print("FINISHED CALCULATING AVERAGE CLASS CONNECTIVITY/AVERAGE POPULATION")

print("\nRESULTS:")
print(f"-Average Class Connectivity: {avg_class_connectivity}")
print(f"-Average Population: {avg_population}")


STARTING CALCULATING AVERAGE CLASS CONNECTIVITY/AVERAGE POPULATION
Connectivity of Class http://purl.org/dc/terms/Standard: 3
Connectivity of Class http://spdx.org/rdf/terms#Checksum: 1
Connectivity of Class http://www.w3.org/ns/locn#Address: 1
Connectivity of Class http://www.w3.org/ns/dcat#Dataset: 8
Connectivity of Class http://www.w3.org/ns/dcat#DataService: 2
Connectivity of Class http://www.w3.org/ns/prov#Plan: 2
Connectivity of Class http://www.w3.org/ns/prov#Activity: 3
Connectivity of Class http://www.w3.org/ns/dcat#Distribution: 5
Connectivity of Class http://www.w3.org/ns/prov#Attribution: 2
Connectivity of Class http://purl.org/dc/terms/MediaType: 1
Connectivity of Class http://www.w3.org/ns/prov#Association: 2
Connectivity of Class http://purl.org/dc/terms/Location: 1
Connectivity of Class http://xmlns.com/foaf/0.1/Organization: 1
Connectivity of Class http://www.w3.org/2006/vcard/ns#Individual: 2
Connectivity of Class http://www.w3.org/ns/dcat#CatalogRecord: 2
Connectivit

In [12]:
"""import requests

endpoint_url = "https://data.europa.eu/sparql"
#endpoint_url = "https://lov.linkeddata.es/dataset/lov/sparql"
endpoint_url = "https://sparql.europeana.eu/"
#endpoint_url = "https://nfdi4culture.de/sparql"
endpoint_url = "https://query.wikidata.org/sparql"
#endpoint_url = "https://graphdb.ontotext.com/repositories/ontotext_public?query=SELECT+*+WHERE+%7B%7D+LIMIT+1"
#endpoint_url = "https://sparql.bioontology.org/"
#endpoint_url = "http://rdf4j.org/sparql"
#endpoint_url = "https://franz.com/agraph/sparql"

r = requests.get(endpoint_url)
print(r.headers.get("Server"))
print(r.text[:1000])  # manchmal im HTML ein Hinweis wie "Virtuoso" oder "Fuseki"""


'import requests\n\nendpoint_url = "https://data.europa.eu/sparql"\n#endpoint_url = "https://lov.linkeddata.es/dataset/lov/sparql"\nendpoint_url = "https://sparql.europeana.eu/"\n#endpoint_url = "https://nfdi4culture.de/sparql"\nendpoint_url = "https://query.wikidata.org/sparql"\n#endpoint_url = "https://graphdb.ontotext.com/repositories/ontotext_public?query=SELECT+*+WHERE+%7B%7D+LIMIT+1"\n#endpoint_url = "https://sparql.bioontology.org/"\n#endpoint_url = "http://rdf4j.org/sparql"\n#endpoint_url = "https://franz.com/agraph/sparql"\n\nr = requests.get(endpoint_url)\nprint(r.headers.get("Server"))\nprint(r.text[:1000])  # manchmal im HTML ein Hinweis wie "Virtuoso" oder "Fuseki'