In [None]:
from rdflib import Graph
from rdflib import RDF
from rdflib import URIRef
from rdflib import Literal
import networkx as nx
import matplotlib.pyplot as plt
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from queries import QUERY_AI, QUERY_FILM, QUERY_PERSON
from paths_to_examples import REPO_BOOKS, REPO_PEOPLE, WIKIDATA, GPT_EX, GPT_SUBCLASSES
from SPARQLWrapper import SPARQLWrapper, RDF, JSON


DEBUG = True  # gates the output of debug_print() and print_set()
PRINTGRAPH = False # when True, show_graph() draws the NetworkX graph with matplotlib
LOCAL = True # True: parse the local Turtle file; False: work against the SPARQL endpoint 

def debug_print(*args, **kwargs):
    """Forward all arguments to print() while the module-level DEBUG flag is truthy.

    Does nothing when DEBUG is falsy.
    """
    if not DEBUG:
        return
    print(*args, **kwargs)


# Print every element of an iterable on its own line (debug mode only).
def print_set(set):
    """Print each element of *set* on a separate line while DEBUG is truthy.

    The parameter name shadows the builtin ``set``; it is kept so existing
    callers (including keyword callers) continue to work.
    """
    if not DEBUG:
        return
    for element in set:
        print(element)

def show_graph(G):
    """Draw *G* with matplotlib when the module-level PRINTGRAPH flag is truthy.

    Nodes are positioned with a spring layout, edges are captioned with their
    'label' attribute, and the figure is shown via ``plt.show()``.  Nothing
    happens when PRINTGRAPH is falsy.
    """
    if not PRINTGRAPH:
        return

    # Node coordinates from the spring layout.
    layout = nx.spring_layout(G, k=0.5, iterations=50)

    # Nodes and edges.
    plt.figure(figsize=(12, 8))
    draw_options = {
        "with_labels": True,
        "node_color": "lightblue",
        "node_size": 1500,
        "font_size": 10,
        "font_weight": "bold",
        "arrows": True,
    }
    nx.draw(G, layout, **draw_options)

    # Edge captions taken from the 'label' edge attribute.
    captions = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(
        G,
        layout,
        edge_labels=captions,
        font_color='red',
    )

    plt.title("RDF Graph")
    plt.axis("off")
    plt.show()

    print("FINISHED PRINTING GRAPH\n\n")

# Calculating all paths from one root node (start node) for a local graph
def find_all_paths_local(G, start_node):
    """Return every cycle-free path from *start_node* to a node without successors.

    Args:
        G: graph object exposing ``neighbors(node)`` (e.g. a networkx DiGraph).
        start_node: node at which the depth-first search starts.

    Returns:
        A list of paths; each path is a list of nodes beginning with
        *start_node* and ending at a node that has no neighbors.

    Side effects:
        Updates the module-level counters ``num_paths``, ``abs_depth`` and
        ``max_depth``; they must be initialised before this function is called.
        The depth of a path is its number of edges (nodes - 1).

    A branch whose only continuations lead back to nodes already on the
    current path is abandoned without recording a path.
    """
    # list which stores all paths
    paths = []
    # Nodes currently on the DFS path, mirrored in a set so the cycle check is
    # O(1) instead of a linear scan of the path list on every visit.
    on_path = set()

    # helper function for DFS
    def dfs(node, path):
        global num_paths
        global abs_depth
        global max_depth

        # skip node if it already is in path (avoiding cycles)
        if node in on_path:
            return

        # add current node to path
        path.append(node)
        on_path.add(node)

        neighbors = list(G.neighbors(node))
        # a node without neighbors terminates the path
        if not neighbors:
            depth = len(path) - 1  # number of EDGES in the path
            paths.append(list(path))  # store a copy of the current path
            num_paths += 1
            abs_depth += depth
            max_depth = max(max_depth, depth)

        # recursively extend the path for each neighbor
        for neighbor in neighbors:
            dfs(neighbor, path)

        # backtrack so the next branch starts from the parent node
        path.pop()
        on_path.discard(node)

    # starts DFS with start_node
    dfs(start_node, [])

    return paths

# Calculating all paths from one root node (start node) for a sparql endpoint
def find_all_paths_endpoint(endpoint_url, literals, start_node):
    """Return every cycle-free path from *start_node*, querying a SPARQL endpoint.

    Args:
        endpoint_url: URL of the SPARQL endpoint passed to SPARQLWrapper.
        literals: container of literal values (as strings); a neighbor found
            in it ends the path immediately and is never queried.
        start_node: IRI (string) at which the depth-first search starts.

    Returns:
        A list of paths; each path is a list of strings beginning with
        *start_node*.

    Raises:
        RuntimeError: if a neighbor query against the endpoint fails.

    Side effects:
        Updates the module-level counters ``num_paths``, ``abs_depth`` and
        ``max_depth``; they must be initialised before this function is called.
        The depth of a path is its number of edges (nodes - 1).

    Neighbor lists are cached per node for the duration of one call, so a node
    reachable over several paths is requested from the endpoint only once.
    """
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setReturnFormat(JSON)

    # list which stores all paths
    paths = []
    # node -> list of neighbor values already fetched from the endpoint
    neighbor_cache = {}

    # helper function for finding neighbors of a node (triple = (node, pred, neighbor))
    def get_neighbors(node):
        # Only http(s) IRIs are looked up; literals, blank nodes and any other
        # value cannot be placed into the <...> pattern below and count as leaves.
        if not node.startswith(("http://", "https://")):
            return []

        if node in neighbor_cache:
            return neighbor_cache[node]

        # NOTE(review): the IRI is interpolated verbatim; a value containing
        # '>' would produce a malformed query.
        query = f"""
        SELECT DISTINCT ?next WHERE {{
            <{node}> ?p ?next .
        }}
        """
        sparql.setQuery(query)

        try:
            results = sparql.query().convert()
        except Exception as e:
            print(str(node))
            raise RuntimeError(f"SPARQL-Abfrage fehlgeschlagen: {e}") from e

        # extract the object of every returned triple
        neighbors = [
            binding["next"]["value"]
            for binding in results["results"]["bindings"]
        ]
        neighbor_cache[node] = neighbors
        return neighbors

    # store a copy of the finished path and update the global statistics
    def record_path(path):
        global num_paths
        global abs_depth
        global max_depth

        depth = len(path) - 1
        paths.append(list(path))
        num_paths += 1
        abs_depth += depth
        max_depth = max(max_depth, depth)

    def dfs(node, path):
        # skip node if it already is in path (avoiding cycles)
        if node in path:
            return

        # add current node to path
        path.append(node)

        neighbors = get_neighbors(node)

        if not neighbors:
            record_path(path)
        else:
            for neighbor in neighbors:
                if neighbor in literals:
                    # a literal has no outgoing edges -> the path ends here
                    path.append(neighbor)
                    record_path(path)
                    path.pop()
                else:
                    dfs(neighbor, path)

        # backtrack so the next branch starts from the parent node
        path.pop()

    dfs(start_node, [])

    return paths


In [None]:
# Path to the Turtle file to analyse (can also be an online graph source).
ttl_file_path = REPO_BOOKS
g = Graph()

if LOCAL:

    print("STARTING PARSING")

    # Best effort: a parse failure is reported and the graph parsed so far
    # (possibly empty) is used by the following cells.
    try:
        g.parse(ttl_file_path, format="turtle")
    except Exception as e:
        print(f"ERROR WHILE PARSING GRAPH: {e}")

    print("Number of triples: " + str(len(g)))
    print("FINISHED PARSING\n")

else:

    # Alternative endpoints that were tried; only the active assignment counts.
    # endpoint_url = "https://data.europa.eu/sparql"
    # endpoint_url = "https://lov.linkeddata.es/dataset/lov/sparql"
    endpoint_url = "https://sparql.europeana.eu/"

    sparql = SPARQLWrapper(endpoint_url)

    query_1 = """
    CONSTRUCT {?s ?p ?o .} 
    WHERE {
    ?s ?p ?o .
    }
    #LIMIT 100
    """

    query_2 = """
    #PREFIX dcat: <http://www.w3.org/ns/dcat#>
    #PREFIX odp:  <http://data.europa.eu/euodp/ontologies/ec-odp#>
    #PREFIX dct: <http://purl.org/dc/terms/>
    #PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    #PREFIX foaf: <http://xmlns.com/foaf/0.1/>

    SELECT ?s ?p ?o 
    WHERE {
    ?s ?p ?o
    }
    
    """

    query_3 = """
    CONSTRUCT {
        ?s ?p ?o .
    }
    WHERE {
        GRAPH ?g {
            ?s ?p ?o .
        }
    }
    
    """

    query = query_1

    # NOTE: `query` is currently not executed. Copying the whole remote graph
    # into `g` is disabled; the code that did it is kept here for reference:
    #
    #     sparql.setQuery(query)
    #     try:
    #         sparql.setReturnFormat(RDF)
    #         results = sparql.query()
    #         triples = results.convert()  # converts directly to an rdflib Graph
    #         g += triples
    #     except Exception as e:
    #         print("Query fehlgeschlagen:", e)

# Debug aid: dump every triple of the graph.
#for s, p, o in g:
#   print(s, p, o)

print("STARTING GETTING ROOT NODES AND LITERALS")

if LOCAL:

    # every term that occurs as a subject
    all_nodes = set(g.subjects())

    # every term that occurs as an object
    object_nodes = set(g.objects())

    # string form of all literals in the graph (literals can only appear as objects)
    literals = {str(o) for o in object_nodes if isinstance(o, Literal)}

    # possible root nodes: subjects that never occur as an object
    root_nodes = all_nodes - object_nodes

    debug_print("Root Nodes:")
    print_set(root_nodes)

    print("FINISHED GETTING ROOT NODES AND LITERALS\n")

    print("STARTING CREATING GRAPH G")

    # NetworkX DiGraph used for path search and visualization
    G = nx.DiGraph()

    debug_print("Printing all triples:")
    # one edge per triple, labelled with the predicate
    for subj, pred, obj in g:
        G.add_edge(str(subj), str(obj), label=str(pred))
        debug_print("(" , subj, ",", pred, ",", obj, ")")

    print("FINISHED CREATING GRAPH G\n")

    show_graph(G)
else:
    sparql.setReturnFormat(JSON)

    # subjects that never occur in object position
    query_roots = """
    SELECT DISTINCT ?root
    WHERE {
        ?root ?p ?o .
        FILTER NOT EXISTS {
            ?s ?p2 ?root .
        }
    }
    """

    sparql.setQuery(query_roots)

    try:
        results = sparql.query().convert()
    except Exception as e:
        print("Query fehlgeschlagen:", e)
        # Continue with an empty result instead of failing below on an
        # undefined (or stale) `results`.
        results = {"results": {"bindings": []}}

    root_nodes = set()
    debug_print("Root Nodes:")
    for idx, res in enumerate(results["results"]["bindings"]):
        root_value = res["root"]["value"]
        # report bindings whose value is the empty string
        if str(root_value) == "":
            print("YES: ", str(idx))
        root_nodes.add(root_value)

    print("Got " + str(len(root_nodes)) + " root nodes")

    query_literals = """
    SELECT DISTINCT ?literal
    WHERE {
    ?s ?p ?literal .
    FILTER(isLiteral(?literal))
    }

    """

    sparql.setQuery(query_literals)

    try:
        results = sparql.query().convert()
    except Exception as e:
        print("Query fehlgeschlagen:", e)
        # Without this fallback the loop below would read the root-node
        # result and fail on the missing 'literal' key.
        results = {"results": {"bindings": []}}

    debug_print("Literals:")
    literals = {res["literal"]["value"] for res in results["results"]["bindings"]}

    print("FINISHED GETTING ROOT NODES AND LITERALS\n")

STARTING GETTING ROOT NODES AND LITERALS
Root Nodes:
YES:  37
Literals:
FINISHED GETTING ROOT NODES AND LITERALS



In [51]:
# Global statistics updated by find_all_paths_local / find_all_paths_endpoint.
num_paths = 0
abs_depth = 0
max_depth = 0

print("STARTING CALCULATING PATHS")

# paths per root node, keyed by the string form of the root
all_paths = {}

# calculating all paths from root nodes
for root in root_nodes:
    str_root = str(root)
    debug_print("Starting searching for paths with root node: " + str_root)
    if LOCAL:
        all_paths[str_root] = find_all_paths_local(G, str_root)
    else:
        all_paths[str_root] = find_all_paths_endpoint(endpoint_url, literals, str_root)

print("FINISHED CALCULATING PATHS\n")

# Output found paths
print("FOUND PATHS:")
for paths in all_paths.values():
    for path in paths:
        print(path)
        print("Path length = " + str(len(path)-1))

print("RESULTS:")

print("-Number of Paths: " + str(num_paths))
print("-Absolute depth: " + str(abs_depth))

# Without any path (e.g. no root nodes) the average is reported as 0.0
# instead of raising ZeroDivisionError.
avg_depth = abs_depth / num_paths if num_paths else 0.0

print("-Average depth: " + str(avg_depth))
print("-Maximal depth: " + str(max_depth))

STARTING CALCULATING PATHS
Starting searching for paths with root node: 
Starting searching for paths with root node: http://rdf.murberget.se/SE_MURBERGET_ENT_2081
Starting searching for paths with root node: http://www.deutsche-digitale-bibliothek.de/edm/event/urn:nbn:de:bvb:20-36z2270term57157099-0_c/agent/-1
Starting searching for paths with root node: http://rdf.murberget.se/SE_MURBERGET_ENT_1026
Starting searching for paths with root node: http://d-nb.info/gnd/117463612
Starting searching for paths with root node: urn:rijksmuseum:people:RM0001.PEOPLE.89848
Starting searching for paths with root node: http://rdf.murberget.se/SE_MURBERGET_ENT_3933
Starting searching for paths with root node: http://rdf.murberget.se/SE_MURBERGET_ENT_159
Starting searching for paths with root node: urn:rijksmuseum:people:RM0001.PEOPLE.44566
Starting searching for paths with root node: http://d-nb.info/gnd/140983155
Starting searching for paths with root node: urn:rijksmuseum:people:RM0001.PEOPLE.47821

KeyboardInterrupt: 

In [None]:
# Calculate Tangledness
# tangledness = (number of nodes) / (number of nodes with more than one ingoing isA arc)

# Number of nodes in the graph: every distinct term in subject or object position.
# QUESTION: is this right? or do I have to choose all classes instead of all nodes in a graph?
query = """
SELECT (COUNT(DISTINCT ?x) AS ?count)
WHERE {
  { ?x ?p1 ?o } 
  UNION 
  { ?s ?p2 ?x }
}
"""

res = g.query(query)
for row in res:
    print("Number of nodes in graph: " + str(row['count']))
    num_of_nodes = int(row['count'])

# Number of nodes with more than one ingoing isA arc, i.e. nodes that are the
# rdf:type of more than one subject.
# The aggregate is repeated inside HAVING: in SPARQL 1.1 the SELECT alias
# (?numIngoing) is bound only after HAVING has been evaluated, so filtering on
# the alias rejects every group and the count would always be 0.
# (An earlier variant counted nodes having more than one rdf:type instead.)
query = """
SELECT (COUNT(?class) AS ?tangledCount)
WHERE
{
  SELECT ?class (COUNT(?s) AS ?numIngoing)
  WHERE {
    ?s rdf:type ?class .
  }
  GROUP BY ?class
  HAVING (COUNT(?s) > 1)
}
"""

res = g.query(query)

for row in res:
  print("Number of nodes with more than one ingoing isA arc: " + str(row["tangledCount"]) )
  t = int(row["tangledCount"])

print("t: " + str(t))

# Classes with more than one superclass, listed for debugging.
# NOTE(review): this listing follows rdfs:subClassOf while `t` above is derived
# from rdf:type, so the listed rows need not match `t` - confirm which
# relation is meant by "isA".
query = """
SELECT ?class (COUNT(?super) AS ?tangledCount)
WHERE {
  ?class rdfs:subClassOf ?super .
}
GROUP BY ?class
HAVING (COUNT(?super) > 1)

"""
res = g.query(query)

debug_print("\nNodes with more than one ingoing isA arc:")

if DEBUG:
  if t == 0:
    debug_print("0 nodes")
  else:
    for row in res:
        print("  Class: " + str(row["class"]) + " - TangledCount: " + str(row["tangledCount"]))

print("\nRESULT:")

# t == 0 would divide by zero; report the metric as infinite instead.
if t != 0:
  tangledness = num_of_nodes / t
  print("-Tangledness: " + str(tangledness))
else:
  print("-Tangledness is INF!")


In [None]:
# Degree Distribution (Formula in: source 37, page 7)
# nG ... number of nodes in the graph
# nE ... number of edges in the graph
# Result: sum of squared deviations of the node degrees from the mean degree
# (2 * nE / nG), divided by (nG - 1).

# nE: every triple counts as one edge
query = """
SELECT (COUNT(*) AS ?tripleCount)
WHERE {
  ?s ?p ?o .
}
"""

res = g.query(query)
for row in res:
    triple_count = row['tripleCount']
    print("Number of edges in graph: " + str(triple_count))
    nE = int(triple_count)

# nG: reuse the node count computed for the tangledness metric
nG = num_of_nodes
print("Number of nodes in graph: " + str(nG))

# Degree of every node: one count per triple in which the node is the subject
# (outgoing edge) or the object (incoming edge)
query = """
SELECT ?node (COUNT(?any) AS ?degree)
WHERE {
  {
    { ?node ?p1 ?any }     # Outgoing edges
    UNION
    { ?any ?p2 ?node }     # Incoming edges
  }
}
GROUP BY ?node
"""

degrees = []

res = g.query(query)
for row in res:
    node_degree = row['degree']
    debug_print("Node: " + str(row['node']) + " - Degree: " + str(node_degree))
    degrees.append(int(node_degree))

print("\nSum of Degrees: " + str(sum(degrees)))

if nG <= 1:
    # with fewer than two nodes the (nG - 1) denominator is not usable
    degree_distribution = 0
else:
    mean_degree = (2 * nE) / nG
    squared_diffs = [pow(deg_v - mean_degree, 2) for deg_v in degrees]
    degree_distribution = sum(squared_diffs) / (nG - 1)

print("\nRESULT:")
print("-Degree Distribution: " + str(degree_distribution))

In [None]:
# Number of instances per type, summed over all types.
# NOTE(review): a subject with several rdf:type statements contributes once
# per type to the total - confirm this is the intended instance count.
query = """ SELECT ?type (COUNT(?s) AS ?count)
WHERE {
  ?s rdf:type ?type . 
}
GROUP BY ?type """

res = g.query(query)

per_type_counts = []

for row in res:
    debug_print("Number of instancees of", str(row["type"]), ": " ,  str(row["count"]) )
    per_type_counts.append(int(row["count"]))

num_instances = sum(per_type_counts)

# Number of classes
# Definition of class:
# source: 213, page: 5 - source: 250, page: 3
# TNOC (total number of classes/concepts) = classes, subclasses, superclasses, anonymous classes
# anonymous classes = equivalent/restriction/unionOf/intersectionOf/complementOf/oneOf/hasValue classes

query = """ 
SELECT DISTINCT ?class
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}
"""
res = g.query(query)

debug_print("\nExisting classes: ")

# One result row per distinct class; the running index doubles as the count.
num_classes = 0
for num_classes, row in enumerate(res, start=1):
  debug_print("Class " + str(num_classes) + ": " + str(row["class"]))

print("\nRESULTS:")
print("-Number of instances:", num_instances)
print("-Number of classes:", num_classes)