In [None]:
from rdflib import Graph
from rdflib import RDF
from rdflib import URIRef
from rdflib import Literal
import networkx as nx
import matplotlib.pyplot as plt
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from queries import QUERY_AI, QUERY_FILM, QUERY_PERSON
from paths_to_examples import REPO_BOOKS, REPO_PEOPLE, WIKIDATA, GPT_EX, GPT_SUBCLASSES

# Notebook-wide switches read by the helper functions and cells below.
DEBUG = True  # when True, debug_print() and print_set() produce output
PRINTGRAPH = False # when True, show_graph() draws the NetworkX graph with matplotlib
LOCAL = True # True: parse the local ttl file; False: load the data from the SPARQL endpoint

def debug_print(*args, **kwargs):
    """Behave like the built-in print(), but only while DEBUG is enabled.

    All positional and keyword arguments are handed to print() unchanged.
    """
    if not DEBUG:
        return
    print(*args, **kwargs)


# Debug helper: dump a collection, one element per line
def print_set(set):
    """Print every element of ``set`` on its own line while DEBUG is enabled.

    The parameter name shadows the built-in ``set`` type inside this function;
    it is left as is so that existing calls stay valid.
    """
    if not DEBUG:
        return
    for element in set:
        print(element)

def show_graph(G):
    """Draw the graph ``G`` with matplotlib while PRINTGRAPH is enabled.

    Nodes are positioned with a spring layout, every edge is annotated with
    its 'label' attribute, and a completion message is printed after the
    figure has been shown. Does nothing when PRINTGRAPH is falsy.
    """
    if not PRINTGRAPH:
        return

    # node coordinates for the drawing
    layout = nx.spring_layout(G, k=0.5, iterations=50)

    # nodes and edges
    plt.figure(figsize=(12, 8))
    draw_options = {
        "with_labels": True,
        "node_color": "lightblue",
        "node_size": 1500,
        "font_size": 10,
        "font_weight": "bold",
        "arrows": True,
    }
    nx.draw(G, layout, **draw_options)

    # predicate labels on the edges
    labels_by_edge = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, layout, edge_labels=labels_by_edge, font_color='red')

    plt.title("RDF Graph")
    plt.axis("off")
    plt.show()

    print("FINISHED PRINTING GRAPH\n\n")

# Calculating all paths from one root node (start node)
def find_all_paths(G, start_node):
    """Enumerate every cycle-free path from ``start_node`` to a node without neighbors.

    The graph is walked depth-first with an explicit stack instead of
    recursion, so long chains cannot exceed Python's recursion limit. A
    neighbor that is already on the current path is skipped (cycle avoidance).
    A path is recorded only when it ends in a node for which
    ``G.neighbors(node)`` yields nothing.

    Args:
        G: graph object whose ``neighbors(node)`` method returns an iterable
            of the nodes directly reachable from ``node``. Nodes must be
            hashable.
        start_node: node at which every returned path begins.

    Returns:
        A list of paths in depth-first discovery order; each path is a list
        of nodes starting with ``start_node``.

    Side effects:
        For every recorded path the module-level counters ``num_paths``
        (+1), ``abs_depth`` (+ number of edges of the path) and ``max_depth``
        (largest number of edges seen) are updated. Missing counters are
        created with the value 0. Existing values are NOT reset, so a caller
        that wants fresh statistics has to zero them before the first call.
    """
    # Make sure the counters exist, so the function also works before the
    # cell that initialises them has been executed.
    for counter_name in ("num_paths", "abs_depth", "max_depth"):
        globals().setdefault(counter_name, 0)

    paths = []       # all recorded paths
    path = []        # nodes on the path currently being explored
    on_path = set()  # same nodes as `path`, for O(1) membership tests
    pending = []     # one neighbor iterator per node on `path`

    def enter(node):
        """Append ``node`` to the current path and record the path if it ends here."""
        global num_paths, abs_depth, max_depth

        path.append(node)
        on_path.add(node)

        neighbors = list(G.neighbors(node))
        if not neighbors:
            # cardinality of a path = number of EDGES in the path (=> -1)
            depth = len(path) - 1
            paths.append(list(path))
            num_paths += 1
            abs_depth += depth
            max_depth = max(max_depth, depth)

        pending.append(iter(neighbors))

    enter(start_node)
    while pending:
        # Continue with the next unvisited neighbor of the deepest node ...
        for neighbor in pending[-1]:
            if neighbor not in on_path:
                enter(neighbor)
                break
        else:
            # ... or backtrack once all of its neighbors have been handled.
            pending.pop()
            on_path.discard(path.pop())

    return paths

In [None]:
# Load the RDF data into `g`, either from a local ttl file or from a SPARQL endpoint.
# path to ttl file (can also be an online graph source)
ttl_file_path = REPO_BOOKS
g = Graph()

print("STARTING PARSING")

if LOCAL:
    # A parse failure is reported but not re-raised, so the following cells
    # continue with whatever has been loaded into `g` so far.
    try:
        g.parse(ttl_file_path, format="turtle")
    except Exception as e:
        print(f"ERROR WHILE PARSING GRAPH: {e}")

else:

    endpoint_url = "https://dbpedia.org/sparql"
    store = SPARQLStore()
    store.open(endpoint_url)

    g_remote = Graph(store=store)

    query = QUERY_AI

    # NOTE(review): `.graph` is presumably only populated for CONSTRUCT/DESCRIBE
    # queries - confirm that QUERY_AI is one of those.
    g += g_remote.query(query).graph


print("Number of triples: " + str(len(g)))
print("FINISHED PARSING\n")

#for s, p, o in g:
 #   print(s, p, o)

print("STARTING GETTING ROOT NODES AND LITERALS")

# every term that occurs as subject / as object of at least one triple
all_nodes = set(g.subjects())
object_nodes = set(g.objects())

# string form of every literal in the graph (literals can only appear as objects)
literals = {str(term) for term in object_nodes if isinstance(term, Literal)}

# possible root nodes: subjects that are never the object of a triple
root_nodes = all_nodes.difference(object_nodes)

debug_print("Root Nodes:")
print_set(root_nodes)

print("FINISHED GETTING ROOT NODES AND LITERALS\n")

print("STARTING CREATING GRAPH G")

# Directed NetworkX graph built from the RDF triples (drawn by show_graph and
# searched by find_all_paths)
G = nx.DiGraph()

debug_print("Printing all triples:")
# one edge per triple: subject -> object, with the predicate stored as edge label
for triple in g:
    subj, pred, obj = triple
    source, target = str(subj), str(obj)
    G.add_edge(source, target, label=str(pred))
    debug_print("(" , subj, ",", pred, ",", obj, ")")

print("FINISHED CREATING GRAPH G\n")

show_graph(G)

# Counters that find_all_paths() accumulates into; reset them so that
# re-running this cell starts from zero.
num_paths = 0
abs_depth = 0
max_depth = 0

print("STARTING CALCULATING PATHS")

# maps every root node (as string) to the list of paths found from it
all_paths = {}

# calculating all paths from root nodes
for root in root_nodes:
    str_root = str(root)
    debug_print("Starting searching for paths with root node: " + str_root)
    all_paths[str_root] = find_all_paths(G, str_root)

print("FINISHED CALCULATING PATHS\n")

# Output found paths
print("FOUND PATHS:")
for root_node, paths in all_paths.items():
    for path in paths:
        print(path)
        print("Path length = " + str(len(path)-1))


print("RESULTS:")

print("-Number of Paths: " + str(num_paths))
print("-Absolute depth: " + str(abs_depth))

# No path is recorded when there is no root node (e.g. empty graph or every
# subject also occurs as an object); report 0.0 instead of dividing by zero.
avg_depth = abs_depth / num_paths if num_paths else 0.0

print("-Average depth: " + str(avg_depth))
print("-Maximal depth: " + str(max_depth))

In [None]:
# Tangledness (source 73, page 4)
# Defined there as the mean number of classes with more than one direct
# ancestor. Two primitive measurements feed the metric: the number of classes
# and the number of classes having several direct ancestors.

# First measurement: number of distinct classes in the graph
query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT (COUNT(DISTINCT ?class) AS ?num_classes)
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}"""

res = g.query(query)
# report the aggregate and keep it as an int for the tangledness formula
for row in res:
    class_total = row['num_classes']
    print("Number of classes in graph: " + str(class_total))
    num_classes = int(class_total)

# Alternative measurement (defined but not executed): number of classes with
# more than one ingoing is-a (rdf:type) arc
query_var1 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(?class) AS ?tangledCount)
WHERE
{
  SELECT ?class (COUNT(?s) AS ?numIngoing)
  WHERE {
    { ?s rdf:type ?class . }
  }
  GROUP BY ?class
  HAVING (COUNT(?s) > 1) # problem here was: i used ?numIngoing instead of COUNT(?s)
}
"""

# Measurement that is actually used: number of classes with more than one
# superclass (the query the source asks for)
query_var2 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (COUNT(?class) AS ?tangledCount)
WHERE {
  SELECT ?class (COUNT(?super) AS ?numSupers)
  WHERE {
    ?class rdfs:subClassOf ?super .
  }
  GROUP BY ?class
  HAVING (COUNT(?super) > 1)
}
"""
res = g.query(query_var2)

# t = number of tangled classes; stays 0 if the query yields no row
t = 0
for row in res:
    print("Number of classes with more than one superclass: " + str(row["tangledCount"]))
    t = int(row["tangledCount"])

print("t: " + str(t))

debug_print("\nClasses with more than one superclass: ")

if DEBUG:
    if t == 0:
        debug_print("0 nodes")
    else:
        # List exactly the classes that were counted in t, i.e. the same
        # rdfs:subClassOf criterion as in query_var2.
        query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?class (COUNT(?super) AS ?numSupers)
        WHERE {
          ?class rdfs:subClassOf ?super .
        }
        GROUP BY ?class
        HAVING (COUNT(?super) > 1)
        """

        res = g.query(query)

        for row in res:
            debug_print("Class: " + str(row["class"]) + " - TangledCount: " + str(row["numSupers"]))

print("\nRESULT:")

# source 37 says num_classes / t
# source 73 says denominator and numerator should be switched -> t / num_classes
# NOTE(review): the formula of source 37 is kept here; confirm which definition
# is intended. The guard has to test the divisor t, otherwise a graph without
# tangled classes raises ZeroDivisionError instead of reporting INF.
if t > 0:
    tangledness = num_classes / t
    print("-Tangledness: " + str(tangledness))
else:
    # TODO
    print("-Tangledness is INF")


Number of nodes in graph: 2
Number of classes with more than one superclass: 0
t: 0

Classes with more than one superclass: 
0 nodes

RESULT:
-Tangledness: 0.0


In [None]:
# Degree Distribution (Formula in: source 37, page 7)
# nG ... number of nodes in graph
# nE ... number of edges in graph

# nE: every triple counts as one edge
query = """
SELECT (COUNT(*) AS ?tripleCount)
WHERE {
  ?s ?p ?o .
}
"""

res = g.query(query)
for row in res:
    triple_count = row['tripleCount']
    print("Number of edges in graph: " + str(triple_count))
    nE = int(triple_count)

# nG: distinct terms that occur as subject or as object of a triple
query = """
SELECT (COUNT(DISTINCT ?node) AS ?nodeCount)
WHERE {
  {
    SELECT ?node WHERE {
      { ?node ?p1 ?o }       
      UNION
      { ?s ?p2 ?node }       
    }
  }
}
"""

res = g.query(query)
for row in res:
    node_count = row['nodeCount']
    print("Number of nodes in graph: " + str(node_count))
    nG = int(node_count)

# Degree of every node: number of triples in which the node is the subject
# plus number of triples in which it is the object
query = """
SELECT ?node (COUNT(?any) AS ?degree)
WHERE {
  {
    { ?node ?p1 ?any }     # Outgoing edges
    UNION
    { ?any ?p2 ?node }     # Incoming edges
  }
}
GROUP BY ?node
"""

# one entry per node returned by the query
degrees = []

res = g.query(query)
for row in res:
    debug_print("Node: " + str(row['node']) + " - Degree: " + str(row['degree']))
    degrees.append(int(row['degree']))

print("\nSum of Degrees: " + str(sum(degrees)))

# Sum of squared deviations from the mean degree, divided by (nG - 1);
# defined as 0 when there are fewer than two nodes.
if nG <= 1:
    degree_distribution = 0
else:
    mean_degree = (2 * nE) / nG
    squared_diffs = [(deg_v - mean_degree) ** 2 for deg_v in degrees]
    degree_distribution = sum(squared_diffs) / (nG - 1)

print("\nRESULT:")
print("-Degree Distribution: " + str(degree_distribution))

In [None]:
# Number of instances per type
# The PREFIX declarations are spelled out (as in the tangledness cell) so that
# the queries do not depend on which prefixes happen to be bound on the graph.
query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?type (COUNT(?s) AS ?count)
WHERE {
  ?s rdf:type ?type .
}
GROUP BY ?type """

res = g.query(query)

# Sum of the per-type counts: a resource with several rdf:type statements
# contributes once per type.
num_instances = 0

for row in res:
    debug_print("Number of instancees of", str(row["type"]), ": " ,  str(row["count"]) )
    num_instances += int(row["count"])

# Number of classes
# Definition of class:
# source: 213, page: 5 - source: 250, page: 3
# TNOC (total number of classes/concepts) = classes, subclasses, superclasses, anonymous classes
# anonymous classes = equivalent/restriction/unionOf/intersectionOf/complementOf/oneOf/hasValue classes

query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT DISTINCT ?class
WHERE {
  {
    # 1. explicitly/implicitly used RDF classes
    # explicitly: ?class a owl:Class . or ?class a rdfs:Class .
    # implicitly: ?any rdf:type ?class . (includes also explicitly used classes)

    ?any rdf:type ?class .
  }
  UNION
  {
    # 2. subclasses
    ?class rdfs:subClassOf ?any .
  }
  UNION
  {
    # 3. superclasses
    ?any rdfs:subClassOf ?class .
  }
  UNION
  {
    # 4. classes used with owl:equivalentClass
    { ?class owl:equivalentClass ?any . }
    UNION
    { ?any owl:equivalentClass ?class . }
  }
  UNION
  {
    # 5. OWL restriction classes
    ?class a owl:Restriction .
  }
  UNION
  {
    # 6. complex classes with using unionOf, intersectionOf etc.
    ?class owl:unionOf|owl:intersectionOf|owl:complementOf|owl:oneOf ?list .
  }
  UNION
  {
    # 7. OWL hasValue restrictions
    ?class owl:hasValue ?val .
  }
}
"""
res = g.query(query)

# one result row per distinct class
num_classes = 0

debug_print("\nExisting classes: ")

for row in res:
    debug_print("Class " + str(num_classes+1) + ": " + str(row["class"]))
    num_classes += 1

print("\nRESULTS:")
print("-Number of instances:", num_instances)
print("-Number of classes:", num_classes)