In [None]:
import rdflib
from ipyradiant import (
    CustomURIRef,
    FileManager,
    MultiPanelSelect,
    PathLoader,
    PredicateMultiselectApp,
    collapse_predicates,
)
from ipyradiant.sparql.api import SPARQLQueryFramer, build_values
from rdflib import URIRef
import pandas

In [None]:
# Build the file manager with a PathLoader configured with path="data".
lw = FileManager(loader=PathLoader(path="data"))
# Here we hard-set the file for this example by assigning the picker's value from
# its own options; ideally a user would choose the file to work with instead.
lw.loader.file_picker.value = lw.loader.file_picker.options["tailsitter_uas.ttl"]
# Last expression of the cell, so the notebook displays the manager.
lw

In [None]:
# Materialize the namespace bindings of the loaded graph so the notebook shows them
# (presumably (prefix, namespace) pairs from an rdflib Graph — confirm against FileManager).
list(lw.graph.namespaces())

## Queries to facilitate RDF -> LPG

Method adopted from: https://github.com/Rothamsted/rdf2neo/blob/master/README.md

In [None]:
# Namespace used throughout this example; the query classes below pass it to
# their SPARQL queries through `initNs` (as the `mdk` or `ns` prefix).
MDK = rdflib.namespace.Namespace('https://www.example.org/CameoMDK/')

In [None]:
# Sample node IRI; it is bound to ?iri when the NodeProperties example query is run below.
test_node_uri = URIRef(
    "https://mms.openmbee.org/alfresco/service/projects/PROJECT-44c15c7a-7fe8-4ebb-8b7b-bbada2edf1d2/refs/master/elements/_18_5_3_9340270_1547479252665_378825_19213"
)

### URI-to-ID default converter (configurable)
https://github.com/Rothamsted/rdf2neo/blob/master/rdf2neo/src/main/java/uk/ac/rothamsted/rdf/neo4j/idconvert/DefaultIri2IdConverter.java

In [None]:
import re


def URItoID(uri: str) -> str:
    """Return the last non-empty segment of ``uri`` after a '/' or '#' separator.

    Trailing separators are ignored, so ``"http://example.org/ns#"`` yields
    ``"ns"`` rather than an empty identifier.

    Args:
        uri: The IRI to shorten. Any ``str`` is accepted; ``rdflib.term.URIRef``
            is a ``str`` subclass and works unchanged.

    Returns:
        The text following the last '/' or '#' once trailing separators are
        removed. A value containing no separator is returned as-is, and an
        empty or separator-only value yields ``""``.

    TODO validate on larger group of URIs to ensure robustness
    """
    # Drop trailing separators first; otherwise the final split element is "".
    trimmed = str(uri).rstrip("/#")
    return re.split(r"[/#]", trimmed)[-1]

> Future implementation note: we will want to store converted URIs in a dict and check the dict before calling `URItoID`

In [None]:
# TODO: decide whether predicates also need an IRI-to-short-ID conversion

### Literal to python type (mapping configurable)
Used to convert typed Literals to other types. 

In [None]:
class LiteralMap:
    """Placeholder for the configurable literal-type mapping (no behavior yet).

    TODO some way to allow map to be passed
    """


class LiteralTyping:
    """Holds a literal-type mapping; the conversion itself is not implemented yet."""

    def __init__(self, mapping) -> None:
        """Store ``mapping`` on the instance as ``map_``."""
        self.map_ = mapping

    def convert(self) -> None:
        """Stub: always returns ``None``.

        TODO implement the conversion
        """
        return None

### Node IRIs
SPARQL query that lists all the IRIs about RDF resources that represent a node.

Typically this query will be listing instances of target classes, although you might also catch resources of interest by targeting subjects or objects of given relations.

> it is very <b>important</b> that the query returns distinct results.

In [None]:
class NodeIRIs(SPARQLQueryFramer):
    """Select the distinct IRIs of the RDF resources that become LPG nodes.

    The query body can be adapted freely as long as the result consists of a
    single ``iri`` column holding unique values; ``run_query`` enforces both
    requirements on the returned dataframe.
    """

    sparql = """
    # An example of how query for the IRIs about RDF resources that you want to map to LPG as nodes.
    SELECT DISTINCT ?iri
    WHERE {
      # This picks up nodes of interests based on their rdf:type, which should be pretty common.
      ?iri a ?type_ ;
          mdk:appliedStereotypeInstanceId/mdk:classifierIds/mdk:name ?classifier_name .
          
      # Here we can restrict the types to a subset   
      VALUES (?type_) {
          (mdk:Port)
      }
      
      # We can also restrict additional vars
      VALUES (?classifier_name) {
        ("ConstraintParameter")
        ("DirectionalConstraintParameter")
      }
      
      # can also UNION multiple ways to return nodes
    }
    """
    initNs = {"mdk": MDK}

    @classmethod
    def run_query(
        cls,
        graph: rdflib.graph.Graph,
        initBindings: dict = None,
        **initBindingsKwarg,
    ) -> pandas.DataFrame:
        """Run the parent query and validate the shape of its result.

        Args:
            graph: Graph to query; forwarded to ``SPARQLQueryFramer.run_query``.
            initBindings: Optional bindings, forwarded unchanged.
            **initBindingsKwarg: Additional bindings, forwarded unchanged.

        Returns:
            The dataframe produced by the parent ``run_query``.

        Raises:
            AssertionError: If the result is not exactly one ``iri`` column, or
                if that column contains repeated values.
        """
        qres = super().run_query(graph, initBindings=initBindings, **initBindingsKwarg)
        # Validating with known requirements on query results.
        # Compare against a plain list: `qres.columns == ["iri"]` is an element-wise
        # pandas comparison that raises ValueError when the column count differs,
        # hiding the intended message. Raising explicitly (rather than `assert`)
        # keeps the checks active under `python -O` with the same exception type.
        if list(qres.columns) != ["iri"]:
            raise AssertionError("Query results dataframe must be exactly one 'iri' column.")
        if not qres["iri"].is_unique:
            raise AssertionError("Query must return unique IRIs.")
        return qres


NodeIRIs.run_query(lw.graph).head()

### Node Types (<i>label</i> in neo4j)
This query is invoked for each of the IRIs found by the node IRIs query and is parameterized over a single node IRI. 

The type query will be invoked once per node IRI, its purpose is to list all the LPG types that have to be assigned to the node.

A type can be an IRI, a literal, or a string. If it's an IRI, it will be translated into an LPG identifier by means of the configured IRI-to-ID converter.

In [None]:
class NodeTypes(SPARQLQueryFramer):
    """Select each node IRI together with its ``rdf:type`` (the LPG type/label).

    The graph pattern mirrors the one in ``NodeIRIs`` and additionally returns
    ``?type_``.

    TODO complete this description
    """

    sparql = """
    # Typically, this will be same query as the node IRIs, but returning the ?type_ variable in the results
    # Here the ?iri variable is bound to a particular node, using the results coming from the node IRI query.
    
    SELECT DISTINCT ?iri ?type_
    WHERE {
      # Copy of code from NodeIRI
      ?iri a ?type_ ;
          mdk:appliedStereotypeInstanceId/mdk:classifierIds/mdk:name ?classifier_name .
          
      # Here we can restrict the types to a subset   
      VALUES (?type_) {
          (mdk:Port)
      }
      
      # We can also restrict additional vars
      VALUES (?classifier_name) {
        ("ConstraintParameter")
        ("DirectionalConstraintParameter")
      }
    }
    """
    initNs = {"mdk": MDK}

    @classmethod
    def run_query(
        cls,
        graph: rdflib.graph.Graph,
        initBindings: dict = None,
        **initBindingsKwarg,
    ) -> pandas.DataFrame:
        """Delegate to the parent ``run_query``; this override is the hook for validation.

        All arguments are forwarded unchanged and the parent's dataframe is
        returned as-is.
        """
        # TODO validate the result against known requirements (uniques check?)
        return super().run_query(
            graph,
            initBindings=initBindings,
            **initBindingsKwarg,
        )


NodeTypes.run_query(lw.graph)

### Node Properties
One query per node, with the ?iri variable bound to the specific node URI. The query returns a list of all the pairs of predicate + value that you want to assign to the LPG node.

Every node always has an `iri` property. We need this to correctly process RDF-defined relations. It can also be useful for tracking the provenance URI of a node. This property is always indexed and has distinct values.

Every node always has a default type (`label` in neo4j). The predefined value for this can be changed by configuring a `defaultNodeLabel` (in future versions). Again, we need this in order to find nodes by their IRI. 

If values are literals, you should expect reasonable conversions (e.g., RDF numbers => Python numbers). TODO: we plan to add a configuration option to define custom literal converters.

> Note: name is typically converted to shorthand ID using the configured `URItoID`

In [None]:
class NodeProperties(SPARQLQueryFramer):
    """Example query returning the name/value pairs used as properties of an LPG node.

    Result variables:
     - ?iri: the node IRI; bind it to a specific node to get that node's properties.
     - ?predicate: an IRI, shortened to an ID by the configured IRI->ID converter.
     - ?value: a literal; for now it is converted to a string using its lexical
       value. More options soon (e.g., mapping XSD types to Cortex/python types).
    """

    sparql = """
    SELECT DISTINCT ?iri ?predicate ?value
    {
      ?iri ?predicate ?value.
      
      # could filter by things like lang
      # EXAMPLE: FILTER ( isNumeric (?value) || LANG ( ?value ) = 'en' ). 
      
      # Note: properties/attributes can potentially come from other nodes
      # EXAMPLE: ?iri mdk:ownerId/mdk:name ?value .  (note: have to bind something to ?predicate for LPG property name)

      VALUES ( ?predicate ) {
        ( mdk:visibility )
        ( mdk:aggregation )
      }
    }
    """
    initNs = {"mdk": MDK}

    @classmethod
    def run_query(
        cls,
        graph: rdflib.graph.Graph,
        initBindings: dict = None,
        **initBindingsKwarg,
    ) -> pandas.DataFrame:
        """Delegate to the parent ``run_query``; this override is the hook for validation.

        All arguments are forwarded unchanged and the parent's dataframe is
        returned as-is.
        """
        # TODO validate the result against known requirements (uniques check?)
        return super().run_query(
            graph,
            initBindings=initBindings,
            **initBindingsKwarg,
        )


NodeProperties.run_query(lw.graph, iri=test_node_uri)  # note the iri is bound

## Node Relationships
Similarly to nodes, rdf2lpg needs first a list of relations to be created. These must refer to their linking nodes by means of the node URIs (mapped earlier via the iri property).

As you can see, we need certain properties always reported after the SELECT keyword. Among these, we always need the relation URI, which has to be computed for straight (non reified) triples too.

Similarly to nodes, relation URIs (i.e., ?iri) are needed by rdf2lpg in order to check for their properties with the relation property query. Moreover, it is a good way to keep track of multiple statements about the same subject/predicate/property.

In [None]:
class RelationTypes(SPARQLQueryFramer):
    """Select plain (non-reified) triples as LPG relations, minting an IRI for each.

    Each result row carries the relation ``?iri`` (built from an MD5 of the
    predicate, source and target), the ``?predicate``, and the ``?source`` /
    ``?target`` endpoints; only triples whose object is an IRI are kept.

    TODO complete this description
    TODO is this overlapping with reified query? e.g. this one is returning triples from the reification pattern...
    """

    sparql = """
    # 
    # An example of how to define mapping from RDF relations, either plain relations or reified ones.
    #
    # - ?iri is the resource about the relation, and uniquely identifies a triple or a relation between two nodes.
    # - ?source, ?target are the resources representing the relation's endpoint nodes (must refer to LPG node IRIs)
    # - ?predicate is a tag (usually an IRI, but it can be a string???) representing the relation type.
    # 
    SELECT DISTINCT ?iri ?predicate ?source ?target
    WHERE {
        { 
          # Plain relations, non-reified
            ?source ?predicate ?target.
            FILTER ( isIRI ( ?target ) ).  # prevent bad things
            
            # could bind values to ?predicate
            # VALUES (?predicate) { (mdk:ownerId) }

            # Plain relations must get their fictitious IRI from constructs like this.
            # Using the triple components ensures there is a unique IRI identifing the triple.
            BIND ( 
                IRI ( 
                    CONCAT ( 
                      STR ( ns: ),
                      MD5 ( CONCAT ( STR ( ?predicate ), STR ( ?source ), STR ( ?target ) ) )
                    ) 
                )
              AS ?iri
            )
        }
    }
    """
    initNs = {"ns": MDK}

    @classmethod
    def run_query(
        cls,
        graph: rdflib.graph.Graph,
        initBindings: dict = None,
        **initBindingsKwarg,
    ) -> pandas.DataFrame:
        """Delegate to the parent ``run_query``; this override is the hook for validation.

        All arguments are forwarded unchanged and the parent's dataframe is
        returned as-is.
        """
        # TODO validate the result against known requirements
        return super().run_query(
            graph,
            initBindings=initBindings,
            **initBindingsKwarg,
        )


RelationTypes.run_query(lw.graph)

#### Separate query to handle reified relationships (if they exist)

In [None]:
class ReifiedRelations(SPARQLQueryFramer):
    """Select reified relations, i.e. ``rdf:Statement`` resources.

    Each result row carries the statement resource as ``?iri`` together with
    its ``rdf:subject`` (``?source``), ``rdf:predicate`` (``?predicate``) and
    ``rdf:object`` (``?target``).

    TODO complete this description
    """

    sparql = """
    SELECT DISTINCT ?iri ?predicate ?source ?target
    WHERE {
        ?iri a rdf:Statement;
            rdf:subject ?source;
            rdf:predicate ?predicate;
            rdf:object ?target.
    }
    """
    # NOTE(review): the query text only uses the `rdf:` prefix; the `ns` prefix
    # declared here is not referenced. Presumably `rdf:` is resolved by default
    # by the SPARQL engine — confirm.
    initNs = {"ns": MDK}

    @classmethod
    def run_query(
        cls,
        graph: rdflib.graph.Graph,
        initBindings: dict = None,
        **initBindingsKwarg,
    ) -> pandas.DataFrame:
        """Overwrite the super method in order to wrap with validation checks.

        All arguments are forwarded unchanged to the parent ``run_query`` and
        its dataframe is returned as-is (no validation is implemented yet).
        """
        qres = super().run_query(graph, initBindings=initBindings, **initBindingsKwarg)
        # Validating with known requirements on query results
        # TODO
        return qres


# The class is named `ReifiedRelations`; the original call used the undefined
# name `ReifiedRelation`, which raises NameError.
ReifiedRelations.run_query(lw.graph)