# Schema Transform Example
In this example we will:

1. ~~persist a source graph's metadata in Grakn~~
2. perform a motif query to 'transform' the graph, and 
3. document the updated schema (versioning)

Ideally, generated code snippets will apply the transforms to the source graph in Spark.

In [3]:
# ! pip install findspark

# !pip install graphframes
# https://towardsdatascience.com/graphframes-in-jupyter-a-practical-guide-9b3b346cebc5

# imports and libraries
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext, DataFrame
from pyspark.conf import SparkConf

# Spark runtime boilerplate

# Reuse an existing SparkContext if one is available; otherwise create one
# configured with the "local[*]" master.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))
sqlC = SQLContext(sc)
# NOTE(review): machine-specific absolute path to the GraphFrames jar — adjust
# it for your environment. Presumably the jar is added so that the
# `graphframes` Python package bundled inside it becomes importable; confirm.
sc.addPyFile("/Users/josephhaaga/.ivy2/jars/graphframes_graphframes-0.6.0-spark2.3-s_2.11.jar")

## Source Graph
First, we create a GraphFrame from the data generated by `GeneratePeopleAndCompanies.ipynb`.

In [5]:
# Edge list: the CSV header row supplies the column names and the reader is
# asked to infer the column types.
edges = (
    sqlC.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("./data/peopleAndCompanies_edges.csv")
)

# Vertex list, read with the same options.
vertices = (
    sqlC.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("./data/peopleAndCompanies_vertices.csv")
)
    

In [8]:
from graphframes import *
# https://stackoverflow.com/a/50404308
# Build the source graph from the vertex and edge DataFrames read from CSV.
g = GraphFrame(vertices, edges)

from pyspark.sql.functions import lit

In [11]:
# Motif: two edges, e (from a) and e2 (from c), that both point at vertex b.
motifs = g.find("(a)-[e]->(b); (c)-[e2]->(b)")

# Narrow the matches to person -> company <- person where both edges are
# 'employed_by' relationships.
results = (
    motifs
    .filter("a.type == 'person'")
    .filter("c.type == 'person'")
    .filter("b.type == 'company'")
    .filter("e.relationship == 'employed_by'")
    .filter("e2.relationship == 'employed_by'")
)

In [12]:
# Project each match down to its two endpoint ids: a.id first, c.id second.
src_dst = results.select("a.id","c.id")

In [13]:
# Tag every (a, c) pair with a constant 'works_with' relationship label.
newEdges = src_dst.withColumn("relationship", lit("works_with"))

In [14]:
# Append the derived edges to the graph's existing edge list.
# NOTE(review): presumably relies on g.edges having its columns in
# (src, dst, relationship) order so that the two frames line up — confirm.
totalE = g.edges.union(newEdges)

In [29]:
totalE.count()

707

In [None]:
# src_dst = results.select("a.id","c.id")
# newEdges = src_dst.withColumn("relationship", lit("works_with"))
# totalE = g.edges.union(newEdges)
# totalE.count()

## Generate Metamodel
Then we represent the GraphFrame graph as a Grakn metamodel. This allows for high-level reasoning about the concepts represented in the graph.

In [32]:
def describeMetamodel(g):
    """
    Generate Graql insert statements to create a metamodel representation of GraphFrame g.

    The generated text contains one `graphVertex` statement per distinct
    vertex `type`, one `graphEdge` statement per distinct edge `relationship`,
    and, for every distinct (source type, relationship, destination type)
    combination found in the graph, a `graphTriplet` statement followed by a
    `has-graphobjects` relationship tying the triplet to its vertices and edge.

    Parameters
    ----------
    g : GraphFrame
        Graph to describe. Its vertices must have `id` and `type` columns and
        its edges must have `src`, `dst` and `relationship` columns.

    Returns
    -------
    str
        All statements joined by single spaces: vertices, then edges, then
        triplets.
    """
    # Read both frames from g itself (not from notebook globals) so that the
    # function describes whatever graph it is handed.
    vertex_df = g.vertices
    edge_df = g.edges

    # Vertices
    vertex_types = [row[0] for row in vertex_df.select("type").distinct().collect()]
    create_vertices = [
        '${0} isa graphVertex has name "{0}";'.format(v) for v in vertex_types
    ]

    # Edges
    edge_types = [row[0] for row in edge_df.select("relationship").distinct().collect()]
    create_edges = [
        '${0} isa graphEdge has name "{0}";'.format(e) for e in edge_types
    ]

    # Triples: look up the type of each edge's source vertex, then the type of
    # its destination vertex, and keep the distinct type combinations.
    triple_rows = (
        edge_df.join(vertex_df, edge_df.src == vertex_df.id)
        .select(["src", "type", "dst", "relationship"])
        .withColumnRenamed("type", "src_type")
        .join(vertex_df, edge_df.dst == vertex_df.id)
        .select(["src", "src_type", "dst", "type", "relationship"])
        .withColumnRenamed("type", "dst_type")
        .select(["src_type", "relationship", "dst_type"])
        .distinct()
        .collect()
    )

    # {0} is the triplet's Graql variable, {1} its display name, {2}/{3} the
    # source/destination vertex variables and {4} the edge variable.
    triplet_template = """${0} isa graphTriplet has name "{1}";
        (src-vertex-owned: ${2}, dst-vertex-owned: ${3}, edge-owned: ${4}, object-owner: ${0}) isa has-graphobjects;
        """

    create_triplets = []
    for row in triple_rows:
        triple = row.asDict()
        parts = (triple['src_type'], triple['relationship'], triple['dst_type'])
        statement = triplet_template.format(
            "_".join(parts),
            " ".join(parts),
            triple['src_type'],
            triple['dst_type'],
            triple['relationship'],
        )
        # Flatten the multi-line template into a single line.
        create_triplets.append(statement.replace("\n", "").replace("\t", ' '))

    return " ".join([
        " ".join(create_vertices),
        " ".join(create_edges),
        " ".join(create_triplets),
    ])

In [99]:
# Render the whole metamodel of g as a single Graql string.
metamodelCreateStatements = describeMetamodel(g)
# Cut the string at its last ';', dropping that terminator and anything after it.
# NOTE(review): presumably match_or_insert supplies its own terminator — confirm.
metamodelCreateStatements = metamodelCreateStatements[:metamodelCreateStatements.rindex(";")]

In [105]:
#!pip install primal-grakn

### Insert Metamodel components into Grakn

In [100]:
# NOTE(review): no `import grakn` appears in the visible cells; presumably the
# module comes from the primal-grakn package and was imported in a cell that
# is not shown — confirm, otherwise this cell raises NameError.
with grakn.Graph(uri='localhost:48555', keyspace='grakn') as graph:
    # Send the generated metamodel statements to the 'grakn' keyspace, then commit.
    graph.match_or_insert(metamodelCreateStatements)
    graph.commit()

[{'a': {'id': 'V8328', 'type': 'has-graphobjects', 'base_type': 'relationship', 'src-vertex-owned': {'id': 'V40964280', 'type': 'graphVertex', 'base_type': 'entity', 'name': {'value': 'person'}}, 'dst-vertex-owned': {'id': 'V40964280', 'type': 'graphVertex', 'base_type': 'entity', 'name': {'value': 'person'}}, 'object-owner': {'id': 'V8320', 'type': 'graphTriplet', 'base_type': 'entity', 'name': {'value': 'person claims_dependent person'}}, 'edge-owned': {'id': 'V4160', 'type': 'graphEdge', 'base_type': 'entity', 'name': {'value': 'claims_dependent'}}}}, {'a': {'id': 'V12416', 'type': 'has-graphobjects', 'base_type': 'relationship', 'src-vertex-owned': {'id': 'V40964280', 'type': 'graphVertex', 'base_type': 'entity', 'name': {'value': 'person'}}, 'dst-vertex-owned': {'id': 'V8312', 'type': 'graphVertex', 'base_type': 'entity', 'name': {'value': 'company'}}, 'object-owner': {'id': 'V40964104', 'type': 'graphTriplet', 'base_type': 'entity', 'name': {'value': 'person employed_by company'}

## Query metamodel for possible parameters
Now we can provide the user with a OneHop operation, using the parameter options described by the Grakn metamodel.


In [108]:
with grakn.Graph(uri='localhost:48555', keyspace='grakn') as graph:
    # Fetch every has-graphobjects relationship stored in the keyspace.
    concept_map = graph.execute('match $a isa has-graphobjects; get;')    
    graph.commit()

In [106]:
len(concept_map)

3

In [112]:
concept_map[1]['a']

{'base_type': 'relationship',
 'dst-vertex-owned': {'base_type': 'entity',
  'id': 'V8312',
  'name': {'value': 'company'},
  'type': 'graphVertex'},
 'edge-owned': {'base_type': 'entity',
  'id': 'V4264',
  'name': {'value': 'employed_by'},
  'type': 'graphEdge'},
 'id': 'V12416',
 'object-owner': {'base_type': 'entity',
  'id': 'V40964104',
  'name': {'value': 'person employed_by company'},
  'type': 'graphTriplet'},
 'src-vertex-owned': {'base_type': 'entity',
  'id': 'V40964280',
  'name': {'value': 'person'},
  'type': 'graphVertex'},
 'type': 'has-graphobjects'}

In [113]:
# Collect the 'src-vertex-owned' entry of every has-graphobjects result.
sources = [a['a']['src-vertex-owned'] for a in concept_map]
sources

[{'base_type': 'entity',
  'id': 'V40964280',
  'name': {'value': 'person'},
  'type': 'graphVertex'},
 {'base_type': 'entity',
  'id': 'V40964280',
  'name': {'value': 'person'},
  'type': 'graphVertex'},
 {'base_type': 'entity',
  'id': 'V8312',
  'name': {'value': 'company'},
  'type': 'graphVertex'}]

# OneHop example
## Define OneHop operation
Create a function that performs the relevant motif query and filters the results according to user-specified parameters. It then returns a new graph in which the matched vertex pairs are connected directly by 'new_edge' relationships.



In [126]:
def OneHop(g, params):
    """
    Create a 'new_edge' relationship between A and C due to shared relationship with vertex B.

    g is a GraphFrame object

    params is a dictionary where each parameter in the motif query is a key,
    and its value can be a specific value or None/omitted (wildcard).
    'a', 'b' and 'c' constrain the `type` of the matching vertex; 'e' and 'e2'
    constrain the `relationship` of the matching edge. The 'new_edge' key is
    required: it is the relationship label given to the created edges.

    e.g.

    {
        "a": "person",
        "c": "person",
        "e": "employed_by",
        # note how e2 is omitted
        "new_edge": "works_with"
    }

    Prints how many edges were created and returns a new GraphFrame made of
    g's vertices and g's edges plus the new edges.

    Raises KeyError if 'new_edge' is missing from params.
    """
    # Look the label up first so a missing key fails before any Spark work.
    new_label = params['new_edge']

    results = g.find("(a)-[e]->(b); (c)-[e2]->(b)")

    # (motif element, column that the user-supplied value must equal)
    filterable = (
        ('a', 'type'),
        ('b', 'type'),
        ('c', 'type'),
        ('e', 'relationship'),
        ('e2', 'relationship'),
    )
    for element, column in filterable:
        wanted = params.get(element)
        if wanted is not None:
            # NOTE(review): the value is interpolated into the filter
            # expression unescaped; a value containing a single quote will
            # break the expression.
            results = results.filter(
                "{0}.{1} == '{2}'".format(element, column, wanted)
            )

    # NOTE(review): the motif itself does not appear to require a and c to be
    # different vertices, so a == c pairs may be included — confirm whether
    # that is intended.
    newEdges = results.selectExpr("a.id AS src", "c.id AS dst") \
        .withColumn("relationship", lit(new_label))
    # NOTE(review): presumably relies on g.edges having its columns in
    # (src, dst, relationship) order so that the two frames line up — confirm.
    totalE = g.edges.union(newEdges)
    print(str(newEdges.count())+" '"+new_label+"' edges added.")
    return GraphFrame(g.vertices, totalE)

## Create *works_with* relationship
Use our newly-defined OneHop operation to create relationships between coworkers.

In [129]:
# Connect every pair of 'person' vertices that are 'employed_by' the same
# 'company' with a 'works_with' edge, and rebind g to the resulting graph.
g = OneHop(
    g,
    {
        "a": "person",
        "b": "company",
        "c": "person",
        "e": "employed_by",
        "e2": "employed_by",
        "new_edge": "works_with",
    },
)

556 'works_with' edges added.


N.B. Once the frontend UI is connected, we can offer a `<select>` list of dynamically updated options ("guard rails"). That way, users will not be able to query for nonexistent triplets such as `(Company)-[claims_dependent]->(Company)`.

In [131]:
g.edges.count()

707

## Sanity check

Check how many employed_by relationships there are, and calculate how many works_with edges should have been added.