In [1]:
import pandas as pd
from py2neo import authenticate, Graph, Node, Relationship

# Log-in to Neo4j server.
# NOTE(review): the credentials are hardcoded; consider reading them from the
# environment (or a prompt) before this notebook is shared or committed.
authenticate('localhost:7474', 'neo4j', 'admin')

# Create lineage graph.
graph = Graph()

# We do not need to clear the graph anymore, now that we have uniqueness constraints.
# graph.delete_all()

# TODO: Use a combo of dataset name and hash of contents as unique constraint?
# Drop any existing uniqueness constraints before re-creating them below.
# get_uniqueness_constraints() returns a list of property keys, whereas
# drop_uniqueness_constraint() takes a single property key, so each key is
# dropped individually; with no existing constraints the loop does nothing.
for label in ('Dataset', 'Job'):
    for property_key in graph.schema.get_uniqueness_constraints(label):
        graph.schema.drop_uniqueness_constraint(label, property_key)

graph.schema.create_uniqueness_constraint('Dataset', 'name')
graph.schema.create_uniqueness_constraint('Job', 'uid')

# The two datasets used in this lineage example, merged into the graph in order.
dataset1 = Node('Dataset', name='test1.csv')
dataset2 = Node('Dataset', name='test2.csv')
for dataset_node in (dataset1, dataset2):
    graph.merge(dataset_node)

# The job is merged while it only carries its uid; the remaining properties
# are set locally afterwards and then pushed to the server.
job1 = Node('Job', uid='cp-3792')
graph.merge(job1)
for property_name, property_value in (('name', 'cp'), ('pid', 3792)):
    job1[property_name] = property_value
graph.push(job1)

# Edges are directed to a data lineage relationship.
# Read events: Dataset-[IS_READ_BY]->Job
# Write events: Job-[WRITES_TO]->Dataset
read_relationship = Relationship(dataset1, 'IS_READ_BY', job1, timestamp=1)
graph.create(read_relationship)
write_relationship = Relationship(job1, 'WRITES_TO', dataset2, timestamp=2)
graph.create(write_relationship)

# Cypher: take every node in random order and pair it with each of its
# outgoing relationships (if any), returning both endpoints and their ids.
query = """
    MATCH (n)
    WITH n, rand() AS random
    ORDER BY random
    OPTIONAL MATCH (n)-[r]->(m)
    RETURN n AS source_node,
           id(n) AS source_id,
           r,
           m AS target_node,
           id(m) AS target_id
"""

# Fetch the result rows, load them into a DataFrame, and print it.
result_rows = graph.data(query)
results_df = pd.DataFrame(result_rows)
print(results_df)

                  r  source_id                                    source_node  \
0  {'timestamp': 2}         35  {'name': 'cp', 'pid': 3792, 'uid': 'cp-3792'}   
1              None         34                          {'name': 'test2.csv'}   
2  {'timestamp': 1}         33                          {'name': 'test1.csv'}   

   target_id                                    target_node  
0       34.0                          {'name': 'test2.csv'}  
1        NaN                                           None  
2       35.0  {'name': 'cp', 'pid': 3792, 'uid': 'cp-3792'}  


In [2]:
from scripts.vis import draw

# Maps each node label to one of its property names.
# NOTE(review): presumably the property draw() uses as the node caption —
# confirm against scripts.vis.draw.
options = dict(Dataset="name", Job="uid")
draw(graph, options)