# Events of interest from raw instrumentation logs

In [None]:
import pandas as pd

# Read the raw lineage edge list into a DataFrame and preview its first rows
# (the trailing expression is what the notebook renders as the cell output).
df = pd.read_csv(filepath_or_buffer='lineage_edges_1000.csv')
df.head(n=5)

# Build a Neo4j lineage graph from instrumentation logs

In [None]:
import lineage_graph_cosmos as lg

# Build the lineage graph from the edge-list CSV 'lineage_edges_1000.csv'.
# NOTE(review): `graph` is presumably a py2neo-style Graph handle, since later
# cells call `graph.data(<cypher>)` on it -- confirm against lineage_graph_cosmos.
graph = lg.build_lineage_graph('lineage_edges_1000.csv')

# Custom visualization of lineage graph using Vis.js

In [None]:
from scripts.vis import draw

# One entry per node label; the value names a node property.
# NOTE(review): presumably the property `draw` uses as the node caption --
# confirm against scripts.vis.
options = dict(Dataset="name", Job="uid")
draw(graph, options)

# Query Neo4j temporal lineage graph using Cypher

## Query results as a Pandas DataFrame

In [None]:
# Sanity check: list every relationship in the graph as a
# (source node id, relationship id, target node id) triple.
query = """
    MATCH (n)-[r]->(m)
    RETURN id(n) AS source_id,
           id(r) AS rel_id,
           id(m) AS target_id
"""

# Fetch the rows first, then tabulate them, so each step can be inspected.
records = graph.data(query)
results_df = pd.DataFrame(records)
print(results_df)

## Copy detection as a reachability query between two content-similar datasets

In [None]:
# Endpoints of the reachability test. `a` and `b` are bound by dataset name so
# that the direction reported below does not depend on the order in which the
# database happens to return the two nodes.
name_a = '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv'
name_b = '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv'

# Look up the internal node ids of both datasets (the name is returned too, so
# each id can be matched to its dataset).
query = """
    MATCH (n:Dataset)
    WHERE (n.name = '%s'
    OR n.name = '%s')
    RETURN n.name as name, id(n) as node_id
""" % (name_a, name_b)
node_ids = graph.data(query)

id_by_name = {row['name']: row['node_id'] for row in node_ids}
missing = [name for name in (name_a, name_b) if name not in id_by_name]
if missing:
    # Fail with an explicit message instead of an IndexError further down.
    raise ValueError('Dataset node(s) not found in graph: %s' % ', '.join(missing))
a_id = id_by_name[name_a]
b_id = id_by_name[name_b]

# Plain reachability: any directed path of one or more relationships between
# the two datasets, with no constraint on edge timestamps. The same template is
# used for both directions; only the start and end of the pattern are swapped.
path_query = """
    START a=node(%d), b=node(%d)
    MATCH p=(%s)-[r*]->(%s)
    RETURN p, b.name
"""

for src, dst in (('a', 'b'), ('b', 'a')):
    query = path_query % (a_id, b_id, src, dst)
    paths = graph.data(query)
    print('\nFound %d path(s) from %s to %s:\n' % (len(paths), src, dst))
    print(paths)

## Copy detection as a reachability query between two content-similar datasets (monotonically increasing edge timestamps)

In [None]:
# Endpoints of the reachability test. `a` and `b` are bound by dataset name so
# that the direction reported below does not depend on the order in which the
# database happens to return the two nodes.
name_a = '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv'
name_b = '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv'

# Look up the internal node ids of both datasets (the name is returned too, so
# each id can be matched to its dataset).
query = """
    MATCH (n:Dataset)
    WHERE (n.name = '%s'
    OR n.name = '%s')
    RETURN n.name as name, id(n) as node_id
""" % (name_a, name_b)
node_ids = graph.data(query)

id_by_name = {row['name']: row['node_id'] for row in node_ids}
missing = [name for name in (name_a, name_b) if name not in id_by_name]
if missing:
    # Fail with an explicit message instead of an IndexError further down.
    raise ValueError('Dataset node(s) not found in graph: %s' % ', '.join(missing))
a_id = id_by_name[name_a]
b_id = id_by_name[name_b]

# Temporal reachability: keep only paths whose edge timestamps never decrease
# from one hop to the next. Each relationship is compared with its successor;
# comparing every edge only against the first one would also accept paths such
# as 1 -> 5 -> 3. For a single-hop path the index range is empty, so the
# predicate holds trivially.
# NOTE(review): assumes `timestamp` values compare chronologically (numeric or
# consistently formatted strings) -- confirm against the graph loader.
path_query = """
    START a=node(%d), b=node(%d)
    MATCH p=(%s)-[r*]->(%s)
    WITH p, b, relationships(p) as rels
    WHERE all(i in range(0, length(p) - 2)
              where rels[i].timestamp <= rels[i + 1].timestamp)
    RETURN p, b.name
"""

for src, dst in (('a', 'b'), ('b', 'a')):
    query = path_query % (a_id, b_id, src, dst)
    paths = graph.data(query)
    print('\nFound %d path(s) from %s to %s with monotonically increasing timestamps:\n'
          % (len(paths), src, dst))
    print(paths)

# PageRank

In [None]:
%load_ext cypher
import networkx as nx
import operator
%matplotlib inline

results = %cypher MATCH p = (a)-[r*]->(b) RETURN p

# Networkx graph.
g = results.get_graph()

#nx.draw(g)
#g.nodes(data=True)

# Print nodes so that we can see their original ids and properties.
print("\nOriginal Nodes:")
print(g.nodes(data=True))

# Node weights for personalized pagerank.
personalize = {}
for node, data in g.nodes(data=True):
    if 'Dataset' in data['labels']:
        personalize[node] = 1
    elif 'Job' in data['labels']:
        # Let's weight each edge by the amount of CPU consumed.
        personalize[node] = data['cpu']
    print("node=%s, data=%s" % (node, data))

print("\nPersonalize Vector:")
print(personalize)

# Transformation from MultiDigraph to Graph for Pagerank calculation.
H = nx.Graph()
for src, dst, edge in g.edges(data=True):
    # Let's weight each edge by the amount of bytes read / written.
    w = edge['data_bytes']
    if H.has_edge(src, dst):
        H[src][dst]['weight'] += w
    else:
        H.add_edge(src, dst, weight=w)

#print("\nPageRank:")
#print(nx.pagerank(H))

print("\nPersonalized PageRank:")
ranks = nx.pagerank(H, personalization=personalize)
sorted_ranks = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_ranks)

# Data Lineage Temporal Graph (Cosmos)

In [2]:
import os  # needed for os.getcwd() below; it was missing from this cell
import time
from py2neo import Graph

# Create new graph from scratch, deleting all previous graphs from the DB.
# WARNING: delete_all() is destructive for whatever database Graph() connects to.
graph = Graph()
graph.delete_all()

# Load CSV into a neo4j graph.
# The CSV is expected in the current working directory and is addressed with a
# file:// URL because LOAD CSV reads it on the database side.
pwd = os.getcwd()
num_rows = 20000  # Edit this to pick the correct file.
filename = 'file:///%s/lineage_edges_%s.csv' % (pwd, num_rows)

# For every CSV row: MERGE the Job and Dataset nodes (so each is created only
# once), then CREATE one edge whose direction depends on the row's Action.
# The FOREACH over a one-element or empty list is a conditional CREATE:
#   Action = 'read'  -> (dataset)-[:IS_READ_BY]->(job)
#   Action = 'write' -> (job)-[:WRITES_TO]->(dataset)
# StartTime is stored on the edge as the `time` property.
query = """
USING PERIODIC COMMIT 
LOAD CSV WITH HEADERS FROM '%s'
  AS line
  FIELDTERMINATOR ','
MERGE (job:Job { id: line.Job, name: line.Job, uid: line.Job })
MERGE (dataset:Dataset { id: line.Dataset, name: line.Dataset })
FOREACH(t in CASE WHEN line.Action='read' THEN [1] ELSE [] END |
    CREATE (dataset)-[:IS_READ_BY {time: line.StartTime}]->(job))
FOREACH(t in CASE WHEN line.Action='write' THEN [1] ELSE [] END |
    CREATE (job)-[:WRITES_TO {time: line.StartTime}]->(dataset))
""" % filename

# Wall-clock time of the load, in whole milliseconds.
t0 = int(round(time.time() * 1000))
graph.data(query)
t1 = int(round(time.time() * 1000))
print('Time to load %s rows into a neo4j graph: %s ms' % (num_rows, (t1 - t0)))

Time to load 20000 rows into a neo4j graph: 17109 ms


# Visualize graph

In [3]:
from scripts.vis_cosmos import draw

# Open a Graph handle with default connection settings (`Graph` is expected to
# have been imported from py2neo by an earlier cell).
graph = Graph()
# NOTE(review): `filter_query` is not defined in any visible cell, and `res` is
# never used afterwards -- confirm which query was intended, or whether this
# line is a leftover.
res = graph.data(filter_query)
# One entry per node label; the value names a node property.
# NOTE(review): presumably the property `draw` uses as the node caption --
# confirm against scripts.vis_cosmos.
options = {"Dataset": "name", "Job": "uid"}
draw(graph, options)

# Helios Queries

In [None]:
import os  # needed for os.getcwd() below; it was missing from this cell
import time
from py2neo import Graph

# Create new graph from scratch, deleting all previous graphs from the DB.
# WARNING: delete_all() is destructive for whatever database Graph() connects to.
graph = Graph()
graph.delete_all()

# Load Helios queries CSV into a neo4j graph.
# The CSV is expected in the current working directory and is addressed with a
# file:// URL because LOAD CSV reads it on the database side.
pwd = os.getcwd()
filename = 'file:///%s/helios_queries.csv' % pwd

# For every CSV row: MERGE one Query node and one Table node, and CREATE a
# REFERENCES edge between them when the row names a referenced table (the
# FOREACH over a one-element or empty list acts as a conditional CREATE).
query = """
USING PERIODIC COMMIT 
LOAD CSV WITH HEADERS FROM '%s'
  AS line
  FIELDTERMINATOR ','
MERGE (query:Query { id: line.scriptindexqueryindex, name: line.queryname })
MERGE (table:Table { id: line.referencedtables, name: line.referencedtables})
FOREACH(t in CASE WHEN line.referencedtables<>'' THEN [1] ELSE [] END |
    CREATE (query)-[:REFERENCES]->(table))
""" % filename

# NOTE(review): this cell only builds `query`; nothing here executes it, so the
# database stays empty after delete_all() until `graph.data(query)` is run.

# PageRank and other graph algorithms using NetworkX

In [None]:
import pandas as pd
import time

num_hops = 3
query = """
    MATCH p=(n)-[r:SENT_EMAIL_TO*%s]->(m)
    RETURN p
""" % num_hops

t0 = int(round(time.time() * 1000))
motifs = graph.data(query)
t1 = int(round(time.time() * 1000))

print('Time to find %s-hop motifs: %s ms' % (num_hops, (t1 - t0)))
#print('%s-hop motifs: %s' % (motifs, num_hops))

%load_ext cypher
import networkx as nx
import operator
%matplotlib inline

# Networkx graph.
t0 = int(round(time.time() * 1000))
results = %cypher MATCH p = (a)-[r]->(b) RETURN p
g = results.get_graph()
#nx.draw(g)
g.nodes(data=True)
t1 = int(round(time.time() * 1000))
print('Time to load it into a networkx graph: %s ms' % (t1 - t0))

t0 = int(round(time.time() * 1000))
#print('In-degree: %s' % g.in_degree())
t1 = int(round(time.time() * 1000))
print('Time to compute in-degree: %s ms' % (t1 - t0))

t0 = int(round(time.time() * 1000))       
centrality = nx.betweenness_centrality(g)
#print('Centrality: %s' % centrality)
t1 = int(round(time.time() * 1000))       
print('Time to compute betweeness centrality: %s ms' % (t1 - t0))


# Print nodes so that we can see their original ids and properties.
#print("\nOriginal Nodes:")
#print(g.nodes(data=True))

# Node weights for personalized pagerank.
t0 = int(round(time.time() * 1000))
personalize = {}
for node, data in g.nodes(data=True):
    if 'Person' in data['labels']:
        personalize[node] = 1
    #print("node=%s, data=%s" % (node, data))
t1 = int(round(time.time() * 1000))
print('Time to create PageRank personalization vector: %s ms' % (t1 - t0))

    
#print("\nPersonalize Vector:")
#print(personalize)

# Transformation from MultiDigraph to Graph for Pagerank calculation.
t0 = int(round(time.time() * 1000))
H = nx.Graph()
for src, dst, edge in g.edges(data=True):
    # Let's weight each edge by the amount of bytes read / written.
    w = int(edge['time'])
    if H.has_edge(src, dst):
        H[src][dst]['weight'] += w
    else:
        H.add_edge(src, dst, weight=w)
t1 = int(round(time.time() * 1000))
print('Time to convert from MultiDigraph to Graph for PageRank calculation: %s ms' % (t1 - t0))


t0 = int(round(time.time() * 1000))       
ranks = nx.pagerank(H, personalization=personalize)
sorted_ranks = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
t1 = int(round(time.time() * 1000))
print('Time to compute Pagerank: %s ms' % (t1 - t0))

#print("\nPersonalized PageRank:")
#print(sorted_ranks)