# Events of interest from raw instrumentation logs

In [1]:
import pandas as pd

# Load the raw event log (CSV) into a DataFrame.
logs_df = pd.read_csv('loggedfs_events_pagerank.log')

# Keep only the columns of interest that are actually present, preserving file order.
cols = logs_df.columns[
    logs_df.columns.isin(['Action', 'Time', 'Path', 'PID', 'PPID', 'UID', 'Command Line'])
].tolist()
df = logs_df.loc[:, cols]

# Display just the read and write events.
df[df['Action'].isin(['read', 'write'])]

Unnamed: 0,Time,Action,Path,UID,PID,PPID,Command Line
1,1490380654673,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3495,3494,python concat_csvs.py in1.csv in2.csv out2.csv...
3,1490380654675,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3495,3494,python concat_csvs.py in1.csv in2.csv out2.csv...
5,1490380654676,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3495,3494,python concat_csvs.py in1.csv in2.csv out2.csv...
8,1490380654680,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3495,3494,python concat_csvs.py in1.csv in2.csv out2.csv...
10,1490380656058,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3502,3494,python concat_csvs.py in1.csv in2.csv out2.csv...
12,1490380656061,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3502,3494,python concat_csvs.py in1.csv in2.csv out2.csv...
14,1490380656063,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3502,3494,python concat_csvs.py in1.csv in2.csv out2.csv...
17,1490380656066,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3502,3494,python concat_csvs.py in1.csv in2.csv out2.csv...
20,1490380657121,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3509,3494,cp out3.csv out5.csv
22,1490380657122,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,3509,3494,cp out3.csv out5.csv


# Build a Neo4j lineage graph from instrumentation logs

In [8]:
import lineage_graph as lg

# Build the lineage graph from the same event log loaded above; `graph` is the handle
# that the visualization and query cells below operate on.
graph = lg.build_lineage_graph('loggedfs_events_pagerank.log')

# Custom visualization of lineage graph using Vis.js

In [9]:
from scripts.vis import draw

# Node label -> node property passed to draw(); presumably the property rendered as
# each node's caption — TODO confirm against scripts.vis.
options = dict(Dataset="name", Job="uid")
draw(graph, options)

# Query Neo4j temporal lineage graph using Cypher

## Query results as a Pandas DataFrame

In [None]:
# Sanity check: list every relationship as (source node id, relationship id, target node id).
query = """
    MATCH (n)-[r]->(m)
    RETURN id(n) AS source_id,
           id(r) AS rel_id,
           id(m) AS target_id
"""

# Run the query first, then tabulate the returned records.
records = graph.data(query)
results_df = pd.DataFrame(records)
print(results_df)

## Copy detection as a reachability query between two content-similar datasets

In [None]:
# Copy detection: is there a directed path between two datasets with similar content?
# No timestamp constraint is applied in this cell (see the next section for that variant).
name_a = '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv'
name_b = '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv'

# Return the name alongside each id: Cypher gives no ordering guarantee without ORDER BY,
# so relying on result position could silently swap a and b.
query = """
    MATCH (n:Dataset)
    WHERE (n.name = '%s'
    OR n.name = '%s')
    RETURN id(n) as node_id, n.name as name
""" % (name_a, name_b)
node_ids = graph.data(query)

ids_by_name = {row['name']: row['node_id'] for row in node_ids}
missing = [name for name in (name_a, name_b) if name not in ids_by_name]
if missing:
    # Fail with a readable message instead of an IndexError further down.
    raise LookupError('Dataset node(s) not found: %s' % ', '.join(missing))
a_id, b_id = ids_by_name[name_a], ids_by_name[name_b]

# Unconstrained reachability template, filled in once per direction.
# Nodes are bound with id() predicates rather than the deprecated START clause.
path_query = """
    MATCH p=(%s)-[*]->(%s)
    WHERE id(a) = %d AND id(b) = %d
    RETURN p, b.name
"""

for src, dst in (('a', 'b'), ('b', 'a')):
    query = path_query % (src, dst, a_id, b_id)
    paths = graph.data(query)
    print('\nFound %d path(s) from %s to %s:\n' % (len(paths), src, dst))
    print(paths)

## Copy detection as a reachability query between two content-similar datasets (monotonically increasing edge timestamps)

In [None]:
# Copy detection restricted to paths whose edge timestamps never go backwards.
name_a = '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv'
name_b = '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv'

# Return the name alongside each id: Cypher gives no ordering guarantee without ORDER BY,
# so relying on result position could silently swap a and b.
query = """
    MATCH (n:Dataset)
    WHERE (n.name = '%s'
    OR n.name = '%s')
    RETURN id(n) as node_id, n.name as name
""" % (name_a, name_b)
node_ids = graph.data(query)

ids_by_name = {row['name']: row['node_id'] for row in node_ids}
missing = [name for name in (name_a, name_b) if name not in ids_by_name]
if missing:
    # Fail with a readable message instead of an IndexError further down.
    raise LookupError('Dataset node(s) not found: %s' % ', '.join(missing))
a_id, b_id = ids_by_name[name_a], ids_by_name[name_b]

# Path template, filled in once per direction.
# Each consecutive pair of edges is compared. Checking every edge only against the first
# one would accept sequences such as 1, 5, 3, which are not monotonic.
# A single-edge path yields an empty range and therefore always passes. Equal timestamps
# on neighbouring edges are accepted.
path_query = """
    MATCH p=(%s)-[*]->(%s)
    WHERE id(a) = %d AND id(b) = %d
    WITH p, b, relationships(p) AS rels
    WHERE all(i IN range(0, size(rels) - 2)
              WHERE (rels[i]).timestamp <= (rels[i + 1]).timestamp)
    RETURN p, b.name
"""

for src, dst in (('a', 'b'), ('b', 'a')):
    query = path_query % (src, dst, a_id, b_id)
    paths = graph.data(query)
    print('\nFound %d path(s) from %s to %s with monotonically increasing timestamps:\n'
          % (len(paths), src, dst))
    print(paths)

# Aggregate Queries


In [None]:
# Aggregate query: for every directly connected (from, to) node pair, count the
# relationships between them and return the target node together with that count.
# NOTE(review): every group produced by the MATCH already contains at least one path, so
# the `paths >= 1` filter looks like a no-op — presumably a threshold meant to be tuned.
query = """
    START from=node(*)
    MATCH p=(from)-->(to)
    WITH from as from, to as to, count(p) as paths
    WHERE paths >= 1
    RETURN to,paths
"""

# `paths` here holds the aggregate result rows, not path objects.
paths = graph.data(query)
print(paths)

# PageRank

In [10]:
%load_ext cypher
import networkx as nx
import operator
%matplotlib inline

results = %cypher MATCH p = (a)-[r*]->(b) RETURN p

# Networkx graph.
g = results.get_graph()

#nx.draw(g)
#g.nodes(data=True)

# Print nodes so that we can see their original ids and properties.
print("\nOriginal Nodes:")
print(g.nodes(data=True))

# Node weights for personalized pagerank.
personalize = {}
for node, data in g.nodes(data=True):
    if 'Dataset' in data['labels']:
        personalize[node] = 0
    elif 'Job' in data['labels']:
        # Let's weight each edge by the amount of CPU consumed.
        personalize[node] = data['cpu']
    print("node=%s, data=%s" % (node, data))

print("\nPersonalize Vector:")
print(personalize)

# Transformation from MultiDigraph to Graph for Pagerank calculation.
H = nx.Graph()
for src, dst, edge in g.edges(data=True):
    # Let's weight each edge by the amount of bytes read / written.
    w = edge['data_bytes']
    if H.has_edge(src, dst):
        H[src][dst]['weight'] += w
    else:
        H.add_edge(src, dst, weight=w)

#print("\nPageRank:")
#print(nx.pagerank(H))

print("\nPersonalized PageRank:")
ranks = nx.pagerank(H, personalization=personalize)
sorted_ranks = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_ranks)

104 rows affected.

Original Nodes:
[('222', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/out7.csv', 'labels': ['Dataset']}), ('223', {'pid': 3515, 'name': 'cp out6.csv out8.csv ', 'labels': ['Job'], 'cpu': 57.7008108755753, 'uid': 'PID=3515,CLI=cp out6.csv out8.csv '}), ('226', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/out9.csv', 'labels': ['Dataset']}), ('224', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/out8.csv', 'labels': ['Dataset']}), ('218', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/out5.csv', 'labels': ['Dataset']}), ('219', {'pid': 3511, 'name': 'cp out4.csv out6.csv ', 'labels': ['Job'], 'cpu': 96.59249028784154, 'uid': 'PID=3511,CLI=cp out4.csv out6.csv '}), ('216', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv', 'labels': ['Dataset']}), ('213', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/out2.csv', 'labels': ['Dataset']}), ('211', {'pid': 3495, 'name'