# Events of interest from raw instrumentation logs

In [1]:
import pandas as pd

# Read the raw event log and keep whichever of the columns of interest it
# contains, in the order they appear in the file.
wanted = ['Action', 'Time', 'Path', 'PID', 'PPID', 'UID', 'Command Line']
logs_df = pd.read_csv('loggedfs_events4.log')
cols = logs_df.columns[logs_df.columns.isin(wanted)].tolist()
df = logs_df[cols]

# Display only the read and write events.
df[df['Action'].isin(['read', 'write'])]

Unnamed: 0,Time,Action,Path,UID,PID,PPID,Command Line
2,1486075407888,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17110,17109,cp in1.csv out1.csv
4,1486075407888,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17110,17109,cp in1.csv out1.csv
7,1486075408892,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17112,17109,cat in1.csv
9,1486075408892,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17112,17109,cat in1.csv
11,1486075410214,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17114,17109,python concat_csvs.py in1.csv in2.csv out3.csv
13,1486075410215,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17114,17109,python concat_csvs.py in1.csv in2.csv out3.csv
16,1486075410219,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17114,17109,python concat_csvs.py in1.csv in2.csv out3.csv
19,1486075411282,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17121,17109,cat out3.csv
21,1486075411283,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17121,17109,cat out3.csv


# Build a Neo4j lineage graph from instrumentation logs

In [2]:
import lineage_graph as lg

# Build the lineage graph from the instrumentation log file; `graph` is the
# handle the later cells run their Cypher queries against.
graph = lg.build_lineage_graph('loggedfs_events4.log')

# Custom visualization of lineage graph using Vis.js

In [3]:
from scripts.vis import draw

# Per-label display options handed to the renderer, keyed by node label
# (presumably the node property used as each label's caption -- confirm
# against scripts.vis).
options = dict(Dataset="name", Job="uid")
draw(graph, options)

# Query Neo4j temporal lineage graph using Cypher

## Query results as a Pandas DataFrame

In [4]:
# Sanity check: list every relationship together with the internal ids of
# its source and target nodes.
query = """
    MATCH (n)-[r]->(m)
    RETURN id(n) AS source_id,
           id(r) AS rel_id,
           id(m) AS target_id
"""

rows = graph.data(query)
results_df = pd.DataFrame(rows)
print(results_df)

   rel_id  source_id  target_id
0       4          0          5
1       2          0          3
2       0          0          1
3       1          1          2
4       3          3          4
5       6          5         90
6       5          6          5
7      55         90         91
8      56         91         92


## Copy detection as a reachability query between two content-similar datasets

In [5]:
# Copy detection: is dataset b reachable from dataset a, or a from b?
a_name = '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv'
b_name = '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv'

# Return each node's name next to its id. MATCH gives no ordering guarantee,
# so the position of a row in the result cannot be used to tell a from b.
query = """
    MATCH (n:Dataset)
    WHERE n.name = '%s' OR n.name = '%s'
    RETURN id(n) AS node_id, n.name AS name
""" % (a_name, b_name)
id_by_name = {row['name']: row['node_id'] for row in graph.data(query)}

missing = [name for name in (a_name, b_name) if name not in id_by_name]
if missing:
    raise LookupError('No Dataset node found for: %s' % ', '.join(missing))

a_id = id_by_name[a_name]
b_id = id_by_name[b_name]
# Same shape as before (list of {'node_id': ...}), now in a fixed order: [a, b].
node_ids = [{'node_id': a_id}, {'node_id': b_id}]


def find_paths(pattern):
    """Return all paths matching ``pattern`` between nodes a and b.

    ``pattern`` is a Cypher path pattern over the identifiers ``a`` and ``b``
    (for example ``(a)-[r*]->(b)``). This is a plain reachability query: no
    constraint is placed on the edge timestamps. Each result row holds the
    path ``p`` and ``b.name``.
    """
    path_query = """
        START a=node(%d), b=node(%d)
        MATCH p=%s
        RETURN p, b.name
    """ % (a_id, b_id, pattern)
    return graph.data(path_query)


# Reachability from a to b.
paths = find_paths('(a)-[r*]->(b)')
print('\nFound %d path(s) from a to b:\n' % len(paths))
print(paths)

# Reachability in the opposite direction, from b to a.
paths = find_paths('(b)-[r*]->(a)')
print('\nFound %d path(s) from b to a:\n' % len(paths))
print(paths)


Found 1 path(s) from a to b:

[{'b.name': '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv', 'p': (f8d6825)-[:IS_READ_BY {data_bytes:1024,timestamp:1486075410214}]->(f2bebaa)-[:WRITES_TO {data_bytes:1024,timestamp:1486075410219}]->(cb59cd9)-[:IS_READ_BY {data_bytes:1024,timestamp:1486075411282}]->(cb23575)-[:WRITES_TO {data_bytes:1024,timestamp:1486075411283}]->(`/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv`)}]

Found 0 path(s) from b to a:

[]


## Copy detection as a reachability query between two content-similar datasets (monotonically increasing edge timestamps)

In [6]:
# Copy detection restricted to paths whose edge timestamps never decrease.
a_name = '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv'
b_name = '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv'

# Return each node's name next to its id. MATCH gives no ordering guarantee,
# so the position of a row in the result cannot be used to tell a from b.
query = """
    MATCH (n:Dataset)
    WHERE n.name = '%s' OR n.name = '%s'
    RETURN id(n) AS node_id, n.name AS name
""" % (a_name, b_name)
id_by_name = {row['name']: row['node_id'] for row in graph.data(query)}

missing = [name for name in (a_name, b_name) if name not in id_by_name]
if missing:
    raise LookupError('No Dataset node found for: %s' % ', '.join(missing))

a_id = id_by_name[a_name]
b_id = id_by_name[b_name]
# Same shape as before (list of {'node_id': ...}), now in a fixed order: [a, b].
node_ids = [{'node_id': a_id}, {'node_id': b_id}]


def find_time_ordered_paths(pattern):
    """Return paths matching ``pattern`` whose edge timestamps never decrease.

    ``pattern`` is a Cypher path pattern over the identifiers ``a`` and ``b``
    (for example ``(a)-[r*]->(b)``). Every pair of consecutive relationships
    on the path is compared, so a path with timestamps 1, 5, 3 is rejected.
    Comparing each edge only with the first edge of the path would accept it.
    A single-edge path has no pairs and is always kept. Each result row holds
    the path ``p`` and ``b.name``.
    """
    path_query = """
        START a=node(%d), b=node(%d)
        MATCH p=%s
        WITH p, b, relationships(p) AS rels
        WHERE all(i IN range(0, size(rels) - 2)
                  WHERE (rels[i]).timestamp <= (rels[i + 1]).timestamp)
        RETURN p, b.name
    """ % (a_id, b_id, pattern)
    return graph.data(path_query)


# Time-ordered reachability from a to b.
paths = find_time_ordered_paths('(a)-[r*]->(b)')
print('\nFound %d path(s) from a to b with monotonically increasing timestamps:\n' % len(paths))
print(paths)

# Time-ordered reachability in the opposite direction, from b to a.
paths = find_time_ordered_paths('(b)-[r*]->(a)')
print('\nFound %d path(s) from b to a with monotonically increasing timestamps:\n' % len(paths))
print(paths)


Found 1 path(s) from a to b with monotonically increasing timestamps:

[{'b.name': '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv', 'p': (f8d6825)-[:IS_READ_BY {data_bytes:1024,timestamp:1486075410214}]->(f2bebaa)-[:WRITES_TO {data_bytes:1024,timestamp:1486075410219}]->(cb59cd9)-[:IS_READ_BY {data_bytes:1024,timestamp:1486075411282}]->(cb23575)-[:WRITES_TO {data_bytes:1024,timestamp:1486075411283}]->(`/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv`)}]

Found 0 path(s) from b to a with monotonically increasing timestamps:

[]


# Aggregate Queries


In [7]:
# Count the direct relationships between every ordered (from, to) node pair
# and report the targets that the same source links to more than once.
query = """
    START from=node(*)
    MATCH p=(from)-->(to)
    WITH from as from, to as to, count(p) as paths
    WHERE paths > 1
    RETURN to,paths
"""

# One row per (from, to) pair that has more than one direct relationship.
paths = graph.data(query)
print(paths)

[]


# PageRank

In [25]:
%load_ext cypher
import networkx as nx
%matplotlib inline

results = %cypher MATCH p = (a)-[r*]->(b) RETURN p

# Networkx graph.
g = results.get_graph()

#nx.draw(g)
#g.nodes(data=True)

# Print nodes so that we can see their original ids and properties.
print("\nOriginal Nodes:")
print(g.nodes(data=True))

# Node weights for personalized pagerank.
personalize = {}
for node, data in g.nodes(data=True):
    if 'Dataset' in data['labels']:
        # FIXME: 0 weight for dataset nodes?  Should we use some other attribute instead?
        personalize[node] = 0
    elif 'Job' in data['labels']:
        # TODO: Use some resource from telemetry data here, e.g., CPU consumption.
        personalize[node] = data['pid']
    print("node=%s, data=%s" % (node, data))

print("\nPersonalize Vector:")
print(personalize)

print("\nOriginal Edges:")

# Transformation from MultiDigraph to Graph for Pagerank calculation.
H = nx.Graph()
for src, dst, edge in g.edges(data=True):
    print("src=%s, dst=%s, edge=%s" % (src, dst, edge))
    # Let's weight each edge by the amount of bytes read / written.
    w = edge['data_bytes']
    if H.has_edge(src, dst):
        H[src][dst]['weight'] += w
    else:
        H.add_edge(src, dst, weight=w)

print("\nPageRank:")
print(nx.pagerank(H)
print("\nPersonalized PageRank:")
print(nx.pagerank(H, personalization=personalize))

The cypher extension is already loaded. To reload it, use:
  %reload_ext cypher
20 rows affected.

Original Nodes:
[('6', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/in2.csv', 'labels': ['Dataset']}), ('91', {'name': 'cat out3.csv ', 'uid': 'PID=17121,CLI=cat out3.csv ', 'pid': 17121, 'labels': ['Job']}), ('0', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv', 'labels': ['Dataset']}), ('4', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/out2.csv', 'labels': ['Dataset']}), ('90', {'name': '/home/lubuntu/src/file_access_monitor/test_workflows/out3.csv', 'labels': ['Dataset']}), ('1', {'name': 'cp in1.csv out1.csv ', 'uid': 'PID=17110,CLI=cp in1.csv out1.csv ', 'pid': 17110, 'labels': ['Job']}), ('5', {'name': 'python concat_csvs.py in1.csv in2.csv out3.csv ', 'uid': 'PID=17114,CLI=python concat_csvs.py in1.csv in2.csv out3.csv ', 'pid': 17114, 'labels': ['Job']}), ('2', {'name': '/home/lubuntu/src/file_access_monitor/test_workflow