# Events of interest from raw instrumentation logs

In [15]:
import pandas as pd

# Load the raw event log (CSV) and keep only the columns of interest,
# in the order they appear in the file.
logs_df = pd.read_csv('loggedfs_events4.log')
cols = [
    name
    for name in logs_df.columns
    if name in ('Action', 'Time', 'Path', 'PID', 'PPID', 'UID', 'Command Line')
]
df = logs_df[cols]

# Display only read and write events; the bare last expression is the cell output.
df[df['Action'].isin(['read', 'write'])]

Unnamed: 0,Time,Action,Path,UID,PID,PPID,Command Line
2,1486075407888,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17110,17109,cp in1.csv out1.csv
4,1486075407888,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17110,17109,cp in1.csv out1.csv
7,1486075408892,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17112,17109,cat in1.csv
9,1486075408892,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17112,17109,cat in1.csv
11,1486075410214,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17114,17109,python concat_csvs.py in1.csv in2.csv out3.csv
13,1486075410215,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17114,17109,python concat_csvs.py in1.csv in2.csv out3.csv
16,1486075410219,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17114,17109,python concat_csvs.py in1.csv in2.csv out3.csv
19,1486075411282,read,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17121,17109,cat out3.csv
21,1486075411283,write,/home/lubuntu/src/file_access_monitor/test_wor...,1000,17121,17109,cat out3.csv


# Build a Neo4j lineage graph from instrumentation logs

In [106]:
import lineage_graph as lg

# Build the lineage graph from the same event log file loaded in the first cell.
# The resulting `graph` handle is what the later cells query with Cypher via
# `graph.data(...)`.
# NOTE(review): build_lineage_graph is project code not shown here — presumably it
# needs a reachable Neo4j instance; confirm before a fresh Restart & Run All.
graph = lg.build_lineage_graph('loggedfs_events4.log')

# Custom visualization of lineage graph using Vis.js

In [107]:
from scripts.vis import draw

# Per-label display options handed to the Vis.js renderer.
# NOTE(review): presumably maps each node label to the property used as its
# caption — confirm against scripts.vis.draw.
options = dict(Dataset="name", Job="uid")
draw(graph, options)

# Query Neo4j temporal lineage graph using Cypher

## Query results as a Pandas DataFrame

In [119]:
# Sanity check: list every relationship in the graph as a triple of internal
# ids (source node, relationship, target node).
query = """
    MATCH (n)-[r]->(m)
    RETURN id(n) AS source_id,
           id(r) AS rel_id,
           id(m) AS target_id
"""

edge_rows = graph.data(query)  # one record per matched relationship
results_df = pd.DataFrame(edge_rows)
print(results_df)

   rel_id  source_id  target_id
0     500        591        596
1     498        591        594
2     496        591        592
3     497        592        593
4     499        594        595
5     502        596        598
6     501        597        596
7     503        598        599
8     504        599        600


## Copy detection as a reachability query between two content-similar datasets

In [117]:
# The two datasets being compared: `a` and `b` in the path queries below.
name_a = '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv'
name_b = '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv'

# Look up the internal node ids by name. The name is returned alongside the id
# because Cypher guarantees no row order without ORDER BY, so indexing the
# result positionally could silently swap `a` and `b`.
query = """
    MATCH (n:Dataset)
    WHERE (n.name = '%s'
    OR n.name = '%s')
    RETURN n.name AS name, id(n) AS node_id
""" % (name_a, name_b)
ids_by_name = {row['name']: row['node_id'] for row in graph.data(query)}

# Fail with a readable message instead of a bare IndexError further down.
missing = [name for name in (name_a, name_b) if name not in ids_by_name]
if missing:
    raise LookupError('Dataset node(s) not found: %s' % ', '.join(missing))

# Ordered list of {'node_id': ...} records: index 0 is `a`, index 1 is `b`.
node_ids = [{'node_id': ids_by_name[name_a]}, {'node_id': ids_by_name[name_b]}]

# Forward reachability: any directed path a -> b, with no constraint on the
# edge timestamps.
query = """
    MATCH p=(a)-[*]->(b)
    WHERE id(a) = %d AND id(b) = %d
    RETURN p, b.name
""" % (node_ids[0]['node_id'], node_ids[1]['node_id'])


paths = graph.data(query)
print('\nFound %d path(s) from a to b:\n' % len(paths))
print(paths)

# Reverse reachability: any directed path b -> a, with no constraint on the
# edge timestamps.
query = """
    MATCH p=(b)-[*]->(a)
    WHERE id(a) = %d AND id(b) = %d
    RETURN p, b.name
""" % (node_ids[0]['node_id'], node_ids[1]['node_id'])


paths = graph.data(query)
print('\nFound %d path(s) from b to a:\n' % len(paths))
print(paths)


Found 1 path(s) from a to b:

[{'b.name': '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv', 'p': (f944c12)-[:IS_READ_BY {timestamp:1486075410214}]->(c1f4d7f)-[:WRITES_TO {timestamp:1486075410219}]->(`/home/lubuntu/src/file_access_monitor/test_workflows/out3.csv`)-[:IS_READ_BY {timestamp:1486075411282}]->(d4bb19c)-[:WRITES_TO {timestamp:1486075411283}]->(`/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv`)}]

Found 0 path(s) from b to a:

[]


## Copy detection as a reachability query between two content-similar datasets (monotonically increasing edge timestamps)

In [118]:
# The two datasets being compared: `a` and `b` in the path queries below.
name_a = '/home/lubuntu/src/file_access_monitor/test_workflows/in1.csv'
name_b = '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv'

# Look up the internal node ids by name. The name is returned alongside the id
# because Cypher guarantees no row order without ORDER BY, so indexing the
# result positionally could silently swap `a` and `b`.
query = """
    MATCH (n:Dataset)
    WHERE (n.name = '%s'
    OR n.name = '%s')
    RETURN n.name AS name, id(n) AS node_id
""" % (name_a, name_b)
ids_by_name = {row['name']: row['node_id'] for row in graph.data(query)}

# Fail with a readable message instead of a bare IndexError further down.
missing = [name for name in (name_a, name_b) if name not in ids_by_name]
if missing:
    raise LookupError('Dataset node(s) not found: %s' % ', '.join(missing))

# Ordered list of {'node_id': ...} records: index 0 is `a`, index 1 is `b`.
node_ids = [{'node_id': ids_by_name[name_a]}, {'node_id': ids_by_name[name_b]}]

# Time-respecting path a -> b: every edge must be no earlier than the edge
# before it. Comparing each edge only against the FIRST edge is not enough
# (timestamps 1, 5, 3 would pass), so consecutive pairs are compared instead.
# The comparison is non-strict because a read and its matching write can carry
# the same millisecond timestamp. For a single-edge path range(0, -1) is empty
# and all() is trivially true.
query = """
    MATCH p=(a)-[*]->(b)
    WHERE id(a) = %d AND id(b) = %d
    WITH p, b, relationships(p) AS rels
    WHERE all(i IN range(0, size(rels) - 2)
              WHERE (rels[i]).timestamp <= (rels[i + 1]).timestamp)
    RETURN p, b.name
""" % (node_ids[0]['node_id'], node_ids[1]['node_id'])


paths = graph.data(query)
print('\nFound %d path(s) from a to b with monotonically increasing timestamps:\n' % len(paths))
print(paths)

# Time-respecting path in the opposite direction, b -> a, using the same
# pairwise timestamp check.
query = """
    MATCH p=(b)-[*]->(a)
    WHERE id(a) = %d AND id(b) = %d
    WITH p, b, relationships(p) AS rels
    WHERE all(i IN range(0, size(rels) - 2)
              WHERE (rels[i]).timestamp <= (rels[i + 1]).timestamp)
    RETURN p, b.name
""" % (node_ids[0]['node_id'], node_ids[1]['node_id'])


paths = graph.data(query)
print('\nFound %d path(s) from b to a with monotonically increasing timestamps:\n' % len(paths))
print(paths)


Found 1 path(s) from a to b with monotonically increasing timestamps:

[{'b.name': '/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv', 'p': (f944c12)-[:IS_READ_BY {timestamp:1486075410214}]->(c1f4d7f)-[:WRITES_TO {timestamp:1486075410219}]->(`/home/lubuntu/src/file_access_monitor/test_workflows/out3.csv`)-[:IS_READ_BY {timestamp:1486075411282}]->(d4bb19c)-[:WRITES_TO {timestamp:1486075411283}]->(`/home/lubuntu/src/file_access_monitor/test_workflows/out4.csv`)}]

Found 0 path(s) from b to a with monotonically increasing timestamps:

[]
