In [10]:
import duckdb
import re
import ast
# Location of the locally built lineage extension (path is specific to this
# DuckDB version / platform build directory).
LINEAGE_EXTENSION_PATH = '../build/release/repository/v1.3.0/osx_amd64/lineage.duckdb_extension'

# Unsigned extensions are allowed on this connection; presumably required
# because the lineage extension is a local, unsigned build.
con = duckdb.connect(config={'allow_unsigned_extensions': 'true'})

# Populate the database via dbgen at scale factor 1.
con.execute("CALL dbgen(sf=1);")

# Load the extension and show whatever the LOAD statement returns.
load_result = con.execute(f"LOAD '{LINEAGE_EXTENSION_PATH}'").df()
print(load_result)


# SQL text that stitches the per-operator rows of global_lineage() into
# end-to-end lineage:
#   * lineage_tree (recursive CTE) starts from the rows whose sink_opid is -1
#     and repeatedly joins in rows whose sink_opid equals the current row's
#     source_opid and whose out_rowid equals the current row's in_rowid.
#     The root's out_rowid is carried down unchanged; depth and a
#     ' -> '-separated path of source_opids are tracked along the way.
#   * lineage_e2e groups the result by (out_rowid, source_opid, source_table)
#     and collects the distinct in_rowid values into the list column `prov`.
# NOTE: the f-prefix is not needed (the text has no placeholders); any literal
# braces added to the SQL later would have to be doubled.
end2end_lineage = f"""
WITH RECURSIVE lineage_tree as   (
    -- Base case: start from root
    SELECT
        source_table,
        source_opid,
        sink_opid,
        out_rowid,
        in_rowid,
        0 AS depth,
        CAST(source_opid AS VARCHAR) AS path
    FROM global_lineage()    WHERE sink_opid =-1

    UNION ALL

    -- Recursive step: find children
    SELECT
        c.source_table,
        c.source_opid,
        c.sink_opid,
        p.out_rowid,
        c.in_rowid,
        p.depth + 1,
        path || ' -> ' || c.source_opid
    FROM global_lineage() c
    JOIN lineage_tree p ON c.sink_opid = p.source_opid and c.out_rowid=p.in_rowid
),

-- end-to-end lineage
lineage_e2e AS (
    SELECT source_table, source_opid,
        out_rowid,
        LIST(DISTINCT in_rowid) AS prov
    FROM lineage_tree
    GROUP BY out_rowid, source_opid, source_table
)

SELECT *
FROM lineage_e2e 
""" 
# Disabled filter that could be appended to the final SELECT (presumably to
# drop rows whose source is an operator rather than a base table):
#WHERE  CAST(source_table AS VARCHAR) NOT LIKE 'LOGICAL_%'

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Empty DataFrame
Columns: [Success]
Index: []


In [11]:
# Run one query from ../queries with the lineage setting switched on, then
# read back everything global_lineage() reports.
qid = 3
qfile = f"../queries/q{str(qid).zfill(2)}.sql"

# The context manager closes the file even if read() raises.
with open(qfile, "r") as text_file:
    # split()/join collapses every run of whitespace (including leading and
    # trailing whitespace) so the query prints on a single line.
    query = ' '.join(text_file.read().split())

con.execute("PRAGMA threads=1")
con.execute("PRAGMA set_debug_lineage(False)")
con.execute("PRAGMA set_lineage(True)")
print(query)
try:
    print(con.execute(query).df())
finally:
    # Switch the lineage setting back off even when the query fails, so a
    # failed run does not leave it enabled for later cells.
    con.execute("PRAGMA set_lineage(False)")

lineage = con.execute("select * from global_lineage()").df()
lineage

SELECT l_orderkey, sum(l_extendedprice * (1 - l_discount)) AS revenue, o_orderdate, o_shippriority FROM customer, orders, lineitem WHERE c_mktsegment = 'BUILDING' AND c_custkey = o_custkey AND l_orderkey = o_orderkey AND o_orderdate < CAST('1995-03-15' AS date) AND l_shipdate > CAST('1995-03-15' AS date) GROUP BY l_orderkey, o_orderdate, o_shippriority ORDER BY revenue DESC, o_orderdate LIMIT 10;
[DEBUG] <persist lineage> partition_id0 qid_opid:5_7_0, left len: 733, right len: 733, type: LOGICAL_COMPARISON_JOIN
[DEBUG] <persist lineage> partition_id1 qid_opid:5_7_1, left len: 0, right len: 0, type: LOGICAL_COMPARISON_JOIN
[DEBUG] <persist lineage> partition_id0 qid_opid:5_5_0, left len: 16, right len: 16, type: LOGICAL_COMPARISON_JOIN
[DEBUG] <persist lineage> partition_id1 qid_opid:5_5_1, left len: 0, right len: 0, type: LOGICAL_COMPARISON_JOIN
[DEBUG] <persist lineage> partition_id0 qid_opid:5_3_0, left len: 6, right len: 0, type: LOGICAL_AGGREGATE_AND_GROUP_BY
[DEBUG] <persist linea

Unnamed: 0,source_table,sink_table,source_opid,sink_opid,out_rowid,in_rowid
0,LOGICAL_AGGREGATE_AND_GROUP_BY,LOGICAL_PROJECTION,3,-1,0,4879
1,LOGICAL_AGGREGATE_AND_GROUP_BY,LOGICAL_PROJECTION,3,-1,1,6822
2,LOGICAL_AGGREGATE_AND_GROUP_BY,LOGICAL_PROJECTION,3,-1,2,1007
3,LOGICAL_AGGREGATE_AND_GROUP_BY,LOGICAL_PROJECTION,3,-1,3,2385
4,LOGICAL_AGGREGATE_AND_GROUP_BY,LOGICAL_PROJECTION,3,-1,4,4834
...,...,...,...,...,...,...
385814,customer,LOGICAL_COMPARISON_JOIN,10,7,147121,1499946
385815,customer,LOGICAL_COMPARISON_JOIN,10,7,147122,1499947
385816,customer,LOGICAL_COMPARISON_JOIN,10,7,147123,1499950
385817,customer,LOGICAL_COMPARISON_JOIN,10,7,147124,1499987


In [12]:
# Set the extension's debug-lineage flag to False before running the
# stitched query (presumably to keep [DEBUG] lines out of the output).
con.execute("PRAGMA set_debug_lineage(False)")
# Execute the SQL held in `end2end_lineage`; as the last expression of the
# cell, the resulting DataFrame is what the notebook displays.
con.execute(end2end_lineage).df()

Unnamed: 0,source_table,source_opid,out_rowid,prov
0,LOGICAL_AGGREGATE_AND_GROUP_BY,3,0,[4879]
1,LOGICAL_AGGREGATE_AND_GROUP_BY,3,1,[6822]
2,LOGICAL_AGGREGATE_AND_GROUP_BY,3,2,[1007]
3,LOGICAL_AGGREGATE_AND_GROUP_BY,3,3,[2385]
4,LOGICAL_AGGREGATE_AND_GROUP_BY,3,4,[4834]
5,LOGICAL_AGGREGATE_AND_GROUP_BY,3,5,[9499]
6,LOGICAL_AGGREGATE_AND_GROUP_BY,3,6,[10711]
7,LOGICAL_AGGREGATE_AND_GROUP_BY,3,7,[5219]
8,LOGICAL_AGGREGATE_AND_GROUP_BY,3,8,[1968]
9,LOGICAL_AGGREGATE_AND_GROUP_BY,3,9,[4587]


In [62]:
# TODO: construct lineage queries for delim

def get_plan():
    """Fetch and parse the plan stored in the last row of ``pragma_latest_qid()``.

    Uses the notebook-level connection ``con``. The raw plan text is printed
    before it is parsed.

    Returns:
        tuple: ``(plan, query_id)`` where ``plan`` is the Python object parsed
        from the ``plan`` column of the last row and ``query_id`` is that
        row's ``query_id`` value, or ``(None, None)`` when the pragma returns
        no rows.
    """
    qid_frame = con.execute("select * from pragma_latest_qid()").df()
    row_count = len(qid_frame)
    if row_count == 0:
        return None, None

    last_row = row_count - 1
    query_id = qid_frame.at[last_row, 'query_id']
    raw_plan = qid_frame.at[last_row, 'plan']
    print(raw_plan)

    # Wrap every bare word that follows ": " in double quotes (true -> "true",
    # 1 -> "1") so that ast.literal_eval accepts the text. Values that do not
    # start with a word character (-1, lists, quoted strings) are untouched.
    quoted_plan = re.sub(r": (\w+)", r': "\1"', raw_plan)
    return ast.literal_eval(quoted_plan), query_id

# TODO: support delim join
def extract_lineage_queries(query_id, node):
    """Collect the lineage_scan SELECT statements for a plan subtree.

    Walks ``node`` and its children depth-first (parent before children) and
    emits SQL for every operator flagged with ``has_lineage``:

    * one SELECT over ``lineage_scan(query_id, opid, 0)`` for the first
      source; for LOGICAL_AGGREGATE_AND_GROUP_BY and LOGICAL_DELIM_GET the
      ``in_rowid`` list column is unnested into one row per element;
    * for operators whose name contains "JOIN" and that have exactly two
      children, a second SELECT over ``lineage_scan(query_id, opid, 1)``
      for the second source.

    Args:
        query_id: Query identifier interpolated into each ``lineage_scan`` call.
        node: Plan node dict with keys ``opid``, ``name`` and, when it has
            lineage, ``sink_id``, ``source_id`` and ``source_table``.
            ``has_lineage`` may be the string "true"/"false" or a bool.
            ``children`` may be a list, ``None``, the string "NULL", or absent.

    Returns:
        list[str]: The generated SELECT statements; empty when nothing in the
        subtree has lineage.
    """
    queries = []

    opid = node["opid"]
    operator_name = node["name"]

    # Normalize the children up front: the join check below measures
    # len(children), which used to raise TypeError for a JOIN node whose
    # children were None because the guard ran only afterwards.
    children = node.get("children")
    if children is None or children == "NULL":
        children = []

    # Accept both the string form ("true") and a real boolean.
    has_lineage = str(node.get("has_lineage", "false")).lower() == "true"
    is_join = "JOIN" in operator_name and len(children) == 2
    needs_unnest = operator_name in ("LOGICAL_AGGREGATE_AND_GROUP_BY", "LOGICAL_DELIM_GET")

    # If current node has lineage, emit a query
    if has_lineage:
        parent_id = node["sink_id"]
        src_opid = node["source_id"][0]
        src_table = node["source_table"][0]
        if needs_unnest:
            # in_rowid is a LIST(BIGINT) here; flatten it to one row per element.
            queries.append(f"""
            SELECT '{src_table}' as source_table, {src_opid} AS source_opid, {parent_id} AS sink_opid,
            out_rowid, in_elem AS in_rowid
            FROM (
                SELECT out_rowid, UNNEST(in_rowid) AS in_elem
                FROM lineage_scan({query_id}, {opid}, 0)
                     AS ls(out_rowid BIGINT, in_rowid LIST(BIGINT))
            )
            """.strip())
        else:
            queries.append(f"""SELECT  '{src_table}' as source_table, {src_opid} AS source_opid, {parent_id} AS sink_opid, out_rowid,  in_rowid
            FROM lineage_scan({query_id}, {opid}, 0)  AS ls(out_rowid BIGINT, in_rowid BIGINT)""".strip())
        if is_join:
            # Second source of the join is read with 1 as the last lineage_scan argument.
            rhs_opid = node["source_id"][1]
            rhs_table = node["source_table"][1]
            queries.append(f"""SELECT  '{rhs_table}' as source_table, {rhs_opid} AS source_opid, {parent_id} AS sink_opid, out_rowid,  in_rowid
            FROM lineage_scan({query_id}, {opid}, 1)  AS ls(out_rowid BIGINT, in_rowid BIGINT)""".strip())

    for child in children:
        queries.extend(extract_lineage_queries(query_id, child))

    return queries
    
# Build one UNION ALL statement from the per-operator lineage queries of the
# latest plan and load the result into `lineage` (None when there is no plan
# or no operator with lineage).
plan, query_id = get_plan()
lineage = None
if plan:
    queries = extract_lineage_queries(query_id, plan)
    # A plan in which no operator carries lineage yields no SELECTs; skip the
    # execute instead of sending an empty statement to DuckDB.
    if queries:
        final_sql = "\nUNION ALL\n".join(queries)
        print(final_sql)
        lineage = con.execute(final_sql).df()
# Optional cleanup, currently disabled: con.execute("PRAGMA clear_lineage")
lineage

{"opid": 1,"name": "LOGICAL_ORDER_BY","sink_id": -1,"source_id": [5],"table": "","source_table": [""],"has_lineage": true,"children": [{"opid": 2,"name": "LOGICAL_PROJECTION","sink_id": -1,"source_id": [],"table": "","source_table": [],"has_lineage": false,"children": [{"opid": 3,"name": "LOGICAL_PROJECTION","sink_id": -1,"source_id": [],"table": "","source_table": [],"has_lineage": false,"children": [{"opid": 4,"name": "LOGICAL_PROJECTION","sink_id": -1,"source_id": [],"table": "","source_table": [],"has_lineage": false,"children": [{"opid": 5,"name": "LOGICAL_AGGREGATE_AND_GROUP_BY","sink_id": 5,"source_id": [7],"table": "","source_table": [""],"has_lineage": true,"children": [{"opid": 6,"name": "LOGICAL_PROJECTION","sink_id": -1,"source_id": [],"table": "","source_table": [],"has_lineage": false,"children": [{"opid": 7,"name": "LOGICAL_DELIM_JOIN","sink_id": 7,"source_id": [9,13],"table": "","source_table": ["","orders"],"has_lineage": true,"children": [{"opid": 8,"name": "LOGICAL_P

Unnamed: 0,source_table,source_opid,sink_opid,out_rowid,in_rowid
0,,5,-1,0,1
1,,5,-1,1,4
2,,5,-1,2,3
3,,5,-1,3,0
4,,5,-1,4,2
...,...,...,...,...,...
3995,,8,12,577,14898
3996,,8,12,578,14941
3997,,8,12,579,14950
3998,,8,12,580,14963


In [None]:
# Hand-written UNION ALL of lineage_scan() calls, one SELECT per
# (operator, input side), labelled with source/sink names and operator ids.
# NOTE(review): the query id (first lineage_scan argument) is hard-coded to 0
# and the operator ids are fixed — confirm they match the plan being inspected.
# NOTE(review): lineage_scan(0, 9, 1) is used for both the 'orders' row and
# the 'LOGICAL_DELIM_GET' row below — verify that this is intended.
#
# Rows for the DELIM_JOIN operator (opid 7), currently left out of the query:
# SELECT 'LOGICAL_COMPARISON_JOIN', 'LOGICAL_DELIM_JOIN', 9, 7, out_rowid, in_rowid  FROM lineage_scan(0, 7, 0) AS ls(out_rowid BIGINT, in_rowid BIGINT)
# UNION ALL SELECT 'orders', 'LOGICAL_DELIM_JOIN', 13, 7, out_rowid, in_rowid  FROM lineage_scan(0, 7, 1) AS ls(out_rowid BIGINT, in_rowid BIGINT)

q = """
SELECT 'LOGICAL_AGGREGATE_AND_GROUP_BY' as source_table, 'LOGICAL_ORDER_BY' as sink_table, 5 as source_opid, -1 as sink_opid, out_rowid, in_rowid  FROM lineage_scan(0, 1, 0) AS ls(out_rowid BIGINT, in_rowid BIGINT)
UNION ALL SELECT 'LOGICAL_COMPARISON_JOIN', 'LOGICAL_AGGREGATE_AND_GROUP_BY', 9, 5, out_rowid, in_elem  FROM  (SELECT out_rowid, UNNEST(in_rowid) AS in_elem FROM lineage_scan(0, 5, 0) AS ls(out_rowid BIGINT, in_rowid LIST(BIGINT)))
UNION ALL SELECT 'orders', 'LOGICAL_COMPARISON_JOIN', 13, 9, out_rowid, in_rowid  FROM lineage_scan(0, 9, 1) AS ls(out_rowid BIGINT, in_rowid BIGINT)
UNION ALL SELECT 'lineitem', 'LOGICAL_COMPARISON_JOIN', 11, 9, out_rowid, in_rowid  FROM lineage_scan(0, 9, 0) AS ls(out_rowid BIGINT, in_rowid BIGINT)
UNION ALL SELECT 'LOGICAL_DELIM_GET', 'LOGICAL_COMPARISON_JOIN', 12, 9, out_rowid, in_rowid  FROM lineage_scan(0, 9, 1) AS ls(out_rowid BIGINT, in_rowid BIGINT)
UNION ALL SELECT 'orders', 'LOGICAL_DELIM_GET', 13, 12, out_rowid, in_elem FROM (SELECT out_rowid, UNNEST(in_rowid) AS in_elem FROM lineage_scan(0, 12, 0) AS ls(out_rowid BIGINT, in_rowid LIST(BIGINT)))
"""

# The resulting DataFrame shares its name with the SQL table function
# global_lineage() used in other cells; the two are unrelated objects.
global_lineage  = con.execute(q).df()
global_lineage