# Dataplex lineage demo
@Luis Gerardo Baeza

On this demo, we showcase how to record custom lineage events between both BigQuery tables, and custom entries in the Dataplex Catalog.

You should be able to see the graph with an event to a custom entry in the Dataplex Catalog
![dataplex_lineage.png](dataplex_lineage.png)

In [1]:
pip install --quiet google-cloud-dataplex google-cloud-datacatalog-lineage

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/584.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m266.2/584.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.5/584.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/82.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.6/82.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.cloud import dataplex_v1
from google.cloud import datacatalog_lineage_v1
from datetime import datetime, timezone
from google.protobuf.timestamp_pb2 import Timestamp
import google.protobuf

project_id = "lgbaeza-202310"
location = "us"
client = datacatalog_lineage_v1.LineageClient()
dplx_client = dataplex_v1.CatalogServiceClient()
parent = f"projects/{project_id}/locations/{location}"

project_number = ""



## Sample dataset and table

In [3]:
%%bigquery
CREATE schema IF NOT EXISTS lineage_demo;
CREATE TABLE IF NOT EXISTS lineage_demo.ecommerce_sessions as
SELECT * from `data-to-insights.ecommerce.all_sessions` limit 1000000;

CREATE TABLE IF NOT EXISTS lineage_demo.ecommerce_analytics as
SELECT country, count(fullVisitorId) visits from lineage_demo.ecommerce_sessions group by 1;

Query is running:   0%|          |

## Lineage recording utility function

In [4]:
def create_lineage_event ( process_id, source_fqn, destination_fqn, display_name, transformation, run_display_name):
  process = datacatalog_lineage_v1.Process(
      display_name = display_name,
      attributes = transformation
  )

  created_process = client.create_process( parent = parent, process = process)
  process_name = created_process.name

  now = datetime.now(timezone.utc)
  start_time = Timestamp()
  start_time.FromDatetime(now)

  run = datacatalog_lineage_v1.Run(
      display_name = run_display_name
      , start_time = start_time
  )

  created_run = client.create_run(
      parent = process_name
      , run = run
  )
  run_name = created_run.name

  lineage_event = datacatalog_lineage_v1.LineageEvent(
    links=[
        datacatalog_lineage_v1.EventLink(
            source=datacatalog_lineage_v1.EntityReference(
                fully_qualified_name = source_fqn
            ),
            target=datacatalog_lineage_v1.EntityReference(
                fully_qualified_name = destination_fqn
            )
        )
    ]
    , start_time = start_time
  )

  response = client.create_lineage_event(parent=run_name, lineage_event=lineage_event)
  return response

## Create lineage event against two BQ tables

In [39]:
"""
create_lineage_event (
      process_id = "ecommerce-transform-job"
    , source_fqn = f"bigquery:{project_id}.lineage_demo.ecommerce_sessions"
    , destination_fqn = f"bigquery:{project_id}.lineage_demo.ecommerce_analytics"
    , display_name = "Ecommerce Analytics Transformation"
    , transformation = {"sql_query": "SELECT country, count(fullVisitorId) visits from lineage_demo.ecommerce_sessions group by 1;"}
    , run_display_name = "Run - 2024-05-20"
)
"""

'\ncreate_lineage_event (\n      process_id = "ecommerce-transform-job"\n    , source_fqn = f"bigquery:{project_id}.lineage_demo.ecommerce_sessions"\n    , destination_fqn = f"bigquery:{project_id}.lineage_demo.ecommerce_analytics"\n    , display_name = "Ecommerce Analytics Transformation"\n    , transformation = {"sql_query": "SELECT country, count(fullVisitorId) visits from lineage_demo.ecommerce_sessions group by 1;"}\n    , run_display_name = "Run - 2024-05-20"\n)\n'

## Create an external entry in the Catalog

In [28]:
table_name = "ecommerce_sessions"
DATAPLEX_ENTRY_TYPE_ID = "onprem"
DATAPLEX_ASPECT_TYPE_ID = "crm"
parent_group = f"projects/{project_id}/locations/global/entryGroups/aws"
parent_project = f"projects/{project_id}/locations/global"
ENTRY_ID = f"onprem/{table_name}"
ENTRY_GROUP = "external"

In [10]:
# Create aspect type
index_fields = []

aspect_type = dataplex_v1.types.AspectType(
    metadata_template = dataplex_v1.types.AspectType.MetadataTemplate(
        name = "external-table",
        type_ = "record",
        record_fields = index_fields,
        constraints = dataplex_v1.types.AspectType.MetadataTemplate.Constraints(required=True)
    )
)

dplx_client.create_aspect_type(
    parent = f"projects/{project_id}/locations/global",
    aspect_type_id = DATAPLEX_ASPECT_TYPE_ID,
    aspect_type = aspect_type
)


<google.api_core.operation.Operation at 0x78baff8f9390>

In [11]:
# Create entry type
entry_type = dataplex_v1.types.EntryType(
    type_aliases = ["TABLE"],
    display_name = "On Premises Table",
    system = "CRM",
    description = "CRM table",
    required_aspects = [
        dataplex_v1.types.EntryType.AspectInfo(
            type_=f"projects/{project_id}/locations/global/aspectTypes/{DATAPLEX_ASPECT_TYPE_ID}"
    )]
)

dplx_client.create_entry_type(
    entry_type = entry_type,
    parent = f"projects/{project_id}/locations/global",
    entry_type_id = DATAPLEX_ENTRY_TYPE_ID
)

<google.api_core.operation.Operation at 0x78baff9c90c0>

In [15]:
# Create entry group
entry_group = dataplex_v1.types.EntryGroup(
    description = "External data"
)

dplx_client.create_entry_group(
    parent = parent_project,
    entry_group_id = ENTRY_GROUP,
    entry_group = entry_group
)

<google.api_core.operation.Operation at 0x78bb56b9f7c0>

In [31]:
dplex_fqdn = f"dataplex:{project_id}.us.{ENTRY_GROUP}.{ENTRY_ID}"

aspect_data = google.protobuf.struct_pb2.Struct()
aspect_data = dataplex_v1.types.Aspect(data = aspect_data)

entry_source = dataplex_v1.types.EntrySource(
    system =  "CRM",
    platform = "Legacy Data Center",
    display_name = "ganalytics",
    description = ""
)
aspects = {f"{project_id}.global.{DATAPLEX_ASPECT_TYPE_ID}": aspect_data}

entry = dataplex_v1.types.Entry(
    name = ENTRY_ID
    , entry_type = f"projects/{project_id}/locations/global/entryTypes/{DATAPLEX_ENTRY_TYPE_ID}"
    , entry_source = entry_source
    , aspects = aspects
    , fully_qualified_name = dplex_fqdn
)

entry = dplx_client.create_entry(
    parent = parent_group,
    entry_id = ENTRY_ID,
    entry = entry
)

print(entry.name)

projects/lgbaeza-202310/locations/global/entryGroups/aws/entries/onprem/ecommerce_sessions


In [30]:
# dplx_client.delete_entry(name = f"projects/lgbaeza-202310/locations/global/entryGroups/aws/entries/onprem/ganalytics")

## Create lineage event against a custom entry in the Dataplex Catalog and a BQ Table

In [32]:
create_lineage_event (
      process_id = "ecommerce-transform-job"
    , source_fqn = dplex_fqdn
    , destination_fqn = f"bigquery:{project_id}.lineage_demo.ecommerce_sessions"
    , display_name = "Ecommerce Analytics Transformation"
    , transformation = {"sql_query": "INSERT INTO ecommerce.all_sessions_analytics SELECT * FROM ecommerce.all_sessions"}
    , run_display_name = "Daily Run - 2024-05-20"
)

name: "projects/559018099654/locations/us/processes/134c155e-2203-4b07-8f06-00215105ec28/runs/650d55bb-dc35-4842-b6c9-f03dc6ef88a9/lineageEvents/88d8ddfd-05bb-43fb-b91f-6f408dbb6234"
start_time {
  seconds: 1769003920
  nanos: 550983000
}
links {
  source {
    fully_qualified_name: "dataplex:lgbaeza-202310.us.external.onprem/ecommerce_sessions"
  }
  target {
    fully_qualified_name: "bigquery:lgbaeza-202310.lineage_demo.ecommerce_sessions"
  }
}

In [37]:
# process_id = "bb947ccb-368f-4b5e-9cf3-0153dbe3d8ed"
# client.delete_process( name = f"projects/{project_id}/locations/us/processes/{process_id}" )

<google.api_core.operation.Operation at 0x78baff8af730>

In [None]:
# for entry in dplx_client.list_entries(parent = f"projects/{project_number}/locations/global/entryGroups/aws"):
#  print(entry.fully_qualified_name)





