# DataHub Emitter for Airbyte Connections

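EDA with the DataHub REST Emitter: load the pipeline config, model the Airbyte workspace as a DataFlow with one DataJob per connection, and emit the resulting MCPs to DataHub.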

### Load Pipeline Config

In [1]:
from pathlib import Path
from pydantic import BaseModel
import yaml

In [2]:
class WorkspaceConfig(BaseModel):
    """Airbyte workspace settings read from the pipeline config."""
    # Workspace identifier; becomes a path segment of each connection URL.
    id: str
    # Workspace name; reused below as the id of the DataFlow.
    name: str
    # Root URL of the Airbyte instance that connection URLs are built from.
    base_url: str

class DataHubConfig(BaseModel):
    """DataHub settings read from the pipeline config."""
    # Server address handed to DataHubRestEmitter as `gms_server`.
    server: str

class ConnectionConfig(BaseModel):
    """One Airbyte connection and the lineage endpoints attached to it."""
    # Connection identifier; used as the DataJob id and in the connection URL.
    id: str
    # Name given to the DataJob built for this connection.
    name: str
    # URN string of the upstream job; parsed with DataJobUrn.from_string.
    upstream_datajob: str
    # URN string of the dataset this connection writes; parsed with DatasetUrn.from_string.
    downstream_dataset: str

class PipelineConfig(BaseModel):
    """Top-level schema that the parsed pipeline YAML is validated against."""
    # Airbyte workspace that owns the connections.
    workspace: WorkspaceConfig
    # Passed as `env` when the DataFlow is created.
    environment: str
    # Where the generated MCPs are emitted.
    datahub: DataHubConfig
    # Connections to turn into DataJobs; defaults to an empty list when omitted.
    connections: list[ConnectionConfig] = []

In [3]:
config_path = Path("./pipeline.yaml")

# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-read non-ASCII YAML on non-UTF-8 systems.
with open(config_path, encoding="utf-8") as f:
    raw = yaml.safe_load(f)

# Validate the parsed document against the schema; raises on missing or
# mistyped fields instead of failing later in the notebook.
config = PipelineConfig.model_validate(raw)

In [4]:
# Echo the validated model so the parsed fields can be inspected.
config

PipelineConfig(workspace=WorkspaceConfig(id='c6171c23-c173-4eff-bdcf-9a1713fa5ca8', name='hackernews_rss_bigquery', base_url='http://localhost:8000'), environment='prod', datahub=DataHubConfig(server='http://localhost:9090'), connections=[ConnectionConfig(id='e37988e6-8ed5-465c-abb2-150639819c62', name='hackernews_rss_front', upstream_datajob='urn:li:dataJob:(urn:li:dataFlow:(airflow,hackernews_rss_bigquery,prod),hackernews_rss_front)', downstream_dataset='urn:li:dataset:(urn:li:dataPlatform:bigquery,iobruno-gcp-labs.hackernews_rss_raw.frontpage_items,PROD)')])

### Build DataFlow and DataJobs

In [5]:
from datahub.api.entities.datajob.dataflow import DataFlow
from datahub.api.entities.datajob.datajob import DataJob
from datahub.metadata._urns.urn_defs import DataJobUrn, DatasetUrn

In [6]:
def fetch_airbyte_url(base_url: str, workspace_id: str, connection_id: str) -> str:
    """Build the URL of an Airbyte connection.

    Despite the name, nothing is fetched: the URL is only formatted.

    Args:
        base_url: Root URL of the Airbyte instance; trailing slashes are ignored.
        workspace_id: Identifier of the workspace that owns the connection.
        connection_id: Identifier of the connection.

    Returns:
        ``<base_url>/workspaces/<workspace_id>/connections/<connection_id>``.
    """
    # Strip trailing slashes so "http://host/" does not yield "http://host//workspaces/...".
    return f"{base_url.rstrip('/')}/workspaces/{workspace_id}/connections/{connection_id}"

In [7]:
def build_datajob(config: PipelineConfig, conn: ConnectionConfig, flow: DataFlow) -> DataJob:
    """Create the DataJob that represents one Airbyte connection.

    Args:
        config: Pipeline configuration; supplies the workspace base URL and id.
        conn: Connection to model; supplies the job id, name and lineage URNs.
        flow: DataFlow the job belongs to; its URN becomes the job's flow URN.

    Returns:
        A DataJob with one upstream job URN, one outlet dataset URN and its
        URL set to the connection URL.
    """
    workspace = config.workspace

    datajob = DataJob(id=conn.id, flow_urn=flow.urn, name=conn.name)

    # Lineage: the configured upstream job feeds this connection, which writes the dataset.
    upstream = DataJobUrn.from_string(conn.upstream_datajob)
    outlet = DatasetUrn.from_string(conn.downstream_dataset)
    datajob.upstream_urns.append(upstream)
    datajob.outlets.append(outlet)

    datajob.url = fetch_airbyte_url(workspace.base_url, workspace.id, conn.id)
    return datajob

In [8]:
# The Airbyte workspace is modelled as a single DataFlow.
flow = DataFlow(
    id=config.workspace.name,
    orchestrator="airbyte",
    env=config.environment,
)

In [9]:
# One DataJob per configured connection, all attached to the same flow.
jobs = [
    build_datajob(config, conn, flow)
    for conn in config.connections
]

# Summarise each job; the empty last argument produces the blank separator line.
for job in jobs:
    print(
        f"DataJob (Airbyte Connection): {job.urn}",
        f"  Name:                 {job.name}",
        f"  URL:                  {job.url}",
        f"  Upstream (Airflow):   {job.upstream_urns}",
        f"  Downstream (BigQuery):{job.outlets}",
        "",
        sep="\n",
    )

DataJob (Airbyte Connection): urn:li:dataJob:(urn:li:dataFlow:(airbyte,hackernews_rss_bigquery,prod),e37988e6-8ed5-465c-abb2-150639819c62)
  Name:                 hackernews_rss_front
  URL:                  http://localhost:8000/workspaces/c6171c23-c173-4eff-bdcf-9a1713fa5ca8/connections/e37988e6-8ed5-465c-abb2-150639819c62
  Upstream (Airflow):   [DataJobUrn(urn:li:dataJob:(urn:li:dataFlow:(airflow,hackernews_rss_bigquery,prod),hackernews_rss_front))]
  Downstream (BigQuery):[DatasetUrn(urn:li:dataset:(urn:li:dataPlatform:bigquery,iobruno-gcp-labs.hackernews_rss_raw.frontpage_items,PROD))]



### Generate and Emit MCPs

In [10]:
from datahub.emitter.rest_emitter import DataHubRestEmitter

In [11]:
# Materialise the generators so the proposals can be counted and emitted later.
flow_mcps = [*flow.generate_mcp()]
jobs_mcps = [
    proposal
    for datajob in jobs
    for proposal in datajob.generate_mcp()
]
# Flow proposals come first, followed by each job's proposals in job order.
all_mcps = [*flow_mcps, *jobs_mcps]

print(
    f"Flow MCPs: {len(flow_mcps)}",
    f"Jobs MCPs: {len(jobs_mcps)}",
    f"Total MCPs: {len(all_mcps)}",
    sep="\n",
)

Flow MCPs: 4
Jobs MCPs: 6
Total MCPs: 10


In [12]:
# Target the DataHub server named in the pipeline config.
emitter = DataHubRestEmitter(
    gms_server=config.datahub.server,
)

# Send the flow and job proposals together.
emitter.emit_mcps(all_mcps)

print(f"Emitted {len(all_mcps)} MCPs to {config.datahub.server}")

Emitted 10 MCPs to http://localhost:9090
