# Airbyte API EDA

Exploratory analysis of Airbyte connection, source, and destination details via the Airbyte API, to verify the downstream BigQuery dataset  
and table names needed to generate URNs for DataHub.

In [1]:
from os import getenv
from airbyte_api import AirbyteAPI, api, models

### Authentication (Client Credentials)

The client below connects with `client_id` and `client_secret`, read from the `AIRBYTE_CLIENT_ID` and `AIRBYTE_CLIENT_SECRET` environment variables.

In [2]:
# Base URL of the Airbyte public API. It can be overridden through the
# AIRBYTE_SERVER_URL environment variable; the default is the local deployment
# this notebook was written against.
AIRBYTE_SERVER_URL = getenv("AIRBYTE_SERVER_URL", "http://localhost:8000/api/public/v1/")

# API client authenticated with client credentials taken from the environment.
# The token endpoint is derived from the base URL so both always point at the
# same Airbyte instance.
# NOTE: `getenv` returns None for an unset variable, so a missing
# AIRBYTE_CLIENT_ID / AIRBYTE_CLIENT_SECRET is passed through as None here.
client = AirbyteAPI(
    server_url=AIRBYTE_SERVER_URL,
    security=models.Security(
        client_credentials=models.SchemeClientCredentials(
            client_id=getenv("AIRBYTE_CLIENT_ID"),
            client_secret=getenv("AIRBYTE_CLIENT_SECRET"),
            # Strip a trailing slash first so the path is joined with exactly one "/".
            token_url=f"{AIRBYTE_SERVER_URL.rstrip('/')}/applications/token",
        )
    )
)

### Connection Details

In [3]:
# Connection to inspect: taken from AIRBYTE_CONN_ID when set, otherwise the
# connection id this notebook was originally run against.
CONNECTION_ID = getenv(
    "AIRBYTE_CONN_ID",
    default="e37988e6-8ed5-465c-abb2-150639819c62",
)

In [4]:
# Fetch the connection and unwrap the connection payload from the SDK response.
conn_req = client.connections.get_connection(
    request=api.GetConnectionRequest(connection_id=CONNECTION_ID),
)
conn_resp = conn_req.connection_response

# Connection-level attributes, followed by a blank separator line.
print(
    f"Connection name:        {conn_resp.name}",
    f"Status:                 {conn_resp.status}",
    f"Source Id:              {conn_resp.source_id}",
    f"Destination Id:         {conn_resp.destination_id}",
    f"Workspace Id:           {conn_resp.workspace_id}",
    f"Namespace definition:   {conn_resp.namespace_definition}",
    f"Namespace format:       {conn_resp.namespace_format}",
    f"Prefix:                 {conn_resp.prefix}",
    sep="\n",
    end="\n\n",
)

# Per-stream settings, rendered as a YAML-like list.
print("Streams:")
for stream_cfg in conn_resp.configurations.streams:
    print(
        f"  - name: {stream_cfg.name}",
        f"    sync_mode: {stream_cfg.sync_mode}",
        f"    destination_object_name: {stream_cfg.destination_object_name}",
        f"    namespace: {stream_cfg.namespace}",
        sep="\n",
    )

Connection name:        hackernews-rss-frontpage <> bigquery-hackernews
Status:                 ConnectionStatusEnum.ACTIVE
Source Id:              d5be8561-fc48-48ba-9515-31d64a38a31e
Destination Id:         a0fb5576-0a39-4a88-9b10-d8fe76a92ee7
Workspace Id:           c6171c23-c173-4eff-bdcf-9a1713fa5ca8
Namespace definition:   NamespaceDefinitionEnum.DESTINATION
Namespace format:       None
Prefix:                 frontpage_

Streams:
  - name: items
    sync_mode: ConnectionSyncModeEnum.FULL_REFRESH_OVERWRITE
    destination_object_name: None
    namespace: None


### Destination Details

In [5]:
# Look up the destination referenced by the connection fetched earlier.
dest_req = client.destinations.get_destination(
    request=api.GetDestinationRequest(destination_id=conn_resp.destination_id),
)
dest_resp = dest_req.destination_response

# Destination configuration; the attributes read below are the BigQuery
# project, dataset and dataset location.
bq_config = dest_resp.configuration

# Destination identity, followed by a blank separator line.
print(
    f"Destination name:    {dest_resp.name}",
    f"Destination type:    {dest_resp.destination_type}",
    sep="\n",
    end="\n\n",
)

print(
    f"Project Id:          {bq_config.project_id}",
    f"Dataset Id:          {bq_config.dataset_id}",
    f"Dataset location:    {bq_config.dataset_location}",
    sep="\n",
)

Destination name:    bigquery-hackernews
Destination type:    bigquery

Project Id:          iobruno-gcp-labs
Dataset Id:          hackernews_rss_raw
Dataset location:    DatasetLocation.US_CENTRAL1


In [6]:
# Build the fully-qualified table name (`project.dataset.table`) for every
# stream of the connection.
#
# NOTE: the dataset comes from the destination configuration, which matches the
# `DESTINATION` namespace definition reported for this connection. Other
# namespace definitions are not handled here.
project_id = bq_config.project_id
dataset_id = bq_config.dataset_id

# A connection without a stream prefix reports `None`; fall back to an empty
# string so the table name does not contain the literal text "None".
table_prefix = conn_resp.prefix or ""

for stream_cfg in conn_resp.configurations.streams:
    # An explicit destination object name, when set, is used instead of the stream name.
    table_name = stream_cfg.destination_object_name or stream_cfg.name
    fq_table = f"{project_id}.{dataset_id}.{table_prefix}{table_name}"

    print(f"Fully-qualified Table Name: {fq_table}")

Fully-qualified Table Name: iobruno-gcp-labs.hackernews_rss_raw.frontpage_items


### Source Details

In [7]:
# Look up the source referenced by the connection fetched earlier.
source_req = client.sources.get_source(
    request=api.GetSourceRequest(source_id=conn_resp.source_id),
)
source_resp = source_req.source_response

# NOTE(review): in the recorded run the source type is `rss` but the
# configuration is rendered as a `SourceAirtable` object, so the typed
# configuration does not appear to match the source type — confirm against the
# SDK before relying on `configuration`.
print(
    f"Source name:   {source_resp.name}",
    f"Source type:   {source_resp.source_type}",
    f"Configuration: {source_resp.configuration}",
    sep="\n",
)

Source name:   hackernews-rss-frontpage
Source type:   rss
Configuration: SourceAirtable(credentials=None, SOURCE_TYPE=<SourceAirtableAirtable.AIRTABLE: 'airtable'>)


In [8]:
# Show the complete connection payload as a dict, to inspect fields that the
# summary printed earlier does not include.
conn_resp.to_dict()

{'configurations': {'streams': [{'name': 'items',
    'cursorField': ['published'],
    'includeFiles': False,
    'mappers': [],
    'primaryKey': [],
    'selectedFields': [],
    'syncMode': <ConnectionSyncModeEnum.FULL_REFRESH_OVERWRITE: 'full_refresh_overwrite'>}]},
 'connectionId': 'e37988e6-8ed5-465c-abb2-150639819c62',
 'createdAt': 1769736003,
 'destinationId': 'a0fb5576-0a39-4a88-9b10-d8fe76a92ee7',
 'name': 'hackernews-rss-frontpage <> bigquery-hackernews',
 'schedule': {'scheduleType': <ScheduleTypeWithBasicEnum.MANUAL: 'manual'>},
 'sourceId': 'd5be8561-fc48-48ba-9515-31d64a38a31e',
 'status': <ConnectionStatusEnum.ACTIVE: 'active'>,
 'tags': [],
 'workspaceId': 'c6171c23-c173-4eff-bdcf-9a1713fa5ca8',
 'namespaceDefinition': <NamespaceDefinitionEnum.DESTINATION: 'destination'>,
 'nonBreakingSchemaUpdatesBehavior': <NonBreakingSchemaUpdatesBehaviorEnum.IGNORE: 'ignore'>,
 'prefix': 'frontpage_'}