In [68]:
from google.cloud import storage
from google.cloud import bigquery
import os
import json


In [79]:
# Configuration shared by the cells below.
# PROJECT_ID is a placeholder: replace it with a real Google Cloud project ID
# before running, because the Dataplex request URL and the bucket name are
# both derived from it.
PROJECT_ID = "PROJECT HERE"
# Location segment used when building the Dataplex metadataJobs URL.
LOCATION = "us-central1"
# Cloud Storage bucket used as the export job's output path and as the source
# of the BigQuery external table.
EXPORT_BUCKET_NAME = f"{PROJECT_ID}-lab-data-export"

In [72]:
import google.auth
from google.auth.transport.requests import AuthorizedSession
from requests import HTTPError
from typing import Any, Optional, Dict

def call_google_api(
    url: str,
    http_verb: str,
    request_body: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Send an authenticated request to a Google Cloud REST endpoint.

    Credentials are obtained from Application Default Credentials with the
    ``cloud-platform`` scope on every call.

    Args:
        url: Full URL of the REST resource to call.
        http_verb: HTTP method to use, e.g. ``"GET"`` or ``"POST"``.
        request_body: Optional payload sent as the JSON request body.

    Returns:
        The decoded JSON response, or an empty dict when a successful
        response carries no body (HTTP 204 or empty content).

    Raises:
        RuntimeError: If the server answers with an HTTP error status. The
            message contains the status code and response text, and the
            original ``HTTPError`` is chained as the cause.
    """
    creds, _ = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )
    authed_session = AuthorizedSession(creds)
    try:
        response = authed_session.request(
            method=http_verb,
            url=url,
            json=request_body  # requests handles None for json param gracefully
        )
        response.raise_for_status()
    except HTTPError as e:
        # Surface the status code and server-provided text in one message.
        error_message = f"API call failed with status {e.response.status_code}: {e.response.text}"
        print(error_message) # Or use logging
        raise RuntimeError(error_message) from e

    # A successful response may legitimately have no body (204, or an empty
    # 200); decoding it as JSON would raise, so report it as an empty result.
    if response.status_code == 204 or not response.content:
        return {}

    return response.json()

In [73]:
def create_storage_bucket():
    """Create the export bucket if it does not exist yet.

    Uses the module-level ``PROJECT_ID`` and ``EXPORT_BUCKET_NAME``. Prints
    whether the bucket was created or was already present. A failure while
    creating the bucket is reported with ``print`` and not re-raised.
    """
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(EXPORT_BUCKET_NAME)

    # A direct existence check is enough; listing every bucket in the project
    # is unnecessary for this decision.
    if bucket.exists():
        print(f"Bucket {EXPORT_BUCKET_NAME} already exists.")
        return

    try:
        bucket = storage_client.create_bucket(EXPORT_BUCKET_NAME)
        print(f"Bucket {bucket.name} created.")
    except Exception as e:
        print(f"Error creating bucket: {e}")

# Make sure the export bucket is in place before the export job is submitted.
create_storage_bucket()

Bucket haneyr-477-20250813153731-lab-data-export already exists.


In [82]:
# Request body for creating a Dataplex metadata EXPORT job that writes to the
# export bucket.
#
# The active scope marks the job as an organization-level export. To export
# only this project instead, replace the "scope" value with:
#     {"projects": [f"projects/{PROJECT_ID}"]}
request_body = {
    "type": "EXPORT",
    "export_spec": {
        "output_path": f"gs://{EXPORT_BUCKET_NAME}/",
        "scope": {
            # Sent as a real JSON boolean rather than the string "true"; the
            # API reports this field back as a boolean.
            "organizationLevel": True,
        },
    },
}

In [87]:
# Submit the export job. The create call answers with an operation document
# whose metadata.target holds the resource name of the new metadata job.
url = f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/metadataJobs"
response = call_google_api(url, "POST", request_body)
# Keep the job's resource name so its status can be fetched in a later cell.
metadata_job_target = response['metadata']['target']
pretty_json = json.dumps(response, indent=4, sort_keys=True)
print(pretty_json)

{
    "done": false,
    "metadata": {
        "@type": "type.googleapis.com/google.cloud.dataplex.v1.OperationMetadata",
        "apiVersion": "v1",
        "createTime": "2025-09-27T03:00:48.064044165Z",
        "requestedCancellation": false,
        "target": "projects/haneyr-1200-20250807004910/locations/us-central1/metadataJobs/metadata-job-703cf6c8-7515-4027-ae09-089495955bac",
        "verb": "create"
    },
    "name": "projects/haneyr-1200-20250807004910/locations/us-central1/operations/operation-1758942047873-63fbf9bf1a819-99d7eddf-4bf6238c"
}


In [90]:
# Fetch the current state of the submitted metadata job (reported under
# status.state in the response). Requires metadata_job_target from the
# job-submission cell; re-run this cell to refresh the status.
status_url = f"https://dataplex.googleapis.com/v1/{metadata_job_target}"
response = call_google_api(status_url, "GET")
pretty_json = json.dumps(response, indent=4, sort_keys=True)
print(pretty_json)

{
    "createTime": "2025-09-27T03:00:48.059153143Z",
    "exportResult": {},
    "exportSpec": {
        "outputPath": "gs://haneyr-1200-20250807004910-lab-data-export/",
        "scope": {
            "organizationLevel": true
        }
    },
    "name": "projects/haneyr-1200-20250807004910/locations/us-central1/metadataJobs/metadata-job-703cf6c8-7515-4027-ae09-089495955bac",
    "status": {
        "message": "Logs for this MetadataJob can be found at: https://console.cloud.google.com/logs/query;query=resource.type=\"dataplex.googleapis.com/MetadataJob\"\nresource.labels.location=\"us-central1\"\nresource.labels.metadata_job_id=\"metadata-job-703cf6c8-7515-4027-ae09-089495955bac\";?project=609577334843\n",
        "state": "RUNNING",
        "updateTime": "2025-09-27T03:01:26.110532Z"
    },
    "type": "EXPORT",
    "uid": "e3ed5988-110a-493f-990a-1787c507ade9",
    "updateTime": "2025-09-27T03:01:26.184859161Z"
}


In [80]:
import os
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from google.api_core.exceptions import Conflict

def _metadata_export_schema():
    """Build the external table schema: a single nullable ``entry`` record."""
    def string(name):
        return bigquery.SchemaField(name, "STRING", "NULLABLE")

    def json_field(name):
        return bigquery.SchemaField(name, "JSON", "NULLABLE")

    ancestors = bigquery.SchemaField(
        "ancestors", "RECORD", "REPEATED",
        fields=[string("name"), string("type")],
    )
    entry_source = bigquery.SchemaField(
        "entrySource", "RECORD", "NULLABLE",
        fields=[
            string("resource"),
            string("system"),
            string("platform"),
            string("displayName"),
            string("description"),
            json_field("labels"),
            ancestors,
            string("createTime"),
            string("updateTime"),
            string("location"),
        ],
    )
    entry = bigquery.SchemaField(
        "entry", "RECORD", "NULLABLE",
        fields=[
            string("name"),
            string("entryType"),
            string("createTime"),
            string("updateTime"),
            json_field("aspects"),
            string("parentEntry"),
            string("fullyQualifiedName"),
            entry_source,
        ],
    )
    return [entry]


def create_hive_partitioned_external_table(
    project_id: str,
    export_bucket_name: str,
    dataset_id: str = "dataplex_metadata",
    table_id: str = "metadata_export",
    location: str = "US",
) -> None:
    """
    Creates a Hive-partitioned external table in BigQuery.

    Checks if the dataset exists and creates it if necessary before attempting
    to create the table. The table reads newline-delimited JSON files from a
    Google Cloud Storage bucket and lets BigQuery detect the Hive partition
    keys from the object paths under the bucket root.

    Progress and failures are reported with ``print``; errors are not
    re-raised. If the dataset cannot be created the function returns without
    attempting to create the table.

    Args:
        project_id (str): Your Google Cloud project ID.
        export_bucket_name (str): The GCS bucket name containing the source data.
        dataset_id (str): BigQuery dataset that holds the table.
        table_id (str): Name of the external table to create.
        location (str): Location used only when the dataset has to be created.
    """
    client = bigquery.Client(project=project_id)
    # Client.dataset() is deprecated; build the reference explicitly instead.
    dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
    table_ref = dataset_ref.table(table_id)

    # Check for and create the dataset if it doesn't exist
    try:
        client.get_dataset(dataset_ref)
        print(f"ℹ️ Dataset '{dataset_id}' already exists.")
    except NotFound:
        print(f"ℹ️ Dataset '{dataset_id}' not found. Creating it in location '{location}'.")
        try:
            dataset = bigquery.Dataset(dataset_ref)
            dataset.location = location
            client.create_dataset(dataset, timeout=30)
            print(f"Successfully created dataset '{dataset_id}'.")
        except Exception as e:
            print(f"Failed to create dataset '{dataset_id}': {e}")
            return

    # Point the external table at every object in the bucket.
    external_config = bigquery.ExternalConfig("NEWLINE_DELIMITED_JSON")
    external_config.source_uris = [f"gs://{export_bucket_name}/*"]

    # AUTO mode infers partition key names and types from the paths that
    # follow the source URI prefix.
    hive_partitioning_options = bigquery.HivePartitioningOptions()
    hive_partitioning_options.mode = "AUTO"
    hive_partitioning_options.source_uri_prefix = f"gs://{export_bucket_name}/"
    external_config.hive_partitioning = hive_partitioning_options

    table = bigquery.Table(table_ref, schema=_metadata_export_schema())
    table.external_data_configuration = external_config

    try:
        created_table = client.create_table(table)
        print(
            f"Successfully created external table: {created_table.project}.{created_table.dataset_id}.{created_table.table_id}"
        )
    except Conflict:
        print(f"ℹ️ Table '{table_id}' already exists.")
    except Exception as e:
        print(f"An unexpected error occurred while creating the table: {e}")



# Create the dataset (if missing) and the external table over the export bucket.
create_hive_partitioned_external_table(PROJECT_ID, EXPORT_BUCKET_NAME)

ℹ️ Dataset 'dataplex_metadata' not found. Creating it in location 'US'...
✅ Successfully created dataset 'dataplex_metadata'.
✅ Successfully created external table: haneyr-1200-20250807004910.dataplex_metadata.metadata_export
