In [1]:
! pip install --upgrade google-cloud-dataplex

Collecting google-cloud-dataplex
  Downloading google_cloud_dataplex-2.12.0-py3-none-any.whl.metadata (9.8 kB)
Downloading google_cloud_dataplex-2.12.0-py3-none-any.whl (576 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m576.2/576.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-dataplex
Successfully installed google-cloud-dataplex-2.12.0


In [12]:
import csv
import io
import random
from google.cloud import bigquery, storage, dataplex_v1
from google.api_core import exceptions
from google.api_core import exceptions as dataplex_exceptions

# --- Configuration ---
# GCP project and BigQuery dataset that every helper below reads from and writes to.
PROJECT_ID = "bq-sme-governance-build"
DATASET_ID = "sme_raw_layer"

# Set a single region. Dataplex DQ scans are not supported in multi-regions like "US".
# The same value is used for the GCS bucket, the BigQuery dataset and the Dataplex scan parent.
LOCATION = "us-central1"

# GCS bucket that holds the CSV behind the external table.
# ensure_gcs_bucket_exists() creates it when it does not exist yet.
# NOTE: bucket names must be globally unique - change this if the derived name is already taken.
BUCKET_NAME = f"{PROJECT_ID}-lab-data-source"
# ---------------------

# Module-level clients shared by all BigQuery / GCS helpers.
bq_client = bigquery.Client(project=PROJECT_ID)
storage_client = storage.Client(project=PROJECT_ID)

# Scan resource names appended by create_data_quality_scan() and read by cleanup_dataplex_scans().
CREATED_SCAN_IDS = []


# --- Mock Data Generation ---

def get_mock_customers():
    """Return the seed rows for the 'customers' table.

    Returns:
        list[dict]: one dict per customer with the keys customer_id,
        first_name, last_name, email and join_date (ISO date string).
    """
    columns = ("customer_id", "first_name", "last_name", "email", "join_date")
    rows = (
        ("C1001", "Alice", "Smith", "alice@example.com", "2023-01-15"),
        ("C1002", "Bob", "Johnson", "bob@example.com", "2023-02-10"),
        ("C1003", "Charlie", "Brown", "charlie@example.com", "2023-03-05"),
        ("C1004", "David", "Lee", "david@example.com", "2023-04-20"),
        ("C1005", "Eve", "Davis", "eve@example.com", "2023-05-15"),
    )
    return [dict(zip(columns, row)) for row in rows]

def get_mock_products():
    """Return the seed rows for the 'products' table.

    The last row (P9999) carries an "UNKNOWN" category and a negative price.

    Returns:
        list[dict]: one dict per product with the keys product_id,
        product_name, category and unit_price (float).
    """
    columns = ("product_id", "product_name", "category", "unit_price")
    rows = (
        ("P2001", "Laptop", "Electronics", 1200.00),
        ("P2002", "Mouse", "Electronics", 25.50),
        ("P2003", "Coffee Mug", "Homeware", 15.00),
        ("P2004", "Notebook", "Stationery", 5.75),
        ("P9999", "Test Item", "UNKNOWN", -1.00),
    )
    return [dict(zip(columns, row)) for row in rows]

def get_mock_orders():
    """Return the seed rows for the 'orders' table.

    Order E106 references customer C9999, which is not among the mock customers.

    Returns:
        list[dict]: one dict per order with the keys order_id, customer_id,
        order_date (ISO date string) and status.
    """
    columns = ("order_id", "customer_id", "order_date", "status")
    rows = (
        ("E101", "C1001", "2024-05-01", "Shipped"),
        ("E102", "C1002", "2024-05-03", "Processing"),
        ("E103", "C1001", "2024-05-04", "Shipped"),
        ("E104", "C1003", "2024-05-05", "Delivered"),
        ("E105", "C1004", "2024-05-06", "Shipped"),
        ("E106", "C9999", "2024-05-07", "Pending"),
    )
    return [dict(zip(columns, row)) for row in rows]


def get_mock_order_items_csv():
    """Return the 'order_items' seed data serialised as CSV text.

    The first line is the header row; the remaining lines are the items,
    written with the csv module's default dialect.

    Returns:
        str: the CSV document.
    """
    header = ["item_id", "order_id", "product_id", "quantity"]
    items = [
        ["OI301", "E101", "P2001", 1],
        ["OI302", "E101", "P2002", 1],
        ["OI303", "E102", "P2003", 2],
        ["OI304", "E103", "P2004", 5],
        ["OI305", "E104", "P2001", 1],
        ["OI306", "E105", "P2003", 1],
        ["OI307", "E106", "P9999", 99],
    ]
    buffer = io.StringIO()
    csv_writer = csv.writer(buffer)
    csv_writer.writerow(header)
    for item in items:
        csv_writer.writerow(item)
    return buffer.getvalue()

# --- Cloud Resource Setup ---

def ensure_gcs_bucket_exists():
    """Return the bucket named BUCKET_NAME, creating it in LOCATION if it is missing.

    Returns:
        The bucket object returned by the storage client (either the
        existing bucket or the newly created one).
    """
    print(f"Checking for GCS bucket: {BUCKET_NAME}...")
    try:
        existing_bucket = storage_client.get_bucket(BUCKET_NAME)
    except exceptions.NotFound:
        print("...bucket not found, creating new bucket.")
        new_bucket = storage_client.create_bucket(BUCKET_NAME, location=LOCATION)
        print(f"...created bucket {new_bucket.name} in {new_bucket.location}")
        return new_bucket
    print("...bucket already exists.")
    return existing_bucket

def ensure_bq_dataset_exists():
    """Make sure the BigQuery dataset DATASET_ID exists in PROJECT_ID.

    Looks the dataset up and, when the lookup raises NotFound, creates it
    in LOCATION. Any other API error propagates to the caller.
    """
    # Client.dataset() is deprecated in google-cloud-bigquery; build the
    # reference explicitly instead of relying on the client's default project.
    dataset_ref = bigquery.DatasetReference(PROJECT_ID, DATASET_ID)
    print(f"Checking for BigQuery dataset: {DATASET_ID}...")
    try:
        bq_client.get_dataset(dataset_ref)
        print("...dataset already exists.")
    except exceptions.NotFound:
        print("...dataset not found, creating new dataset.")
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = LOCATION
        bq_client.create_dataset(dataset, timeout=30)
        print("...created dataset.")

def upload_to_gcs(bucket, blob_name, data_string):
    """Write a string to an object in the given bucket as text/csv.

    Args:
        bucket: bucket object exposing ``name`` and ``blob()``.
        blob_name: object path inside the bucket.
        data_string: content to upload.

    Returns:
        str: the ``gs://<bucket>/<blob_name>`` URI of the uploaded object.
    """
    print(f"Uploading {blob_name} to GCS bucket {bucket.name}...")
    bucket.blob(blob_name).upload_from_string(data_string, content_type="text/csv")
    print("...upload complete.")
    return "gs://{}/{}".format(bucket.name, blob_name)

# --- BigQuery Table Creation ---

def load_table_from_memory(table_id, data, schema):
    """Load a list of row dicts into a BigQuery table, replacing its contents.

    The load job uses WRITE_TRUNCATE. Any exception raised while submitting
    or waiting for the job is caught and printed; nothing is re-raised.

    Args:
        table_id: table name inside DATASET_ID.
        data: list of dicts, one per row.
        schema: list of bigquery.SchemaField describing the columns.
    """
    destination = "{}.{}.{}".format(PROJECT_ID, DATASET_ID, table_id)
    print(f"Starting load job for table: {destination}...")

    load_config = bigquery.LoadJobConfig()
    load_config.schema = schema
    load_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    try:
        bq_client.load_table_from_json(
            data, destination, job_config=load_config
        ).result()  # block until the job completes
        print("...load job finished.")
    except Exception as err:
        print(f"Error loading table {destination}: {err}")

def load_iceberg_table_from_memory(table_id, data, schema):
    """Drop, recreate and load a table that is intended to be Iceberg-format.

    Steps, all inside one try block: delete the table if present, create it
    with the given schema, then append the rows with a load job. Any
    exception is caught and printed; nothing is re-raised.

    NOTE(review): ``table_format`` is assigned as a plain attribute on the
    Table object. Confirm that the client library actually sends it to
    BigQuery; if it does not, the table is created as a standard table.
    Managed Iceberg tables presumably also need a connection and a storage
    URI - TODO confirm against the BigQuery documentation.

    Args:
        table_id: table name inside DATASET_ID.
        data: list of dicts, one per row.
        schema: list of bigquery.SchemaField describing the columns.
    """
    destination = "{}.{}.{}".format(PROJECT_ID, DATASET_ID, table_id)
    print(f"Creating and loading BQ-managed Iceberg table: {destination}...")
    try:
        # Start from a clean slate so the create call cannot collide.
        bq_client.delete_table(destination, not_found_ok=True)
        print("...deleted existing table (if any).")

        iceberg_table = bigquery.Table(destination, schema=schema)
        iceberg_table.table_format = "ICEBERG"
        bq_client.create_table(iceberg_table)
        print("...Iceberg table definition created.")

        append_config = bigquery.LoadJobConfig()
        append_config.schema = schema
        append_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

        bq_client.load_table_from_json(
            data, destination, job_config=append_config
        ).result()  # block until the job completes
        print("...load job finished.")
    except Exception as err:
        print(f"Error creating or loading Iceberg table {destination}: {err}")


def create_external_table(table_id, schema, gcs_uri):
    """(Re)create a BigQuery external table backed by a CSV file in GCS.

    The first row of the CSV is skipped as a header. An existing table with
    the same name is deleted first. Errors from the delete/create calls are
    caught and printed; nothing is re-raised.

    Args:
        table_id: table name inside DATASET_ID.
        schema: list of bigquery.SchemaField describing the CSV columns.
        gcs_uri: ``gs://`` URI of the CSV file.
    """
    destination = "{}.{}.{}".format(PROJECT_ID, DATASET_ID, table_id)
    print(f"Creating external table: {destination}...")

    csv_source = bigquery.ExternalConfig("CSV")
    csv_source.schema = schema
    csv_source.source_uris = [gcs_uri]
    csv_source.csv_options.skip_leading_rows = 1  # header line

    external_table = bigquery.Table(destination)
    external_table.external_data_configuration = csv_source

    try:
        bq_client.delete_table(destination, not_found_ok=True)
        print("...deleted existing table (if any).")
        bq_client.create_table(external_table)
        print("...external table created.")
    except Exception as err:
        print(f"Error creating external table {destination}: {err}")

# --- Dataplex Data Quality Functions ---

def create_data_quality_scan(project_id, table_name, rules):
    """Create an on-demand Dataplex data quality scan for one table.

    The scan is created with an on-demand trigger; this function does not
    start a run itself. On success the scan's resource name is appended to
    CREATED_SCAN_IDS. Any exception is caught and printed; nothing is
    re-raised.

    Args:
        project_id: GCP project that owns both the table and the scan.
        table_name: table inside DATASET_ID to scan.
        rules: list of DataQualityRule objects for the scan's spec.
    """
    # Scan ids use hyphens instead of underscores, plus a random 4-digit suffix.
    suffix = random.randint(1000, 9999)
    scan_id = "dq-{}-{}".format(table_name.replace('_', '-'), suffix)

    # The resource string must be fully qualified with the service name.
    bq_resource = "//bigquery.googleapis.com/projects/{}/datasets/{}/tables/{}".format(
        project_id, DATASET_ID, table_name
    )

    print(f"\nCreating Dataplex DQ scan for table: {table_name}...")

    try:
        scan_client = dataplex_v1.DataScanServiceClient()

        scan_definition = dataplex_v1.types.DataScan(
            data={"resource": bq_resource},
            data_quality_spec={"rules": rules},
            execution_spec={"trigger": {"on_demand": {}}},
        )
        create_request = dataplex_v1.CreateDataScanRequest(
            parent=f"projects/{project_id}/locations/{LOCATION}",
            data_scan=scan_definition,
            data_scan_id=scan_id,
        )

        # create_data_scan returns a long-running operation; wait for it.
        created_scan = scan_client.create_data_scan(request=create_request).result()

        print(f"...Successfully created DQ scan: {created_scan.name}")
        CREATED_SCAN_IDS.append(created_scan.name)

    except Exception as err:
        print(f"!!! An error occurred creating the DQ scan for {table_name}: {err}")
        print("Please ensure the Dataplex API is enabled in your project.")


def get_customers_dq_rules():
    """Build the data quality rules for the 'customers' table.

    Returns:
        list: three DataQualityRule objects - customer_id non-null,
        customer_id unique, and email matching a regex.
    """
    make_rule = dataplex_v1.types.DataQualityRule
    email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"

    rule_specs = [
        {
            "column": "customer_id",
            "non_null_expectation": {},
            "dimension": "VALIDITY",
            "description": "Customer ID must not be empty.",
        },
        {
            "column": "customer_id",
            "uniqueness_expectation": {},
            "dimension": "UNIQUENESS",
            "description": "Each Customer ID must be unique.",
        },
        {
            "column": "email",
            "regex_expectation": {"regex": email_pattern},
            "dimension": "VALIDITY",
            "description": "Email must be in a valid format.",
        },
    ]
    return [make_rule(**spec) for spec in rule_specs]

def get_products_dq_rules():
    """Build the data quality rules for the 'products' table.

    Returns:
        list: three DataQualityRule objects - product_id non-null,
        unit_price with a minimum of 0.01, and category restricted to a
        fixed set of values.
    """
    make_rule = dataplex_v1.types.DataQualityRule
    allowed_categories = ["Electronics", "Homeware", "Stationery"]

    rule_specs = [
        {
            "column": "product_id",
            "non_null_expectation": {},
            "dimension": "VALIDITY",
            "description": "Product ID must not be empty.",
        },
        {
            "column": "unit_price",
            "range_expectation": {"min_value": "0.01"},
            "dimension": "VALIDITY",
            "description": "Unit price must be a positive value.",
        },
        {
            "column": "category",
            "set_expectation": {"values": allowed_categories},
            "dimension": "VALIDITY",
            "description": "Category must be one of the allowed values.",
        },
    ]
    return [make_rule(**spec) for spec in rule_specs]

def get_orders_dq_rules():
    """Build the data quality rules for the 'orders' table.

    Returns:
        list: two DataQualityRule objects - order_id unique, and a row
        condition requiring customer_id to be NULL or present in the
        customers table of the same dataset.
    """
    make_rule = dataplex_v1.types.DataQualityRule
    customers_ref = "`{}.{}.customers`".format(PROJECT_ID, DATASET_ID)
    customer_fk_sql = (
        "customer_id IS NULL OR customer_id IN (SELECT customer_id FROM "
        + customers_ref
        + ")"
    )

    unique_order_id = make_rule(
        column="order_id",
        uniqueness_expectation={},
        dimension="UNIQUENESS",
        description="Each Order ID must be unique.",
    )
    customer_exists = make_rule(
        row_condition_expectation={"sql_expression": customer_fk_sql},
        dimension="CONSISTENCY",
        description="Customer ID must exist in the customers table.",
    )
    return [unique_order_id, customer_exists]

def get_order_items_dq_rules():
    """Build the data quality rules for the 'order_items' table.

    Returns:
        list: three DataQualityRule objects - quantity with a minimum of 1,
        and two row conditions requiring order_id / product_id to be NULL
        or present in the orders / products tables of the same dataset.
    """
    make_rule = dataplex_v1.types.DataQualityRule

    def _fk_condition(column, parent_table):
        # SQL predicate: the column is NULL or appears in the parent table.
        parent_ref = f"`{PROJECT_ID}.{DATASET_ID}.{parent_table}`"
        return f"{column} IS NULL OR {column} IN (SELECT {column} FROM {parent_ref})"

    rule_specs = [
        {
            "column": "quantity",
            "range_expectation": {"min_value": "1"},
            "dimension": "VALIDITY",
            "description": "Quantity must be at least 1.",
        },
        {
            "row_condition_expectation": {
                "sql_expression": _fk_condition("order_id", "orders")
            },
            "dimension": "CONSISTENCY",
            "description": "Order ID must exist in the orders table.",
        },
        {
            "row_condition_expectation": {
                "sql_expression": _fk_condition("product_id", "products")
            },
            "dimension": "CONSISTENCY",
            "description": "Product ID must exist in the products table.",
        },
    ]
    return [make_rule(**spec) for spec in rule_specs]




def main():
    """Provision the lab environment end to end.

    1. Ensure the GCS bucket and BigQuery dataset exist (abort on failure).
    2. Load 'customers' and 'products' as standard tables and 'orders' via
       the Iceberg loader.
    3. Upload the order_items CSV to GCS and expose it as an external table.
    4. Create one Dataplex data quality scan per table.
    5. Print a summary, including how many scans were actually created.

    The table and scan helpers print their own errors and do not raise, so
    this function keeps going after an individual failure.
    """
    print(f"Starting data setup for project: {PROJECT_ID}\n")

    try:
        bucket = ensure_gcs_bucket_exists()
        ensure_bq_dataset_exists()
    except Exception as e:
        print(f"Failed to create cloud resources: {e}")
        print("Please check your permissions and project configuration.")
        return

    print("\n--- Step 1: Handling Internal Tables (Load Jobs) ---")

    customers_schema = [
        bigquery.SchemaField("customer_id", "STRING", "REQUIRED"),
        bigquery.SchemaField("first_name", "STRING"),
        bigquery.SchemaField("last_name", "STRING"),
        bigquery.SchemaField("email", "STRING"),
        bigquery.SchemaField("join_date", "DATE"),
    ]
    load_table_from_memory("customers", get_mock_customers(), customers_schema)

    products_schema = [
        bigquery.SchemaField("product_id", "STRING", "REQUIRED"),
        bigquery.SchemaField("product_name", "STRING"),
        bigquery.SchemaField("category", "STRING"),
        bigquery.SchemaField("unit_price", "FLOAT"),
    ]
    load_table_from_memory("products", get_mock_products(), products_schema)

    orders_schema = [
        bigquery.SchemaField("order_id", "STRING", "REQUIRED"),
        bigquery.SchemaField("customer_id", "STRING"),
        bigquery.SchemaField("order_date", "DATE"),
        bigquery.SchemaField("status", "STRING"),
    ]
    load_iceberg_table_from_memory("orders", get_mock_orders(), orders_schema)

    print("\n--- Step 2: Handling External Tables (GCS) ---")

    order_items_gcs_uri = upload_to_gcs(
        bucket, "raw/order_items/order_items.csv", get_mock_order_items_csv()
    )

    order_items_schema = [
        bigquery.SchemaField("item_id", "STRING", "REQUIRED"),
        bigquery.SchemaField("order_id", "STRING"),
        bigquery.SchemaField("product_id", "STRING"),
        bigquery.SchemaField("quantity", "INTEGER"),
    ]
    create_external_table("order_items", order_items_schema, order_items_gcs_uri)

    print("\n--- Step 3: Creating Dataplex Data Quality Scans ---")

    # (table name, rule factory) pairs; rules are built right before each call.
    scan_plan = [
        ("customers", get_customers_dq_rules),
        ("products", get_products_dq_rules),
        ("orders", get_orders_dq_rules),
        ("order_items", get_order_items_dq_rules),
    ]
    # create_data_quality_scan() swallows its own errors and only appends to
    # CREATED_SCAN_IDS on success, so the list growth is the real success count.
    scans_before = len(CREATED_SCAN_IDS)
    for table_name, build_rules in scan_plan:
        create_data_quality_scan(PROJECT_ID, table_name, build_rules())
    scans_created = len(CREATED_SCAN_IDS) - scans_before

    print("\n--- Setup Complete! ---")
    print(f"Project: {PROJECT_ID}")
    print(f"Dataset: {DATASET_ID}")
    print("Tables created:")
    print(" - customers (Standard BQ Table)")
    print(" - products (Standard BQ Table)")
    print(" - orders (BQ-Managed Iceberg Table)")
    print(" - order_items (External CSV Table)")
    # Report the actual number instead of unconditionally claiming success.
    print(f"Dataplex DQ Scans created: {scans_created} of {len(scan_plan)}.")
    if scans_created < len(scan_plan):
        print("Some DQ scans were not created; see the errors logged above.")



def cleanup_dataplex_scans():
    """Delete every Dataplex scan whose name is recorded in CREATED_SCAN_IDS.

    A scan that is already gone (NotFound) is reported and skipped; any
    other error is printed and the loop continues with the next scan.
    Entries are not removed from CREATED_SCAN_IDS.
    """
    print("\nAttempting to delete Dataplex Data Quality Scans...")
    if len(CREATED_SCAN_IDS) == 0:
        print("...no scans were created, skipping.")
        return

    scan_client = dataplex_v1.DataScanServiceClient()
    removed = 0
    for scan_name in CREATED_SCAN_IDS:
        print(f"...deleting scan: {scan_name}")
        try:
            delete_request = dataplex_v1.DeleteDataScanRequest(name=scan_name)
            # Deletion is a long-running operation; wait for it to finish.
            scan_client.delete_data_scan(request=delete_request).result()
        except dataplex_exceptions.NotFound:
            print("...scan not found, may have already been deleted.")
        except Exception as err:
            print(f"!!! Error deleting scan {scan_name}: {err}")
        else:
            print("...scan deleted successfully.")
            removed += 1
    print(f"...deleted {removed} Dataplex scans.")


def cleanup_resources():
    """
    Tear down everything this notebook created.

    Order: Dataplex scans, then the GCS bucket (every object is deleted
    first, then the bucket), then the BigQuery dataset with its contents.
    Failures are printed and the remaining steps still run.
    WARNING: This is destructive and irreversible.
    """
    print("\n--- STARTING RESOURCE CLEANUP ---")

    cleanup_dataplex_scans()

    print(f"\nAttempting to delete GCS Bucket: {BUCKET_NAME}...")
    try:
        lab_bucket = storage_client.get_bucket(BUCKET_NAME)
        # Empty the bucket object by object before deleting the bucket itself.
        for stored_object in list(lab_bucket.list_blobs()):
            stored_object.delete()
        lab_bucket.delete()
        print(f"...bucket {BUCKET_NAME} deleted successfully.")
    except exceptions.NotFound:
        print(f"...bucket {BUCKET_NAME} not found, skipping.")
    except Exception as err:
        print(f"Error deleting bucket {BUCKET_NAME}: {err}")

    print(f"\nAttempting to delete BigQuery Dataset: {DATASET_ID}...")
    try:
        bq_client.delete_dataset(DATASET_ID, delete_contents=True, not_found_ok=True)
        print(f"...dataset {DATASET_ID} deleted successfully.")
    except Exception as err:
        print(f"Error deleting dataset {DATASET_ID}: {err}")

    print("\n--- CLEANUP COMPLETE ---")


# Entry point: runs the full setup when this cell/script is executed directly.
if __name__ == "__main__":
    main()
    # Teardown is opt-in: uncomment the call below to delete the scans, bucket and dataset right after setup.
    #cleanup_resources()

Starting data setup for project: bq-sme-governance-build

Checking for GCS bucket: bq-sme-governance-build-lab-data-source...
...bucket not found, creating new bucket.
...created bucket bq-sme-governance-build-lab-data-source in US-CENTRAL1
Checking for BigQuery dataset: sme_raw_layer...
...dataset not found, creating new dataset.
...created dataset.

--- Step 1: Handling Internal Tables (Load Jobs) ---
Starting load job for table: bq-sme-governance-build.sme_raw_layer.customers...
...load job finished.
Starting load job for table: bq-sme-governance-build.sme_raw_layer.products...
...load job finished.
Creating and loading BQ-managed Iceberg table: bq-sme-governance-build.sme_raw_layer.orders...
...deleted existing table (if any).
...Iceberg table definition created.
...load job finished.

--- Step 2: Handling External Tables (GCS) ---
Uploading raw/order_items/order_items.csv to GCS bucket bq-sme-governance-build-lab-data-source...
...upload complete.
Creating external table: bq-sme-g