# Data catalog export

https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_table

Generated with Gemini and tested

Luis Gerardo Baeza <br>
Jun 18, 2024 <br>
Use at your own risk.

In [None]:
pip install google-cloud-datacatalog

In [None]:
from google.cloud import bigquery
from google.cloud import datacatalog_v1
import pandas as pd
import re

In [None]:
# Fill in the GCP project and the BigQuery dataset whose table schemas are exported.
project_id = ""
dataset_id = ""

# BigQuery client reads the table schemas; the Policy Tag Manager client
# resolves policy tag and taxonomy display names.
bq_client = bigquery.Client(project=project_id)
datacatalog_client = datacatalog_v1.PolicyTagManagerClient()

# Client.dataset() is deprecated; build the reference directly, using the
# client's project exactly as Client.dataset() did by default.
dataset_ref = bigquery.DatasetReference(bq_client.project, dataset_id)
tables = list(bq_client.list_tables(dataset_ref))

In [None]:
def get_policytags_info(parent):
    """Resolve a policy tag resource name to display names.

    Args:
        parent: Full policy tag resource name, expected in the form
            ``projects/{project}/locations/{location}/taxonomies/{taxonomy}/policyTags/{policy_tag}``.

    Returns:
        A ``(taxonomy_display_name, policy_tag_display_name)`` tuple.

    Raises:
        ValueError: If ``parent`` has no ``/policyTags/`` segment, so the
            taxonomy resource name cannot be derived from it.
    """
    # The taxonomy resource name is everything before the "/policyTags/" segment.
    taxonomy_path, separator, _ = parent.rpartition("/policyTags/")
    if not separator or not taxonomy_path:
        raise ValueError(f"Not a policy tag resource name: {parent!r}")

    taxonomy = datacatalog_client.get_taxonomy(name=taxonomy_path)
    policy_tag = datacatalog_client.get_policy_tag(name=parent)

    return taxonomy.display_name, policy_tag.display_name

### Print table schema information

### JSON format

In [None]:
# Build one record per table; each record nests the list of its column descriptions.
table_info = []
for table in tables:
    # Get table details
    table_obj = bq_client.get_table(dataset_ref.table(table.table_id))

    # Describe every column, resolving policy tags to "taxonomy:tag" labels.
    schema_info = []
    for field in table_obj.schema:
        policy_tags = []
        if field.policy_tags:
            for parent in field.policy_tags.names:
                taxonomy, p_tag = get_policytags_info(parent)
                policy_tags.append(f"{taxonomy}:{p_tag}")

        schema_info.append(
            dict(
                name=field.name,
                type=field.field_type,
                mode=field.mode,
                policy_tags=policy_tags,
            )
        )

    # Store the table together with its column descriptions.
    table_info.append(dict(table_id=table.table_id, schema=schema_info))
table_info

In [None]:
# Load the per-table records into a DataFrame and preview the first rows.
df = pd.DataFrame(data=table_info)
df.head(n=5)

### Expanded Format

In [None]:
# Flatten the dataset into one record per (table, column) pair.
table_info = []
for table in tables:
    table_ref = dataset_ref.table(table.table_id)
    table_obj = bq_client.get_table(table_ref)  # Get table details

    for field in table_obj.schema:
        # Reset for every column: a field without policy tags must not
        # inherit the tags resolved for a previous field of the same table.
        policy_tags = []
        if field.policy_tags:
            for parent in field.policy_tags.names:
                taxonomy, p_tag = get_policytags_info(parent)
                policy_tags.append(f"{taxonomy}:{p_tag}")

        field_obj = {
            'table': table.table_id,
            'column': field.name,
            'type': field.field_type,
            'mode': field.mode,
            'policy_tags': policy_tags,
        }
        table_info.append(field_obj)

table_info

In [None]:
# Load the per-column records into a DataFrame and preview the first 20 rows.
df = pd.DataFrame(data=table_info)
df.head(n=20)