In [2]:
# Pick the deployment environment; the meta schema name is derived from it.
# Supported values are "dev" and "test" - anything else raises ValueError.
environment = "dev"

# Environment name -> meta schema (database) name.
_meta_db_by_environment = {
    "test": "meta_test_db",
    "dev": "meta_db",
}

if environment not in _meta_db_by_environment:
    raise ValueError(f"Unknown environment: {environment}")

meta_db = _meta_db_by_environment[environment]

StatementMeta(, 6e9f8c8d-eb02-4b70-a601-54973ff66a9b, 4, Finished, Available, Finished)

#### Create meta tables

In [6]:
#---------------------------------------------------------------------------------------------
# Name: create_meta_tables.ipynb
#---------------------------------------------------------------------------------------------
# Purpose: Creates the meta delta tables in lk_cdsa_bronze.meta_db
#---------------------------------------------------------------------------------------------
# ver.  | date     | author         | change
#---------------------------------------------------------------------------------------------
# v1    | 09/02/25 | K. Hardis      | Initial Version.
#---------------------------------------------------------------------------------------------

# Creates every meta Delta table in the schema named by `meta_db` (set in the
# environment cell). Each statement is CREATE TABLE IF NOT EXISTS, so tables
# that already exist are left untouched.
#
# Relies on `spark`, `environment` and `meta_db` being defined before this
# cell runs. On failure the error is reported and re-raised so the cell (and
# any pipeline running the notebook) is marked as failed instead of SUCCESS.

print('||-------------create_meta_tables.ipynb--------------||')
print('||-----')

print('||-----')

print(f"||    Environment: {environment}")
print(f"||    Meta Schema: {meta_db}")
print('||-----')

# Audit columns appended to the end of every meta table.
AUDIT_COLUMNS = """
          created_date TIMESTAMP,
          created_by STRING,
          modified_date TIMESTAMP,
          modified_by STRING
"""

# Table name -> table-specific column definitions. Every entry ends with a
# trailing comma because AUDIT_COLUMNS is always appended after it.
# Dict order is the creation order.
META_TABLE_COLUMNS = {
    "FILE_STATUS_LKP": """
          file_status STRING NOT NULL,
          file_status_id BIGINT,
    """,
    "PROVIDER": """
          provider_id BIGINT,
          provider_name STRING,
          provider_code STRING,
    """,
    "RECEIVER": """
          receiver_id BIGINT,
          receiver_name STRING,
          receiver_code STRING,
    """,
    "BATCH": """
          batch_id BIGINT,
          batch_name STRING,
          batch_description STRING,
          batch_type STRING,
          batch_rows BIGINT,
          batch_status STRING,
          status_message STRING,
          start_time TIMESTAMP,
          end_time TIMESTAMP,
    """,
    "DATA_OBJECT": """
          object_id BIGINT,
          object_name STRING,
          object_type STRING,
          object_description STRING,
          database_name STRING,
          schema_name STRING,
          filename_pattern STRING,
          filename_extension STRING,
          control_file_flag STRING,
          control_file_pattern STRING,
          control_file_extension STRING,
          encryption_flag STRING,
          encryption_type STRING,
          compression_flag STRING,
          compression_type STRING,
          file_encoding STRING,
          column_delimiter STRING,
          row_delimiter STRING,
          field_count BIGINT,
          header_row_count BIGINT,
          footer_row_count BIGINT,
          record_length BIGINT,
          file_format STRING,
          text_qualifier STRING,
          error_row_threshold_pct BIGINT,
          required_ind STRING,
          file_transfer_method STRING,
          landing_directory STRING,
    """,
    "DATA_FILE": """
          file_id BIGINT,
          object_id BIGINT,
          file_pattern STRING,
          file_date TIMESTAMP,
          file_received_date TIMESTAMP,
          file_status STRING,
          file_byte_size BIGINT,
          filename STRING,
          expected_row_count BIGINT,
          row_count BIGINT,
          fm_file_id BIGINT,
          fm_good_record_count BIGINT,
          fm_error_record_count BIGINT,
          stg_good_record_count BIGINT,
          file_path STRING,
          batch_id BIGINT,
    """,
    "EXTRACT_CONFIG": """
          extract_id BIGINT,
          extract_name STRING,
          extract_type STRING,
          target_schema STRING,
          target_object STRING,
          location STRING,
          file_name STRING,
          file_name_has_datetime_ind STRING,
          format_type STRING,
          compression_str STRING,
          record_delimiter STRING,
          field_delimiter STRING,
          file_extension STRING,
          escape_character STRING,
          escape_unenclosed_field STRING,
          date_format STRING,
          time_format STRING,
          timestamp_format STRING,
          binary_format STRING,
          field_optionally_enclosed_by STRING,
          null_if STRING,
          overwrite STRING,
          single STRING,
          max_file_size BIGINT,
          object_schema STRING,
          object_name STRING,
          sqlquery STRING,
          header STRING,
    """,
    "PROCESS": """
          process_id BIGINT,
          parent_process_id BIGINT,
          process_name STRING,
          process_folder STRING,
          process_qualifier STRING,
          process_type STRING,
          process_group STRING,
          process_text STRING,
          process_path STRING,
    """,
    "PROCESS_LOG": """
          process_log_id BIGINT,
          process_id BIGINT,
          source_id BIGINT,
          target_id BIGINT,
          src_file_id BIGINT,
          tgt_file_id BIGINT,
          process_execute_id BIGINT,
          start_time TIMESTAMP,
          end_time TIMESTAMP,
          inserted_rows BIGINT,
          updated_rows BIGINT,
          deleted_rows BIGINT,
          error_row_count BIGINT,
          status STRING,
          status_message STRING,
          batch_id BIGINT,
    """,
    "PROCESS_LOG_DETAIL": """
          process_log_detail_id BIGINT,
          process_log_id BIGINT,
          process_name STRING,
          log_detail_message STRING,
          sql_query STRING,
          start_time TIMESTAMP,
          elapsed_duration BIGINT,
          total_row_cnt BIGINT,
    """,
    "SOURCE": """
          source_id BIGINT,
          provider_id BIGINT,
          source_code STRING,
          source_type STRING,
          category STRING,
          sub_category STRING,
          source_system_name STRING,
          feed_cd STRING,
          stage_name STRING,
          count_threshold BIGINT,
          filelist_filename STRING,
          cr_flag STRING,
          cr_rekey_flag STRING,
          receipt_frequency STRING,
          receipt_day BIGINT,
          receipt_time STRING,
          post_validation_sql STRING,
          purge_type STRING,
          purge_max_length BIGINT,
          purge_max_length_type STRING,
    """,
    "SOURCE_FEED": """
          source_id BIGINT,
          validate_prior_to_load_ind STRING,
          compression_str STRING,
          date_format STRING,
          time_format STRING,
          timestamp_format STRING,
          escape_char STRING,
          escape_unenclosed_field STRING,
          trim_space STRING,
          null_if STRING,
          error_on_column_count_mismatch STRING,
          on_error STRING,
          purge_str STRING,
          return_failed_only STRING,
          enforce_length STRING,
          truncatecolumns STRING,
          force_str STRING,
          acceptinvchars STRING,
          target_object_name STRING,
          add_record_id_ind STRING,
          split_filename_pattern STRING,
          strip_outer_element STRING,
    """,
    "SOURCE_FEED_COLUMN": """
          column_id BIGINT,
          source_id BIGINT,
          ordinal_position BIGINT,
          column_name STRING,
          data_type STRING,
          max_length BIGINT,
          scale BIGINT,
          fixed_width_start_position BIGINT,
          fixed_width_length BIGINT,
          column_property STRING,
    """,
    "SOURCE_VIEW_INCREMENTAL": """
          source_id BIGINT,
          source_code STRING,
          input_sql STRING,
          column_name STRING,
          last_pull_date STRING,
    """,
    "SOURCE_RANK": """
          source_rank BIGINT,
          source_id BIGINT,
    """,
    "TARGET": """
          target_id BIGINT,
          target_name STRING,
          receiver_id BIGINT,
          target_type STRING,
          category STRING,
          sub_category STRING,
          delivery_frequency STRING,
          delivery_day BIGINT,
          delivery_time STRING,
          purge_type STRING,
          purge_column STRING,
          purge_max_length BIGINT,
          purge_max_length_type STRING,
    """,
    "TARGET_VIEW": """
          target_id BIGINT,
          view_name STRING,
          source_object_name STRING,
          stage_table_name STRING,
          mdb_table_name STRING,
          full_refresh_ind STRING,
          cr_ind STRING,
          cr_rekey_ind STRING,
          archive_table_ind STRING,
          fact_or_dimension STRING,
          natural_key STRING,
          order_by_cols STRING,
          surrogate_key STRING,
          generate_surr_key_ind STRING,
          archive_ret_copies BIGINT,
          source_data_2_best STRING,
          partition_by_cols STRING,
          indiv_id_change_col STRING,
          add_id_change_col STRING,
          house_id_change_col STRING,
          site_id_change_col STRING,
          batch_name STRING,
          batch_table STRING,
          id_change_table STRING,
          retain_col_list STRING,
          drop_stg_tables STRING,
    """,
    "TARGET_VIEW_COLUMN_OVERRIDE": """
          view_name STRING,
          column_name STRING,
          target_id BIGINT,
          mdb_table_name STRING,
          transformation_sql STRING,
    """,
}

# Set current database context
spark.catalog.setCurrentDatabase(meta_db)

try:

    print('|| 0. Create meta tables if not exists')
    print('||-----')

    # Every table is qualified with {meta_db}, so the environment chosen in
    # the config cell is honoured for all of them (BATCH was previously
    # hard-coded to the dev schema).
    for table_name, business_columns in META_TABLE_COLUMNS.items():
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {meta_db}.{table_name} (
            {business_columns}
            {AUDIT_COLUMNS}
            ) USING DELTA
        """)

except Exception as e:
    # Report the failure and re-raise: the SUCCESS banner must never follow
    # an error, and the caller needs to see the cell fail.
    print(f"||    Error: {e}")
    print('||-----')
    print('||----------------FAILED-----------------||')
    print('||-----')
    raise
else:
    print('||-----')
    print('||----------------SUCCESS----------------||')
    print('||-----')


StatementMeta(, 6e9f8c8d-eb02-4b70-a601-54973ff66a9b, 8, Finished, Available, Finished)

||-------------create_meta_tables.ipynb--------------||
||-----
||-----
||    Environment: dev
||    Meta Schema: meta_db
||-----
|| 0. Create meta tables if not exists
||-----
||-----
||----------------SUCCESS----------------||
||-----


#### Re-initialize Meta - Drop all meta tables

In [3]:
#---------------------------------------------------------------------
# Re-initialize Meta - Drop all meta tables
#---------------------------------------------------------------------

# WARNING: destructive. Drops every non-temporary table in `meta_db` whose
# name does not end with "_new". Because this cell sits after the create
# cell, a top-to-bottom "Run All" creates the tables and then drops them.
#
# Relies on `spark`, `environment` and `meta_db` being defined by earlier
# cells. On failure the error is reported and re-raised so the cell is
# marked as failed rather than printing SUCCESS.

print('||-----')

print(f"||    Environment: {environment}")
print(f"||    Meta Schema: {meta_db}")
print('||-----')

try:

    # Set current database context
    spark.catalog.setCurrentDatabase(meta_db)

    # List the tables in the meta schema, skipping temporary entries and
    # anything suffixed "_new".
    tables_raw = spark.sql(f"SHOW TABLES IN {meta_db}")
    meta_tables = [
        row["tableName"]
        for row in tables_raw.collect()
        if not row["isTemporary"] and not row["tableName"].endswith("_new")
    ]

    # Drop each meta table
    for table_name in meta_tables:
        print(f"Dropping table: {meta_db}.{table_name}")
        spark.sql(f"DROP TABLE IF EXISTS {meta_db}.{table_name}")

except Exception as e:
    # A partial drop leaves the schema in an unknown state; surface it
    # instead of falling through to the SUCCESS banner.
    print(f"||    Error: {e}")
    print('||-----')
    print('||----------------FAILED-----------------||')
    print('||-----')
    raise
else:
    print('||-----')
    print('||----------------SUCCESS----------------||')
    print('||-----')


StatementMeta(, 6e9f8c8d-eb02-4b70-a601-54973ff66a9b, 5, Finished, Available, Finished)

||-----
||    Environment: dev
||    Meta Schema: meta_db
||-----
Dropping table: meta_db.batch
Dropping table: meta_db.data_file
Dropping table: meta_db.data_object
Dropping table: meta_db.extract_config
Dropping table: meta_db.file_status_lkp
Dropping table: meta_db.process
Dropping table: meta_db.process_log
Dropping table: meta_db.process_log_detail
Dropping table: meta_db.provider
Dropping table: meta_db.receiver
Dropping table: meta_db.source
Dropping table: meta_db.source_feed
Dropping table: meta_db.source_feed_column
Dropping table: meta_db.source_rank
Dropping table: meta_db.source_view_incremental
Dropping table: meta_db.target
Dropping table: meta_db.target_view
Dropping table: meta_db.target_view_column_override
||-----
||----------------SUCCESS----------------||
||-----


#### Verify Meta - Display records from each Meta table

In [None]:
#------------------------------------------------------------------------------------------
# Verify Meta - Display records from each Meta table
#------------------------------------------------------------------------------------------

# Displays the contents of every non-temporary table in `meta_db` whose name
# does not end with "_new", one table at a time.
#
# Relies on `spark`, `display`, `environment` and `meta_db` being available
# in the notebook session. On failure the error is reported and re-raised so
# the cell is marked as failed rather than printing SUCCESS.

print('||-----')
print(f"||    Environment: {environment}")
print(f"||    Meta Schema: {meta_db}")
print('||-----')

try:
    # List the tables in the meta schema, skipping temporary entries and
    # anything suffixed "_new".
    tables_raw = spark.sql(f"SHOW TABLES IN {meta_db}")
    meta_tables = [
        row["tableName"]
        for row in tables_raw.collect()
        if not row["isTemporary"] and not row["tableName"].endswith("_new")
    ]

    # Display records from each meta table
    for table_name in meta_tables:
        print(f"{table_name}:")

        # Run the query
        display(spark.sql(f"""
            SELECT * 
            FROM {meta_db}.{table_name} 
        """))

except Exception as e:
    # The try block covers both listing and querying, so the message is kept
    # generic; re-raise so a failed verification is not reported as SUCCESS.
    print(f"||    Error: {e}")
    print('||-----')
    print('||----------------FAILED-----------------||')
    print('||-----')
    raise
else:
    print('||-----')
    print('||----------------SUCCESS----------------||')
    print('||-----')
