In [8]:
import fnmatch
from pyspark.sql import SparkSession
from datetime import datetime

# -----------------------------------------------------------------------------
# Register newly landed files
#
# Lists the files under the landing folder, keeps those whose name matches
# `pattern`, and inserts one 'REGISTERED' row into meta_db.data_file for every
# filename that is not already present in that table.
# -----------------------------------------------------------------------------

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

# Data source
data_source = "raw_customer_data"
print(f"||    Processing files for data source: {data_source}")
print('||-----')

# ABFSS path and pattern
full_path = "abfss://CDSA@onelake.dfs.fabric.microsoft.com/lk_cdsa_landing_zone.Lakehouse/Files/raw_customer_data"
pattern = "*.csv"


def _sql_string_literal(text):
    """Return `text` as a single-quoted Spark SQL string literal.

    Backslashes and single quotes are backslash-escaped so that a filename
    such as "o'brien.csv" cannot terminate the literal early.
    NOTE(review): assumes the default parser setting
    spark.sql.parser.escapedStringLiterals=false -- confirm for this workspace.
    """
    return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"


# Read file metadata
file_df = spark.read.format("binaryFile").option("recursiveFileLookup", "true").load(full_path)

# Extract and filter filenames (basename only; the folder part is discarded)
all_files = [row["path"].split("/")[-1] for row in file_df.select("path").collect()]
# The lookup is recursive, so the same basename can come back from several
# subfolders. dict.fromkeys() drops the repeats while keeping first-seen order,
# so a filename is never registered twice in one run.
filtered_files = list(dict.fromkeys(f for f in all_files if fnmatch.fnmatch(f, pattern)))

# Compare against metadata table
registered_df = spark.table("meta_db.data_file").select("filename")
registered_files = [row.filename for row in registered_df.collect()]
registered_lookup = set(registered_files)  # O(1) membership test per candidate file
unregistered_files = [f for f in filtered_files if f not in registered_lookup]

# Get current max batch_id for daily_update in STARTED state
# (COALESCE makes this 0 when no such batch row exists)
batch_row = spark.sql("""
    SELECT COALESCE(MAX(batch_id), 0) AS max_id
    FROM meta_db.BATCH
    WHERE batch_name = 'daily_update' AND batch_status = 'STARTED'
""").first()
current_batch_id = batch_row["max_id"]

# Get current max file_id; new rows are numbered upwards from max + 1
file_id_row = spark.sql("SELECT COALESCE(MAX(file_id), 0) AS max_id FROM meta_db.data_file").first()
next_file_id = file_id_row["max_id"] + 1

if unregistered_files:
    now_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # One VALUES tuple per new file, in the positional column order of
    # meta_db.data_file. The trailing SQL comments name each column.
    row_template = """(
                {file_id},    -- file_id
                NULL,                  -- object_id
                NULL,                  -- file_pattern
                NULL,                  -- file_date
                TIMESTAMP('{now_ts}'), -- file_received_date
                'REGISTERED',          -- file_status
                NULL,                  -- file_byte_size
                {filename_literal},    -- filename
                NULL,                  -- expected_row_count
                NULL,                  -- row_count
                NULL,                  -- fm_file_id
                NULL,                  -- fm_good_record_count
                NULL,                  -- fm_error_record_count
                NULL,                  -- stg_good_record_count
                NULL,                  -- file_path
                {batch_id},    -- batch_id
                TIMESTAMP('{now_ts}'), -- created_date
                'system',              -- created_by
                TIMESTAMP('{now_ts}'), -- modified_date
                'system'               -- modified_by
            )"""
    value_rows = [
        row_template.format(
            file_id=next_file_id + i,
            now_ts=now_ts,
            filename_literal=_sql_string_literal(filename),
            batch_id=current_batch_id,
        )
        for i, filename in enumerate(unregistered_files)
    ]

    # A single multi-row INSERT: all new files are registered by one statement
    # instead of one statement per file.
    spark.sql(
        "INSERT INTO meta_db.data_file\n            VALUES "
        + ",\n            ".join(value_rows)
    )
    print(f"||    Registered {len(unregistered_files)} new files.")
else:
    print("||    No new files to register.")

print('||-----')
print('||----------------SUCCESS----------------||')
print('||-----')


StatementMeta(, 831a4ec0-3d26-4d48-a00e-a7007fb0d5b9, 10, Finished, Available, Finished)

||    Processing files for data source: raw_customer_data
||-----
||    Registered 1 new files.
||-----
||----------------SUCCESS----------------||
||-----


In [9]:
%%sql
-- Show every row currently held in the file-tracking table.
SELECT *
FROM lk_cdsa_bronze.meta_db.data_file;

-- Manual reset, intentionally left commented out: it would remove ALL rows.
-- DELETE FROM lk_cdsa_bronze.meta_db.data_file;

StatementMeta(, 831a4ec0-3d26-4d48-a00e-a7007fb0d5b9, 11, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 20 fields>

In [14]:
#------------------------------------------------------------------------------------------
# Verify Meta - Display meta table records
#------------------------------------------------------------------------------------------
# Prints the names of the tables in meta_db, then renders the contents of each
# one with display() for manual inspection.

# Set the database context
# (this changes the session's current database for every cell that runs afterwards)
spark.catalog.setCurrentDatabase("meta_db")

# Get all meta tables in the current database.
# Catalog.listTables() also returns session-scoped temporary views; those do not
# live in meta_db, so selecting them as meta_db.<name> would fail. Skip them.
meta_tables = [t.name for t in spark.catalog.listTables() if not t.isTemporary]

print(meta_tables)

# Display records from each meta table for inspection
for table_name in meta_tables:
    print(f"{table_name}:")
    # Back-quote the identifier so an unusual table name cannot break the statement.
    display(spark.sql(f"SELECT * FROM meta_db.`{table_name}`"))

StatementMeta(, a2df5e4a-af58-4210-98cb-d6df8e72fb9c, 50, Finished, Available, Finished)

['batch', 'data_file', 'data_object', 'extract_config', 'file_status_lkp', 'process', 'process_log', 'process_log_detail', 'provider', 'receiver', 'source', 'source_feed', 'source_feed_column', 'source_rank', 'source_view_incremental', 'target', 'target_view', 'target_view_column_override']
batch:


SynapseWidget(Synapse.DataFrame, 286d76da-dc64-4779-90fe-fa9e8cf1a8e3)

data_file:


SynapseWidget(Synapse.DataFrame, 0aaf49a5-ff6d-4844-b5d1-ebcadcd588be)

data_object:


SynapseWidget(Synapse.DataFrame, 03e304a9-e841-4703-840a-2dc7444fbe6d)

extract_config:


SynapseWidget(Synapse.DataFrame, 372eb52b-57e4-46f9-bb66-a4230c5ce5ae)

file_status_lkp:


SynapseWidget(Synapse.DataFrame, 28c89501-384a-40c0-bd7b-88db57c2e43b)

process:


SynapseWidget(Synapse.DataFrame, 257ba7f2-d99c-4018-b85c-a57e2cb03e8a)

process_log:


SynapseWidget(Synapse.DataFrame, 4701cc87-3b11-4940-84f9-218818bf8f04)

process_log_detail:


SynapseWidget(Synapse.DataFrame, e3e0c6ea-af62-46c3-8605-31b481afb8e4)

provider:


SynapseWidget(Synapse.DataFrame, 13e6a223-d09e-47e1-a14a-9aaa847d626f)

receiver:


SynapseWidget(Synapse.DataFrame, 0d0d31a7-7828-4f6c-89aa-9da42df5fb51)

source:


SynapseWidget(Synapse.DataFrame, 0ac69e19-62df-4a44-8161-6cdef82f09cf)

source_feed:


SynapseWidget(Synapse.DataFrame, 37a2bb9d-b7bc-41d0-8d4e-fddd8c7f6076)

source_feed_column:


SynapseWidget(Synapse.DataFrame, 013f2e5e-493e-456c-ae75-ff2c1513240f)

source_rank:


SynapseWidget(Synapse.DataFrame, 44dd6352-e0a8-4ed1-bd8b-295a562c9ed5)

source_view_incremental:


SynapseWidget(Synapse.DataFrame, 2b28929a-41a2-46e5-8783-f2902aef8d6e)

target:


SynapseWidget(Synapse.DataFrame, 40eb87ea-750a-422e-a58b-b153e9fdc6ab)

target_view:


SynapseWidget(Synapse.DataFrame, 2f320ec9-834d-4447-90fd-2f5c940a925e)

target_view_column_override:


SynapseWidget(Synapse.DataFrame, 2fb8acb7-fa97-4ac5-bd0d-7ff0474bbecb)

In [13]:
%%sql

-- Scratch cell for the ingestion metadata tables (data_object, provider, source,
-- source_feed, source_feed_column). Every statement in this cell sits inside a
-- /* ... */ block comment, so running the cell as-is executes nothing; uncomment
-- one block at a time to use it.
-- NOTE(review): the seed DML further down derives new ids with
-- COALESCE(MAX(id), 0) + 1 but hard-codes provider_id / source_id = 1 in the
-- dependent inserts -- presumably it expects empty tables; confirm before re-running.

/*
-- Inspect the seeded metadata rows.
select * from lk_cdsa_bronze.meta_db.data_object;
select * from lk_cdsa_bronze.meta_db.provider;
select * from lk_cdsa_bronze.meta_db.source;
select * from lk_cdsa_bronze.meta_db.source_feed_column order by ordinal_position;
*/

/*
-- DESTRUCTIVE: unconditional deletes (no WHERE clause) that empty these four tables.
delete from lk_cdsa_bronze.meta_db.data_object;
delete from lk_cdsa_bronze.meta_db.provider;
delete from lk_cdsa_bronze.meta_db.source;
delete from lk_cdsa_bronze.meta_db.source_feed_column;
*/

/*
-----------------------------------------------------------
-- DML: DATA_OBJECT
-----------------------------------------------------------
INSERT INTO lk_cdsa_bronze.meta_db.data_object (
    object_id, object_name, object_type, object_description, database_name, schema_name, filename_pattern, filename_extension, control_file_flag, control_file_pattern, control_file_extension,
    encryption_flag, encryption_type, compression_flag, compression_type, file_encoding, column_delimiter, row_delimiter, field_count, header_row_count, footer_row_count, record_length,
    file_format, text_qualifier, error_row_threshold_pct, required_ind, file_transfer_method, landing_directory
)
SELECT
    COALESCE(MAX(object_id), 0) + 1 AS object_id, 'CDTQ_CUSTOMER_FILE', 'file', 'file input', NULL, NULL, 'cdtq_cust%', 'csv', 'N', NULL, NULL, 'N', NULL, 'N', NULL, 'UTF-8', ',', 'CRLF', 16, 1, 0, 0, 'DELIMITED', '"', 0, 'N', 'SFTP', '/Files/raw_customer_data'
FROM lk_cdsa_bronze.meta_db.data_object;

INSERT INTO lk_cdsa_bronze.meta_db.data_object (
    object_id, object_name, object_type, object_description, database_name, schema_name, filename_pattern, filename_extension, control_file_flag, control_file_pattern, control_file_extension,
    encryption_flag, encryption_type, compression_flag, compression_type, file_encoding, column_delimiter, row_delimiter, field_count, header_row_count, footer_row_count, record_length,
    file_format, text_qualifier, error_row_threshold_pct, required_ind, file_transfer_method, landing_directory
)
SELECT
    COALESCE(MAX(object_id), 0) + 1 AS object_id, 'STG_CDTQ_CUSTOMER', 'table', 'Stage table for CDTQ_CUSTOMER', 'lk_cdsa_bronze', 'bronze_db', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, NULL, NULL, 0, NULL, NULL, NULL
FROM lk_cdsa_bronze.meta_db.data_object;

INSERT INTO lk_cdsa_bronze.meta_db.data_object (
    object_id, object_name, object_type, object_description, database_name, schema_name, filename_pattern, filename_extension, control_file_flag, control_file_pattern, control_file_extension,
    encryption_flag, encryption_type, compression_flag, compression_type, file_encoding, column_delimiter, row_delimiter, field_count, header_row_count, footer_row_count, record_length,
    file_format, text_qualifier, error_row_threshold_pct, required_ind, file_transfer_method, landing_directory
)
SELECT
    COALESCE(MAX(object_id), 0) + 1 AS object_id, 'VW_STG_CDTQ_CUSTOMER', 'view', 'View for STG_CDTQ_CUSTOMER', 'lk_cdsa_bronze', 'bronze_db', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0, NULL, NULL, 0, NULL, NULL, NULL
FROM lk_cdsa_bronze.meta_db.data_object;

-----------------------------------------------------------
-- DML: PROVIDER
-----------------------------------------------------------
INSERT INTO lk_cdsa_bronze.meta_db.provider (provider_id, provider_name, provider_code)
SELECT COALESCE(MAX(provider_id), 0) + 1 AS provider_id, 'Merkle', 'MRK'
FROM lk_cdsa_bronze.meta_db.provider;

-----------------------------------------------------------
-- DML: SOURCE
-----------------------------------------------------------
INSERT INTO lk_cdsa_bronze.meta_db.source (source_id, provider_id, source_code, source_type, category, sub_category, source_system_name, feed_cd, stage_name, count_threshold, filelist_filename, cr_flag, cr_rekey_flag, receipt_frequency, receipt_day, receipt_time, post_validation_sql, purge_type, purge_max_length, purge_max_length_type)
SELECT COALESCE(MAX(source_id), 0) + 1 AS source_id, 1, 'CDTQ_CUSTOMER','file','customer','incremental','N/A','CDTQ_CUSTOMER','STG_CDTQ_CUSTOMER',9999,NULL,'N','N','daily',0,'00:00:00.000000',NULL,NULL,NULL,NULL
FROM lk_cdsa_bronze.meta_db.source;

-----------------------------------------------------------
-- DML: SOURCE_FEED
-----------------------------------------------------------
INSERT INTO lk_cdsa_bronze.meta_db.source_feed(source_id, validate_prior_to_load_ind, compression_str, date_format, time_format, timestamp_format, escape_char, escape_unenclosed_field, trim_space, null_if, error_on_column_count_mismatch, on_error, purge_str, return_failed_only, enforce_length, truncatecolumns, force_str, acceptinvchars, target_object_name, add_record_id_ind, split_filename_pattern, strip_outer_element)
SELECT 1 AS source_id, 'N', '', 'AUTO', 'AUTO', 'AUTO', 'NONE', 'NONE', 'TRUE', 'N/A', 'FALSE', 'continue', 'FALSE', 'FALSE', 'TRUE', 'FALSE', 'FALSE', 'NULL', 'VW_STG_CDTQ_CUSTOMER', 'Y', '', NULL;

-----------------------------------------------------------
-- DML: SOURCE_FEED_COLUMN
-----------------------------------------------------------
INSERT INTO lk_cdsa_bronze.meta_db.source_feed_column(source_id, ordinal_position, column_name, data_type, max_length, scale, fixed_width_start_position, fixed_width_length, column_property)
SELECT 1 AS source_id, 1, 'customer_id', 'BIGINT', NULL, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 2, 'first_name', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 3, 'last_name', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 4, 'street_address', 'VARCHAR', 150, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 5, 'city', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 6, 'state_prov', 'VARCHAR', 20, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 7, 'postal', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 8, 'company', 'VARCHAR', 150, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 9, 'job', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 10, 'email', 'VARCHAR', 150, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 11, 'random_number', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 12, 'guid', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 13, 'ipv4', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 14, 'phone', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 15, 'dob', 'VARCHAR', 50, NULL, NULL, NULL, '' UNION ALL
SELECT 1 AS source_id, 16, 'origin', 'VARCHAR', 50, NULL, NULL, NULL, ''

-- INSERT INTO target
-- (target_id, target_name, receiver_id, target_type, category, sub_category, delivery_frequency, delivery_day, delivery_time, purge_type, purge_column, purge_max_length, purge_max_length_type)
-- VALUES(164, 'VW_STG_AWSSQL_CUSTOMER', 1, 'View', 'STG', 'Account', 'DAILY', NULL, NULL, NULL, NULL, NULL, NULL);
-- INSERT INTO target
-- (target_id, target_name, receiver_id, target_type, category, sub_category, delivery_frequency, delivery_day, delivery_time, purge_type, purge_column, purge_max_length, purge_max_length_type)
-- VALUES(165, 'AWSSQL_CUSTOMER', 1, 'Table', 'MDB', 'Account', 'DAILY', NULL, NULL, NULL, NULL, NULL, NULL);

-- select * from target_view where target_id in (4,165)
-- update target_view set source_object_name = 'VW__STG_AWSSQL_CUSTOMER' where target_id = 165

-- select * from target_view where target_id = 4
-- INSERT INTO target_view
-- (target_id, view_name, source_object_name, stage_table_name, mdb_table_name, full_refresh_ind, cr_ind, archive_table_ind, fact_or_dimension, dynamic_view_ind, natural_key, order_by_cols, surrogate_key, generate_surr_key_ind, static_view_sql, archive_ret_copies, source_data_2_best, partition_by_cols, indiv_id_change_col, add_id_change_col, house_id_change_col, site_id_change_col, batch_name, check_existing, replace_existing, batch_table, id_change_table, retain_col_list, drop_stg_tables)
-- VALUES(165, 'VW_AWSSQL_CUSTOMER', 'VW__STG_AWSSQL_CUSTOMER', 'AWSSQL_CUSTOMER___sp__daily_insert_tbl', 'AWSSQL_CUSTOMER', 'Y', 'Y', 'Y', 'DIMENSION', 'Y', 'customer_id', 'file_id desc, record_id desc', '', 'N', NULL, '', 'N', 'customer_id', 'indiv_key,cr_addr_key', 'cr_addr_key', '', '', 'daily_update', '', 'N', 'BATCH', 'cr_id_change', 'create_process_log_id', 'Y');
*/

StatementMeta(, a2df5e4a-af58-4210-98cb-d6df8e72fb9c, 48, Finished, Available, Finished)

<Spark SQL result set with 3 rows and 32 fields>

<Spark SQL result set with 1 rows and 7 fields>

<Spark SQL result set with 1 rows and 24 fields>

<Spark SQL result set with 16 rows and 14 fields>