In [12]:
# Bronze-to-Silver ETL for the customers table: helper functions and pipeline run.
# All imports used by the notebook are declared in this cell.

# Notebook utilities and PySpark imports
from notebookutils import mssparkutils
from pyspark.sql.functions import col, trim
from pyspark.sql import functions as F

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 14, Finished, Available, Finished)

In [13]:
# Function to interface with the Spark catalog and retrieve data
def extract_table(table_name: str):
    """
    Load a catalog table into a Spark DataFrame.

    Args:
        table_name (str): Name of the table to read (e.g., 'gold_sales_data').

    Returns:
        pyspark.sql.dataframe.DataFrame: DataFrame backed by the requested table.
    """
    # NOTE(review): `spark` is not defined in this notebook; presumably it is the
    # session object injected by the notebook runtime -- confirm.
    return spark.read.table(table_name)

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 15, Finished, Available, Finished)

In [14]:
# Function to automate whitespace removal across the entire DataFrame
def trim_all_string_cols(df):
    """
    Strip leading and trailing whitespace from every string-typed column.

    Columns of any other type are left untouched.

    Args:
        df (pyspark.sql.DataFrame): The input DataFrame to be cleaned.

    Returns:
        pyspark.sql.DataFrame: A new DataFrame with trimmed string values.
    """
    trimmed_exprs = {}
    for name, dtype in df.dtypes:
        # df.dtypes yields (column name, type name) pairs; only strings are trimmed
        if dtype == 'string':
            trimmed_exprs[name] = trim(col(name))

    # Replace all selected columns in a single projection
    return df.withColumns(trimmed_exprs)

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 16, Finished, Available, Finished)

In [15]:
# Wrapper function to orchestrate data cleaning tasks
def trim_data(df):
    """
    Run the whitespace clean-up stage of the pipeline.

    Delegates to `trim_all_string_cols`.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to clean.

    Returns:
        pyspark.sql.DataFrame: The result of `trim_all_string_cols(df)`.
    """
    return trim_all_string_cols(df)

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 17, Finished, Available, Finished)

In [16]:
# Function to persist DataFrame results back to the Lakehouse as a Delta Table
def load_df_to_delta(df, table_name: str, mode: str = "overwrite"):
    """
    Persist a DataFrame as a Delta table.

    Args:
        df (pyspark.sql.DataFrame): DataFrame to be written.
        table_name (str): Name of the destination table.
        mode (str): Save mode handed to the DataFrame writer. Defaults to
            'overwrite', which replaces the table.

    Raises:
        Exception: Any error raised by the write is logged and then re-raised,
            so the calling cell stops instead of carrying on as if the table
            had been loaded.
    """
    try:
        (
            df.write.format("delta")
            .mode(mode)
            # Let the write replace the table schema as well as the data
            .option("overwriteSchema", "true")
            .saveAsTable(table_name)
        )
    except Exception as e:
        print(f"Error processing table {table_name}: {str(e)}")
        # Do not swallow the failure: a load that did not happen must not look
        # like a successful run to whatever orchestrates this notebook.
        raise

    print(f"Table '{table_name}' loaded successfully !")


StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 18, Finished, Available, Finished)

In [17]:
# Function to perform Data Quality (DQ) checks for null values in key customer fields
def check_null_columns(df, columns=None):
    """
    Count and preview the records that have a NULL in any mandatory column.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to be inspected.
        columns (str | list[str] | None): Column name(s) that must not be NULL.
            When omitted, the customer identity and location columns are
            checked: customer_id, customer_unique_id, customer_zip_code_prefix,
            customer_city and customer_state.

    Returns:
        None. The number of offending records is printed and, when it is
        non-zero, those records are previewed with `show()`.
    """
    if columns is None:
        mandatory_cols = [
            "customer_id",
            "customer_unique_id",
            "customer_zip_code_prefix",
            "customer_city",
            "customer_state",
        ]
    elif isinstance(columns, str):
        # A single name must not be split into characters by list()
        mandatory_cols = [columns]
    else:
        mandatory_cols = list(columns)

    # Nothing to validate: report zero rather than building an empty filter
    if not mandatory_cols:
        print("Records with null columns: 0")
        return

    # A record is flagged when ANY of the mandatory columns is NULL
    any_null = F.col(mandatory_cols[0]).isNull()
    for name in mandatory_cols[1:]:
        any_null = any_null | F.col(name).isNull()

    null_df = df.filter(any_null)

    null_count = null_df.count()
    print(f"Records with null columns: {null_count}")

    if null_count > 0:
        # Reuse the same filtered DataFrame to preview the offending rows
        null_df.show()

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 19, Finished, Available, Finished)

In [18]:
# Function acting as a central hub for data cleaning and filtering logic
def clean_dirty_data(df):
    """
    Placeholder hook for removing dirty records.

    No cleaning rule is implemented yet: the function only logs that it ran
    and hands the input back unchanged.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to clean.

    Returns:
        pyspark.sql.DataFrame: The same DataFrame that was passed in.
    """
    cleaned_df = df

    print("Cleaning script executed ! ")
    return cleaned_df


StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 20, Finished, Available, Finished)

In [19]:
# Function to validate foreign key relationships between two tables
def check_referencing_key(df, ref_table_name: str, df_col: str, ref_col: str, error_col: str): 
    """
    Validates that keys in the input DataFrame exist in a reference Spark table.
    If keys are missing, it flags those rows in a new error column.

    The source rows are never multiplied: the reference table is reduced to its
    distinct key values before joining, so duplicate keys on the reference side
    cannot fan out the result.

    Args:
        df (pyspark.sql.DataFrame): The source DataFrame to validate.
        ref_table_name (str): The name of the Catalog table used for validation.
        df_col (str): The join column name in the source DataFrame.
        ref_col (str): The join column name in the reference table.
        error_col (str): The name of the column to create for flagging errors (e.g., 'is_invalid').

    Returns:
        pyspark.sql.DataFrame: When at least one key is missing from the
        reference table, the source columns plus `error_col` ('Y' for rows
        whose key is missing, NULL otherwise). When every key is present, the
        input DataFrame is returned unchanged, WITHOUT `error_col`.
    """
    # Internal alias for the reference key. Projecting it under its own name
    # keeps the joined frame unambiguous even when df_col == ref_col.
    ref_key = "__ref_key"

    # 1. Keep only the distinct key values of the reference table: the check
    #    needs nothing else, and duplicates would otherwise duplicate source rows.
    ref_keys_df = (
        spark.read.table(ref_table_name)
        .select(F.col(ref_col).alias(ref_key))
        .distinct()
    )

    # 2. Left join so that every source row is kept; unmatched rows get a NULL
    #    on the reference side.
    joined_df = df.alias("main").join(
        ref_keys_df.alias("ref"),
        F.col(f"main.{df_col}") == F.col(f"ref.{ref_key}"),
        how="left",
    )

    # 3. A NULL reference key means the join found no match. Source rows whose
    #    own key is NULL never satisfy the equality and are therefore flagged too.
    is_missing = F.col(f"ref.{ref_key}").isNull()
    invalid_count = joined_df.filter(is_missing).count()

    if invalid_count > 0:
        print(f"Found {invalid_count} Keys: {df_col} not present in the referencing table: {ref_table_name}")
        print(f"{error_col} column set to Y")

        # Display the specific keys that are missing from the reference table.
        # Filter first, then select through the "main" qualifier.
        joined_df.filter(is_missing).select(F.col(f"main.{df_col}")).show()

        # 4. Add the error flag ('Y') and drop the reference-side column by
        #    selecting only "main.*" plus the flag.
        df = joined_df.withColumn(
            error_col,
            F.when(is_missing, "Y")
        ).select("main.*", error_col)

    else:
        # Confirms all data is valid according to the reference table
        print(f"All {df_col} found to be valid and present in the referencing table: {ref_table_name}.\n")

    return df

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 21, Finished, Available, Finished)

In [20]:
# Function to verify data uniqueness and identify duplicate primary keys
def check_records_count(df, col_name: str): 
    """
    Report how unique a column is.

    Prints the total number of rows, the number of distinct values in the
    column, and shows every value that occurs more than once.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to be audited.
        col_name (str): The name of the column that should be unique (e.g., a Primary Key).
    """
    key_values = df.select(col_name)

    total_rows = key_values.count()
    print(f"Total count: {total_rows}")

    distinct_rows = key_values.distinct().count()
    print(f"Distinct count: {distinct_rows}")

    # Values appearing in more than one row are the ones to investigate
    print("Records to investigate (if any not unique key)")
    occurrences = df.groupBy(col_name).count()
    occurrences.filter("count > 1").show()

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 22, Finished, Available, Finished)

In [21]:
# ==============================================================================
# MAIN ETL PIPELINE EXECUTION: Bronze to Silver
# ==============================================================================
# Runs the helper functions defined in the cells above, in order:
# extract -> trim -> data-quality checks -> load.
# `df` is rebound at each transforming step, so re-run this cell from the top
# rather than re-running individual lines.

# 1. Configuration: Define source and destination entry points in the Lakehouse
# The source is fully qualified; the destination is a bare table name.
# NOTE(review): the bare name presumably resolves against the notebook's default
# Lakehouse -- confirm.
from_table_name = 'BronzeLakeHouse.dbo.customers_bronze'
to_table_name = 'customers_silver'

# 2. Extract: Load the raw data from the Bronze layer into a Spark DataFrame
df = extract_table(from_table_name)

# 3. Transform (Cleaning): Remove whitespace from all string columns
df = trim_data(df)

# 4. Data Quality Check: Print count of rows with null values in mandatory fields
# (prints only; the DataFrame is not modified)
check_null_columns(df)

# 5. Referential Integrity Check A: Validate customer zip codes against geolocation master data
# The returned DataFrame carries 'not_in_geo_flag' only when at least one zip
# code is missing from the reference table; otherwise `df` comes back unchanged.
df = check_referencing_key(df, 'geolocation_silver', 'customer_zip_code_prefix', 'geolocation_zip_code_prefix', 'not_in_geo_flag')

# 6. Referential Integrity Check B: Validate customer IDs against the orders table
# Same contract as step 5: 'not_in_orders_flag' exists only when mismatches were found.
df = check_referencing_key(df, 'BronzeLakeHouse.dbo.orders_bronze', 'customer_id', 'customer_id', 'not_in_orders_flag')

# 7. Uniqueness Validation: Ensure 'customer_id' remains a unique primary key after transformations
# (prints total/distinct counts and any duplicated keys; nothing is enforced)
check_records_count(df, 'customer_id')

# 8. Conditional Cleaning: Optional hook to remove rows flagged in steps 5 & 6
# >> If dirty Record to Clean here. 
# Currently disabled; clean_dirty_data is a placeholder that returns its input unchanged.
# df = clean_dirty_data(df)

# 9. Load: Persist the final validated DataFrame as a Delta table in the Silver layer
# Uses load_df_to_delta's default mode ('overwrite').
load_df_to_delta(df, to_table_name)

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 23, Finished, Available, Finished)

Records with null columns: 0
Found 278 Keys: customer_zip_code_prefix not present in the referencing table: geolocation_silver
not_in_geo_flag column set to Y
+------------------------+
|customer_zip_code_prefix|
+------------------------+
|                   72300|
|                   11547|
|                   64605|
|                   72465|
|                   07729|
|                   72904|
|                   35408|
|                   78554|
|                   73369|
|                   08980|
|                   29949|
|                   65137|
|                   28655|
|                   73255|
|                   28388|
|                   06930|
|                   71676|
|                   64047|
|                   29949|
|                   61906|
+------------------------+
only showing top 20 rows

All customer_id found to be valid and present in the referencing table: BronzeLakeHouse.dbo.orders_bronze.

Total count: 99441
Distinct count: 99441
Records to investi

In [22]:
# df.createOrReplaceTempView("test")
# # check_df = spark.sql(""" select count(*) from test where not_in_geo_flag IS NULL """)
# check_df = spark.sql(""" select count(*) from test where not_in_orders_flag IS NULL """)

# print(check_df.show())

StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 24, Finished, Available, Finished)

In [23]:
# Stop the Spark session now that the pipeline cell has run.
# NOTE(review): cells executed after this one presumably need a new session -- confirm.
mssparkutils.session.stop()


StatementMeta(, 0d808e92-f23c-4de3-b09a-add80c9445c0, 25, Finished, Available, Finished)