In [1]:
# ==============================================================================
# STANDARD LIBRARIES & SPARK CONFIGURATION
# ==============================================================================

# mssparkutils: Microsoft Fabric utility toolset used for file system 
# management, secret retrieval, and cross-notebook orchestration.
from notebookutils import mssparkutils

# col: Function used to select and wrap column names for Spark transformations.
# trim: Removes leading and trailing whitespace from string-type data.
from pyspark.sql.functions import col, trim

# F: Standard alias for PySpark SQL functions. Using this namespace ensures 
# code clarity and prevents conflicts with built-in Python functions.
from pyspark.sql import functions as F

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 3, Finished, Available, Finished)

In [2]:
# Function to interface with the Spark catalog and retrieve data
def extract_table(table_name: str):
    """
    Load a catalog table as a Spark DataFrame.

    Thin wrapper around ``spark.read.table``. It relies on the ``spark``
    session object being available as a notebook-level global.

    Args:
        table_name (str): Table identifier, forwarded unchanged to
                          ``spark.read.table``. The pipeline in this notebook
                          passes three-part names such as
                          'BronzeLakeHouse.dbo.products_bronze'.

    Returns:
        pyspark.sql.dataframe.DataFrame: Whatever ``spark.read.table`` returns
        for the given name.
    """
    return spark.read.table(table_name)

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 4, Finished, Available, Finished)

In [3]:
# Function to automate whitespace removal across the entire DataFrame
def trim_all_string_cols(df):
    """
    Strip leading and trailing whitespace from every string column.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to clean.

    Returns:
        pyspark.sql.DataFrame: Result of ``df.withColumns`` where each column
        whose dtype is exactly 'string' is replaced by its trimmed value.
        Columns of any other dtype are not touched.
    """
    # df.dtypes yields (column_name, type_name) pairs; collect a replacement
    # expression for each column reported as 'string'.
    trimmed_by_name = {}
    for name, dtype in df.dtypes:
        if dtype == 'string':
            trimmed_by_name[name] = trim(col(name))

    # A single withColumns call applies all replacements at once.
    return df.withColumns(trimmed_by_name)

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 5, Finished, Available, Finished)

In [4]:
# Wrapper function to orchestrate the initial transformation phase
def trim_data(df):
    """
    Run the initial cleaning phase on a DataFrame.

    At present this phase consists of a single step: delegating to
    ``trim_all_string_cols`` to trim the string columns.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to clean.

    Returns:
        pyspark.sql.DataFrame: The value returned by ``trim_all_string_cols``.
    """
    return trim_all_string_cols(df)

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 6, Finished, Available, Finished)

In [5]:
# Function to persist DataFrame results back to the Lakehouse as a Delta Table
def load_df_to_delta(df, table_name: str, mode: str = "overwrite", raise_on_error: bool = False):
    """
    Save a DataFrame as a Delta table and report whether the write succeeded.

    Args:
        df (pyspark.sql.DataFrame): DataFrame to persist.
        table_name (str): Name of the destination table, passed to ``saveAsTable``.
        mode (str): Save mode passed to the writer. Defaults to 'overwrite'.
        raise_on_error (bool): When True, a failed write is re-raised after the
            error message is printed, so the notebook run fails visibly.
            Defaults to False, which keeps the print-and-continue behaviour.

    Returns:
        bool: True when the table was written, False when the write failed and
        ``raise_on_error`` is False. Callers should check this value; otherwise a
        failed load only shows up as a printed message.

    Raises:
        Exception: The original write error, only when ``raise_on_error`` is True.
    """
    try:
        # The 'overwriteSchema' option is always set, whatever ``mode`` is.
        (
            df.write.format("delta")
            .mode(mode)
            .option("overwriteSchema", "true")
            .saveAsTable(table_name)
        )
    except Exception as e:
        print(f"Error processing table {table_name}: {str(e)}")
        if raise_on_error:
            raise
        return False

    print(f"Table '{table_name}' loaded successfully !")
    return True


StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 7, Finished, Available, Finished)

In [6]:
# Function to identify and display records with missing mandatory product data
def check_null_columns(df):
    """
    Report rows whose ``product_id`` is NULL.

    Prints the number of affected rows and, when there is at least one,
    shows them with ``DataFrame.show`` for manual inspection. Nothing is
    returned and the input DataFrame is not modified.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to inspect. It must contain
            a ``product_id`` column.
    """
    # Only the product_id column is checked, despite the generic function name.
    missing_id = F.col("product_id").isNull()
    null_df = df.filter(missing_id)

    # count() is a Spark action: this is where the filter is actually evaluated.
    null_count = null_df.count()
    print(f"Records with null columns: {null_count}")

    # Nothing to preview when no row is affected.
    if null_count <= 0:
        return

    null_df.show()

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 8, Finished, Available, Finished)

In [7]:
# Function acting as a central hub for specific data removal or filtering logic
def clean_dirty_data(df):
    """
    Placeholder hook for record-removal business rules.

    No rule is implemented yet: the function prints a confirmation message
    and hands the input back unchanged.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to clean.

    Returns:
        pyspark.sql.DataFrame: The same object that was passed in.
    """
    # Removal rules would be applied here, before the confirmation message.
    confirmation = "Cleaning script executed ! "
    print(confirmation)
    return df


StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 9, Finished, Available, Finished)

In [8]:
# Function to validate foreign key relationships and flag orphaned records
def check_referencing_key(df, ref_table_name: str, df_col: str, ref_col: str, error_col: str): 
    """
    Flag source rows whose key has no match in a reference table.

    The source is left-joined to the distinct keys of the reference table.
    Rows without a match get 'Y' in ``error_col``; matching rows get NULL.
    Source rows whose key is NULL never match and are therefore flagged too.

    The flag column is always added, even when every key is valid, so the
    output schema does not depend on the data. Only the distinct reference
    keys are joined, so duplicate keys in the reference table cannot
    multiply source rows.

    Args:
        df (pyspark.sql.DataFrame): The source DataFrame to validate.
        ref_table_name (str): The name of the reference catalog table.
        df_col (str): The column name in the source DataFrame to check.
        ref_col (str): The column name in the reference table to match against.
        error_col (str): The name of the flag column to be created.

    Returns:
        pyspark.sql.DataFrame: All source columns plus ``error_col``.
    """

    # 1. Load only the distinct key values from the reference table.
    ref_keys_df = spark.read.table(ref_table_name).select(ref_col).distinct()

    # Alias-qualified references keep the two sides apart even when
    # df_col and ref_col share the same name.
    main_key = F.col("main." + df_col)
    ref_key = F.col("ref." + ref_col)

    # 2. Left join: keeps every source row, matched or not.
    joined_df = df.alias("main").join(
        ref_keys_df.alias("ref"),
        main_key == ref_key,
        how="left"
    )

    # 3. A NULL reference key after the join means no match was found.
    is_orphan = ref_key.isNull()
    invalid_count = joined_df.filter(is_orphan).count()

    if invalid_count > 0:
        print(f"Found {invalid_count} Keys: {df_col} not present in the referencing table: {ref_table_name}")
        print(f"{error_col} column set to Y")

        # Display the specific raw values that failed the lookup
        joined_df.filter(is_orphan).select(main_key).show()
    else:
        print(f"All {df_col} found to be valid and present in the referencing table: {ref_table_name}.\n")

    # 4. Always add the flag so the returned schema is stable.
    # when() without otherwise() yields NULL for rows that did match.
    return joined_df.withColumn(
        error_col,
        F.when(is_orphan, "Y")
    ).select("main.*", error_col)

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 10, Finished, Available, Finished)

In [9]:
# Function to verify data uniqueness and identify duplicate primary keys
def check_records_count(df, col_name: str): 
    """
    Print uniqueness diagnostics for one column.

    Prints the total number of rows, the number of distinct values in
    ``col_name``, and then shows every value that occurs more than once.
    Each figure is produced by its own Spark action. Nothing is returned.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to inspect.
        col_name (str): The column expected to act as a unique key.
    """
    key_values = df.select(col_name)

    total = key_values.count()
    print(f"Total count: {total}")

    distinct_total = key_values.distinct().count()
    print(f"Distinct count: {distinct_total}")

    # Values appearing in more than one row are the ones to investigate.
    print("Records to investigate (if any not unique key)")
    occurrences = df.groupBy(col_name).count()
    occurrences.filter("count > 1").show()

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 11, Finished, Available, Finished)

In [10]:
# Function to standardize data types and correct column naming conventions
def convert_data_type_rename_column(df):
    """
    Cast the product columns to numeric types and fix the 'lenght' spelling.

    NOTE(review): no cell in the visible notebook calls this function.

    Args:
        df (pyspark.sql.DataFrame): Product DataFrame containing the seven
            columns listed in ``target_types`` below.

    Returns:
        pyspark.sql.DataFrame: The DataFrame after the casts, with
        'product_name_lenght' and 'product_description_lenght' renamed to
        'product_name_length' and 'product_description_length'.
    """
    # 1. Casts, applied in this order. Only the photo count becomes an int;
    # everything else, including the two text-length columns, becomes a float.
    target_types = (
        ("product_name_lenght", "float"),
        ("product_description_lenght", "float"),
        ("product_photos_qty", "int"),
        ("product_weight_g", "float"),
        ("product_length_cm", "float"),
        ("product_height_cm", "float"),
        ("product_width_cm", "float"),
    )
    df_convert = df
    for name, spark_type in target_types:
        df_convert = df_convert.withColumn(name, col(name).cast(spark_type))

    # 2. Renames: correct the misspelled source headers.
    spelling_fixes = (
        ("product_name_lenght", "product_name_length"),
        ("product_description_lenght", "product_description_length"),
    )
    df_renamed = df_convert
    for misspelled, corrected in spelling_fixes:
        df_renamed = df_renamed.withColumnRenamed(misspelled, corrected)

    return df_renamed


StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 12, Finished, Available, Finished)

In [11]:
# Function to enrich the dataset with English category translations
def merge_english_translation_column(df):
    """
    Add the English category name to the product data.

    The translation table is read, trimmed, and left-joined to ``df`` on
    ``product_category_name``. Besides adding the translation, this function
    also drops the 'product_name_lenght' and 'product_description_lenght'
    columns from the result.

    Args:
        df (pyspark.sql.DataFrame): The product DataFrame. It must contain a
            'product_category_name' column.

    Returns:
        pyspark.sql.DataFrame: The source columns (minus the two dropped
        ones) plus 'product_category_name_english_ref'.
    """
    # 1. Read the translation reference table and trim its string columns.
    translation_table = "BronzeLakeHouse.dbo.product_category_name_translation_bronze"
    translate_df = trim_data(extract_table(translation_table))

    # 2. Suffix every reference column with '_ref' so that no name is shared
    # with the product DataFrame after the join.
    suffixed_cols = []
    for name in translate_df.columns:
        suffixed_cols.append(col(name).alias(name + "_ref"))
    translate_df = translate_df.select(suffixed_cols)

    # 3. Left join on the category name; keep all source columns plus the
    # English name only.
    same_category = df['product_category_name'] == translate_df['product_category_name_ref']
    joined = df.alias("main").join(translate_df.alias("ref"), same_category, how="left")
    merge_df = joined.select("main.*", "product_category_name_english_ref")

    # 4. Drop the two misspelled length columns.
    # NOTE(review): the visible pipeline never calls
    # convert_data_type_rename_column, so these columns are removed here
    # without a renamed copy being kept. Confirm that this is intended.
    return merge_df.drop("product_name_lenght", "product_description_lenght")

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 13, Finished, Available, Finished)

In [12]:
# Function to safely cast columns to Float while auditing for non-numeric noise
def cast_to_float(df, column_list):
    """
    Cast the given columns to Float and warn about values the cast will lose.

    A value counts as lost when it is not NULL before the cast but is NULL
    after it. The audit uses the cast itself rather than a regular
    expression, so the warning counts exactly the rows that turn into NULL:
    strings such as '' or '-' are reported, while inputs the cast accepts
    are not. All columns are audited in a single aggregation instead of one
    Spark job per column.

    NOTE(review): this assumes a failing cast yields NULL rather than raising;
    confirm the session's ANSI setting (``spark.sql.ansi.enabled``).

    Args:
        df (pyspark.sql.DataFrame): The input DataFrame.
        column_list (list): Names of the columns to convert to Float.
            Duplicate names are processed once.

    Returns:
        pyspark.sql.DataFrame: The DataFrame with the columns cast to Float.
        When ``column_list`` is empty the input is returned unchanged.
    """
    # De-duplicate while keeping the caller's order.
    columns = list(dict.fromkeys(column_list))
    if not columns:
        return df

    # 1. Audit phase: one conditional count per column, evaluated together.
    # count() ignores NULLs, so it only counts rows where when() fired.
    lost_counts = []
    for position, name in enumerate(columns):
        will_be_lost = F.col(name).isNotNull() & F.col(name).cast("float").isNull()
        lost_counts.append(F.count(F.when(will_be_lost, 1)).alias(f"lost_{position}"))
    audit_row = df.agg(*lost_counts).first()

    for position, name in enumerate(columns):
        invalid_count = audit_row[position]
        if invalid_count > 0:
            # Alerts the user to data quality issues that will result in NULLs after casting.
            print(f"Warning: Column '{name}' has {invalid_count} non-numeric rows. These will become NULL.")

    # 2. Transformation phase: apply every cast in one projection.
    return df.withColumns({name: F.col(name).cast("float") for name in columns})

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 14, Finished, Available, Finished)

In [13]:
# ==============================================================================
# MAIN ETL PIPELINE EXECUTION: Products Bronze to Silver
# ==============================================================================

# 1. Configuration: Bronze source table and Silver destination table
from_table_name = 'BronzeLakeHouse.dbo.products_bronze'
to_table_name = 'products_silver'

# 2. Extract: read the raw product rows
df = extract_table(from_table_name)

# 3. Clean: trim every string column, then report rows with a NULL product_id
df = trim_data(df)
check_null_columns(df)

# 4. Referential integrity: compare product categories with the translation
# table; unmatched rows are marked in 'not_in_product_translate_flag'.
df = check_referencing_key(
    df,
    ref_table_name='BronzeLakeHouse.dbo.product_category_name_translation_bronze',
    df_col='product_category_name',
    ref_col='product_category_name',
    error_col='not_in_product_translate_flag',
)

# 5. Enrichment: bring in the English category name
df = merge_english_translation_column(df)

# 6. Schema enforcement: cast the photo count and the physical measurements
# to Float; cast_to_float warns about values that cannot be converted.
columns_to_convert = [
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]
df = cast_to_float(df, column_list=columns_to_convert)

# 7. Inspect the resulting schema and check that product_id is unique
df.printSchema(1)
check_records_count(df, col_name='product_id')

# 8. Optional hook for final record-removal logic (currently disabled)
# df = clean_dirty_data(df)

# 9. Load: write the result to the Silver table
load_df_to_delta(df, table_name=to_table_name)

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 15, Finished, Available, Finished)

Records with null columns: 0
Found 623 Keys: product_category_name not present in the referencing table: BronzeLakeHouse.dbo.product_category_name_translation_bronze
not_in_product_translate_flag column set to Y
+---------------------+
|product_category_name|
+---------------------+
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
|                 NULL|
+---------------------+
only showing top 20 rows

root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_photos_qty: float (nullable = true)
 |-- product_weight_g: float (nullabl

In [14]:
# Ad-hoc check (disabled): count the rows flagged by the referential-integrity step.
# df.createOrReplaceTempView("test")
# check_df = spark.sql(""" select count(*) from test where not_in_product_translate_flag = 'Y' """)
# # check_df = spark.sql(""" select count(*) from test where not_in_product_translate_flag IS NULL """)

# check_df.show()

StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 16, Finished, Available, Finished)

In [15]:
# ==============================================================================
# SESSION TERMINATION
# ==============================================================================

# Stop the notebook's Spark session through mssparkutils once the pipeline
# cell above has run. This must stay the last cell: anything executed after
# it no longer has a running session to work with.
# NOTE(review): presumably this also hands the allocated compute back to the
# Microsoft Fabric pool right away instead of waiting for the idle timeout;
# confirm against the Fabric documentation.

mssparkutils.session.stop()


StatementMeta(, f0bd5a0c-d5b0-46e4-a6d0-b8b4e9006b24, 17, Finished, Available, Finished)