In [0]:
# Load the raw (bronze) ERP product-category table and preview it before profiling.
df = spark.read.table('workspace.bronze.erp_px_cat_g1v2')
display(df)

In [0]:
%sql
-- Duplicate check: list every (ID, CAT, SUBCAT, MAINTENANCE) combination that
-- appears more than once in the bronze table, together with how often it occurs.
-- An empty result means no row is repeated across these four columns.
SELECT
  ID,
  CAT,
  SUBCAT,
  MAINTENANCE,
  COUNT(*) AS duplicate_count
FROM workspace.bronze.erp_px_cat_g1v2
GROUP BY
  ID,
  CAT,
  SUBCAT,
  MAINTENANCE
HAVING COUNT(*) > 1

In [0]:
%sql
-- Whitespace check: return the rows where at least one of the four columns
-- changes when passed through TRIM, and mark each offending column with
-- 'extra_spaces' in its companion *_flag column ('' otherwise).
-- The flags are computed once in the CTE and the outer query filters on them,
-- so the detection predicate is not repeated in the WHERE clause.
-- NULL values yield an empty flag and are therefore never reported here.
-- NOTE(review): TRIM presumably strips space characters only, so leading or
-- trailing tabs/newlines would not be flagged - confirm whether that matters.
WITH flagged AS (
  SELECT
    ID,
    CAT,
    SUBCAT,
    MAINTENANCE,
    CASE WHEN ID != TRIM(ID) OR LENGTH(ID) != LENGTH(TRIM(ID)) THEN 'extra_spaces' ELSE '' END AS ID_flag,
    CASE WHEN CAT != TRIM(CAT) OR LENGTH(CAT) != LENGTH(TRIM(CAT)) THEN 'extra_spaces' ELSE '' END AS CAT_flag,
    CASE WHEN SUBCAT != TRIM(SUBCAT) OR LENGTH(SUBCAT) != LENGTH(TRIM(SUBCAT)) THEN 'extra_spaces' ELSE '' END AS SUBCAT_flag,
    CASE WHEN MAINTENANCE != TRIM(MAINTENANCE) OR LENGTH(MAINTENANCE) != LENGTH(TRIM(MAINTENANCE)) THEN 'extra_spaces' ELSE '' END AS MAINTENANCE_flag
  FROM workspace.bronze.erp_px_cat_g1v2
)
SELECT *
FROM flagged
WHERE ID_flag = 'extra_spaces'
   OR CAT_flag = 'extra_spaces'
   OR SUBCAT_flag = 'extra_spaces'
   OR MAINTENANCE_flag = 'extra_spaces'

In [0]:
# Null check: show every bronze row in which at least one of the four columns is NULL.
df = spark.read.table('workspace.bronze.erp_px_cat_g1v2')
null_in_any_column = (
    df.ID.isNull()
    | df.CAT.isNull()
    | df.SUBCAT.isNull()
    | df.MAINTENANCE.isNull()
)
display(df.filter(null_in_any_column))

In [0]:
from pyspark.sql import functions as F

def clean_erp_px_cat(df):
    """
    Project the bronze ERP category DataFrame onto the Silver-layer schema.

    The result contains exactly five columns, in this order:

    - ``product_id``:  ``ID`` with leading/trailing spaces trimmed.
    - ``std_ID``:      ``ID`` with every character outside ``A-Z``, ``a-z``
                       and ``0-9`` removed, then upper-cased.
    - ``category``:    ``CAT`` trimmed.
    - ``subcategory``: ``SUBCAT`` trimmed.
    - ``maintenance``: ``MAINTENANCE`` trimmed.

    Only these four source columns are read; any other column present in
    ``df`` is dropped by the projection.

    Args:
        df: DataFrame exposing the columns ``ID``, ``CAT``, ``SUBCAT`` and
            ``MAINTENANCE``.

    Returns:
        A new DataFrame built with a single ``select``; ``df`` is not modified.
    """
    raw_id = F.col("ID")

    # Removing every non-alphanumeric character also removes all spaces, so
    # the standardized ID needs no additional trim before upper-casing.
    alnum_only_id = F.regexp_replace(raw_id, '[^A-Za-z0-9]', '')

    projected = [
        F.trim(raw_id).alias("product_id"),
        F.upper(alnum_only_id).alias("std_ID"),
    ]

    # The remaining columns are trimmed and renamed, nothing more.
    trimmed_renames = (
        ("CAT", "category"),
        ("SUBCAT", "subcategory"),
        ("MAINTENANCE", "maintenance"),
    )
    for source_name, target_name in trimmed_renames:
        projected.append(F.trim(F.col(source_name)).alias(target_name))

    return df.select(*projected)

# ---------------------------------------------------------
# Execution Flow: bronze -> silver
# ---------------------------------------------------------

# Destination table that receives the cleaned output.
target_table = 'workspace.silver.erp_px_cat_g1v2'

# 1. Read the raw bronze table.
bronze_df = spark.read.table('workspace.bronze.erp_px_cat_g1v2')

# 2. Transform: one select-based projection, see clean_erp_px_cat.
silver_df = clean_erp_px_cat(bronze_df)

# 3. Write: replace whatever the target table currently holds.
silver_df.write.saveAsTable(target_table, mode='overwrite')

# 4. Verify: read back from the target (not from silver_df) to confirm the write.
display(spark.read.table(target_table))