In [0]:
# Load the Bronze ERP location table and render it for a first visual inspection
# before running the data-quality checks in the cells below.
df = spark.table('workspace.bronze.erp_loc_a101')
display(df)

In [0]:
%sql
-- Data-quality check: (CID, CNTRY) pairs that occur more than once in Bronze.
-- An empty result means there are no duplicated pairs.
SELECT
  CID,
  CNTRY,
  COUNT(*) AS duplicate_count
FROM
  workspace.bronze.erp_loc_a101
GROUP BY
  CID,
  CNTRY
HAVING
  COUNT(*) > 1

In [0]:
%sql
-- Data-quality check: rows whose CID or CNTRY differs from its TRIM()-ed form.
-- The flags are computed once in a CTE and the outer query keeps only rows
-- where at least one flag is set. A NULL column yields an empty flag.
WITH flagged AS (
  SELECT
    CID,
    CNTRY,
    CASE
      WHEN CID != TRIM(CID) OR LENGTH(CID) != LENGTH(TRIM(CID)) THEN 'extra_spaces'
      ELSE ''
    END AS CID_flag,
    CASE
      WHEN CNTRY != TRIM(CNTRY) OR LENGTH(CNTRY) != LENGTH(TRIM(CNTRY)) THEN 'extra_spaces'
      ELSE ''
    END AS CNTRY_flag
  FROM
    workspace.bronze.erp_loc_a101
)
SELECT
  CID,
  CNTRY,
  CID_flag,
  CNTRY_flag
FROM
  flagged
WHERE
  CID_flag = 'extra_spaces'
  OR CNTRY_flag = 'extra_spaces'

In [0]:
%sql
-- Data-quality check: rows with a missing customer id (CID) or country (CNTRY).
-- The %sql magic is required: without it this cell is parsed as Python and
-- fails with a SyntaxError when the notebook is run top to bottom.
SELECT *
FROM workspace.bronze.erp_loc_a101
WHERE CID IS NULL OR CNTRY IS NULL

In [0]:
from pyspark.sql import functions as F

def clean_erp_locations(df):
    """
    Project the Bronze ERP location frame onto its three Silver columns.

    Only the columns built below are returned; any other column present on
    ``df`` is dropped by the ``select``.

    Args:
        df: DataFrame exposing ``CID`` and ``CNTRY`` columns.

    Returns:
        The result of ``df.select(...)`` with, in this order:
        - ``customer_id``: ``CID`` trimmed.
        - ``std_CID``: ``CID`` with every character outside A-Z, a-z, 0-9
          removed, then trimmed and upper-cased (normalized key for joins).
        - ``country``: ``CNTRY`` trimmed.
    """
    raw_cid = F.col("CID")
    raw_country = F.col("CNTRY")

    # Customer id as delivered by the source, minus surrounding spaces.
    customer_id = F.trim(raw_cid).alias("customer_id")

    # Normalized key: keep only alphanumerics, then trim and upper-case.
    alnum_only = F.regexp_replace(raw_cid, '[^A-Za-z0-9]', '')
    join_key = F.upper(F.trim(alnum_only)).alias("std_CID")

    # Country, trimmed and renamed.
    country = F.trim(raw_country).alias("country")

    # Note: to pass through (and trim) further string columns, append e.g.
    # *[F.trim(F.col(c)).alias(c) for c, t in df.dtypes if t == 'string' and c not in ['CID', 'CNTRY']]
    return df.select(customer_id, join_key, country)

# ---------------------------------------------------------
# Execution Flow
# ---------------------------------------------------------

# Step 1 - source: raw ERP location rows from the Bronze layer.
bronze_df = spark.table('workspace.bronze.erp_loc_a101')

# Step 2 - transform: build the Silver columns with clean_erp_locations.
silver_df = clean_erp_locations(bronze_df)

# Step 3 - persist: overwrite the Silver table with the transformed rows.
target_table = 'workspace.silver.erp_loc_a101'
(
    silver_df.write
    .mode('overwrite')
    .saveAsTable(target_table)
)

# Step 4 - verify: read the table back so the display shows what was stored.
display(spark.table(target_table))