In [0]:
# Load the raw bronze ERP customer table and render it for a first visual check.
df = spark.read.table('workspace.bronze.erp_cust_az12')
display(df)

In [0]:
%sql
-- Exact-duplicate check: count rows per (CID, BDATE, GEN) combination and
-- keep only the combinations that occur more than once.
WITH row_groups AS (
  SELECT
    CID,
    BDATE,
    GEN,
    COUNT(*) AS duplicate_count
  FROM workspace.bronze.erp_cust_az12
  GROUP BY CID, BDATE, GEN
)
SELECT CID, BDATE, GEN, duplicate_count
FROM row_groups
WHERE duplicate_count > 1

In [0]:
%sql
-- Padding check: list rows whose CID or GEN changes when trimmed, and label
-- which of the two columns is affected.
WITH padding AS (
  SELECT
    CID,
    BDATE,
    GEN,
    -- A value counts as padded when trimming alters its content or its length.
    (CID != TRIM(CID) OR LENGTH(CID) != LENGTH(TRIM(CID))) AS cid_padded,
    (GEN != TRIM(GEN) OR LENGTH(GEN) != LENGTH(TRIM(GEN))) AS gen_padded
  FROM workspace.bronze.erp_cust_az12
)
SELECT
  CID,
  BDATE,
  GEN,
  CASE WHEN cid_padded THEN 'extra_spaces' ELSE '' END AS CID_flag,
  CASE WHEN gen_padded THEN 'extra_spaces' ELSE '' END AS GEN_flag
FROM padding
WHERE cid_padded OR gen_padded

In [0]:
%sql
-- Profile BDATE with a single statement. A notebook SQL cell only renders the
-- result of its last statement, so a separate "missing count" query placed
-- ahead of the value listing would never be displayed.
--
-- Each distinct BDATE is listed with the number of rows that carry it. Missing
-- values show up as the NULL row, pinned to the top by NULLS FIRST, and its
-- row_count is the missing-BDATE count.
SELECT
  BDATE,
  COUNT(*) AS row_count
FROM workspace.bronze.erp_cust_az12
GROUP BY BDATE
ORDER BY BDATE DESC NULLS FIRST

In [0]:
from pyspark.sql import functions as F

# Preview the date conversion: derive birth_date from BDATE, then drop the raw
# column so only the typed value remains in this view.
df = (
    spark.table('workspace.bronze.erp_cust_az12')
    # try_to_date is used so that invalid values are tolerated instead of
    # failing the whole query.
    .withColumn('birth_date', F.expr("try_to_date(CAST(BDATE AS STRING), 'yyyy-MM-dd')"))
    .drop('BDATE')
)

display(df)

In [0]:
%sql
-- Profile GEN with a single statement. A notebook SQL cell only renders the
-- result of its last statement, so a separate "missing count" query placed
-- ahead of the value listing would never be displayed.
--
-- Each distinct GEN value is listed with the number of rows that carry it.
-- Missing values show up as the NULL row, and its row_count is the
-- missing-GEN count.
SELECT
  GEN,
  COUNT(*) AS row_count
FROM workspace.bronze.erp_cust_az12
GROUP BY GEN
ORDER BY row_count DESC

In [0]:
# List unique values and counts for GEN
from pyspark.sql import functions as F

# Collapse the long spellings to one-letter codes, then count rows per value.
gen_counts = (
    spark.table('workspace.bronze.erp_cust_az12')
    .withColumn(
        'GEN',
        F.when(F.col('GEN') == 'Male', 'M')
         .when(F.col('GEN') == 'Female', 'F')
         .otherwise(F.col('GEN')),
    )
    .groupBy('GEN')
    .count()
    .orderBy('count', ascending=False)
)
display(gen_counts)

# Flag every value that is not one of the two expected codes. This covers
# null and empty strings as well as padded, differently-cased, or unknown
# values, which an explicit null/empty test would let through unnoticed.
unique_gens = [row['GEN'] for row in gen_counts.collect()]
issues = [val for val in unique_gens if val not in ('M', 'F')]
print("Unexpected or ambiguous GEN values:", issues)

In [0]:
%sql
-- Join-friendly customer key: remove every character that is not a letter or
-- digit, then upper-case what is left. Whitespace is already removed by the
-- pattern, so no separate trim step is needed.
SELECT
  UPPER(REGEXP_REPLACE(CID, '[^A-Za-z0-9]', '')) AS std_CID,
  BDATE,
  GEN
FROM workspace.bronze.erp_cust_az12

In [0]:
from pyspark.sql import functions as F

# 1. Define Logic / UDFs at the top (or in a separate module)
def clean_bronze_customers(df):
    """
    Project bronze ERP customer rows into their Silver-layer shape.

    Args:
        df: Spark DataFrame exposing the ``CID``, ``BDATE`` and ``GEN`` columns.

    Returns:
        A DataFrame with exactly four columns:
          * ``customer_id`` - ``CID`` as stored in bronze.
          * ``std_CID``     - ``CID`` with every non-alphanumeric character
            removed, then upper-cased.
          * ``birth_date``  - ``BDATE`` cast to string and parsed with the
            ``yyyy-MM-dd`` pattern via ``try_to_date``, so invalid values are
            tolerated rather than failing the query.
          * ``gender``      - ``'M'`` / ``'F'`` for the male / female spellings,
            matched after trimming and ignoring case; any other value is
            passed through trimmed (NULL stays NULL).
    """
    # Match on a trimmed, upper-cased view of GEN so padded or differently
    # cased spellings ('Male ', 'female', ' m') map to the same code instead of
    # slipping through an exact-match comparison.
    gen_trimmed = F.trim(F.col("GEN"))
    gen_key = F.upper(gen_trimmed)

    return df.select(
        # A. Identity / renaming
        F.col("CID").alias("customer_id"),

        # B. Standardized key. The pattern already strips whitespace, so no
        #    additional trim is required.
        F.upper(F.regexp_replace(F.col("CID"), '[^A-Za-z0-9]', '')).alias("std_CID"),

        # C. Date casting (safe)
        F.expr("try_to_date(CAST(BDATE AS STRING), 'yyyy-MM-dd')").alias("birth_date"),

        # D. Categorical cleanup
        F.when(gen_key.isin('M', 'MALE'), 'M')
         .when(gen_key.isin('F', 'FEMALE'), 'F')
         .otherwise(gen_trimmed)
         .alias("gender"),

        # Note: If you have 100 other columns to keep, use:
        # *[F.trim(F.col(c)).alias(c) for c, t in df.dtypes if t == 'string' and c not in ['CID', 'BDATE', 'GEN']]
    )

# =========================================================
# Pipeline run: bronze -> silver
# =========================================================

# Destination for the cleaned rows.
target_table = 'workspace.silver.erp_cust_az12'

# Read the raw rows and apply the whole cleanup as one projection.
bronze_df = spark.table('workspace.bronze.erp_cust_az12')
silver_df = clean_bronze_customers(bronze_df)

# Persist the result, replacing whatever the target table held before.
(
    silver_df.write
    .mode('overwrite')
    .saveAsTable(target_table)
)

# Display what was stored by reading the target back, rather than
# re-evaluating the transformation.
display(spark.table(target_table))