In [0]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StringType
from typing import Dict
from itertools import chain

def create_map_from_dict(mapping: Dict[str, str]):
    """Build a Spark map column whose entries are the literal pairs of ``mapping``.

    Args:
        mapping: Python dict whose keys and values are each wrapped with
            ``F.lit`` and become the keys and values of the map column.

    Returns:
        The result of ``F.create_map`` applied to the flattened
        ``[key1, value1, key2, value2, ...]`` list of literal columns.
    """
    # F.create_map expects keys and values interleaved in a single flat list.
    literals = []
    for key, value in mapping.items():
        literals.append(F.lit(key))
        literals.append(F.lit(value))
    return F.create_map(literals)

def trim_all_string_columns(df: DataFrame) -> DataFrame:
    """Return ``df`` with ``F.trim`` applied to every ``StringType`` column.

    Columns of any other type are passed through unchanged. Column order and
    column names are preserved.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the same columns, where each string column has
        been replaced by its trimmed value.
    """
    string_columns = {
        field.name for field in df.schema.fields if isinstance(field.dataType, StringType)
    }

    def _quoted(name: str) -> str:
        # Names taken from df.columns are top-level column names. Without
        # backticks, F.col would parse a dot as nested-field access and fail
        # on names such as "first.name". An embedded backtick is escaped by
        # doubling it.
        return "`" + name.replace("`", "``") + "`"

    projections = []
    for name in df.columns:
        column = F.col(_quoted(name))
        if name in string_columns:
            # alias() restores the original name, which trim() would otherwise replace.
            column = F.trim(column).alias(name)
        projections.append(column)
    return df.select(*projections)

def standardize_key(df: DataFrame, col: str, new_col: str) -> DataFrame:
    """Add ``new_col`` holding a normalized copy of the key stored in ``col``.

    Every character outside ``A-Z``, ``a-z`` and ``0-9`` is removed from the
    value of ``col`` and the remainder is upper-cased. The source column
    itself is not modified.

    Args:
        df: Input DataFrame.
        col: Name of the column holding the raw key.
        new_col: Name of the column that receives the normalized key.

    Returns:
        ``df`` with ``new_col`` added (or replaced, if it already exists).
    """
    alphanumeric_only = F.regexp_replace(F.col(col), '[^A-Za-z0-9]', '')
    return df.withColumn(new_col, F.upper(alphanumeric_only))

def normalize_gender(df: DataFrame, col: str) -> DataFrame:
    """Rewrite the gender codes in ``col`` to ``'Female'`` / ``'Male'``.

    The upper-cased value of ``col`` is looked up in a fixed map
    (``F``/``FEMALE`` -> ``Female``, ``M``/``MALE`` -> ``Male``). When the
    lookup yields no value, the original value of ``col`` is kept.

    Args:
        df: Input DataFrame.
        col: Name of the gender column; it is overwritten in place.

    Returns:
        ``df`` with ``col`` replaced by the normalized value.
    """
    canonical_by_code = {'F': 'Female', 'M': 'Male', 'FEMALE': 'Female', 'MALE': 'Male'}
    lookup = create_map_from_dict(canonical_by_code)
    raw_value = F.col(col)
    # The lookup is case-insensitive because the key is upper-cased first.
    normalized = lookup[F.upper(raw_value)]
    # coalesce falls back to the untouched value for codes not in the map.
    return df.withColumn(col, F.coalesce(normalized, raw_value))

def validate_date(df: DataFrame, col: str) -> DataFrame:
    """Replace ``col`` with the date parsed from its string form.

    The column is cast to STRING and parsed with Spark SQL ``try_to_date``
    using the pattern ``yyyy-MM-dd``. Per the Spark SQL reference,
    ``try_to_date`` yields NULL instead of raising when a value does not parse.

    Args:
        df: Input DataFrame.
        col: Name of the top-level column to validate; it is overwritten
            in place.

    Returns:
        ``df`` with ``col`` replaced by the parsed date.
    """
    # Quote the identifier before splicing it into the SQL text so that names
    # with spaces, hyphens, dots or reserved words still parse. An embedded
    # backtick is escaped by doubling it. Quoting also makes the expression
    # read the same top-level column that withColumn(col, ...) writes.
    quoted = "`" + col.replace("`", "``") + "`"
    return df.withColumn(col, F.expr(f"try_to_date(CAST({quoted} AS STRING), 'yyyy-MM-dd')"))


In [0]:
def process_erp_cust_az12(bronze_table: str, silver_table: str):
    """Clean the bronze ERP customer table and save the result as the silver table.

    Steps, in order: trim all string columns, derive ``std_CID`` from ``CID``,
    validate ``BDATE`` as a date, normalize ``GEN``, rename the columns to
    their silver names, fix the column order, overwrite ``silver_table`` and
    display the result.

    Relies on the names ``spark`` and ``display``, which are not defined in
    this file and are presumably provided by the notebook runtime.

    Args:
        bronze_table: Name of the table to read.
        silver_table: Name of the table to overwrite with the cleaned data.
    """
    # The cleaning helpers operate on the bronze column names (CID, BDATE, GEN).
    bronze_df = spark.table(bronze_table)
    trimmed_df = trim_all_string_columns(bronze_df)
    keyed_df = standardize_key(trimmed_df, 'CID', 'std_CID')
    dated_df = validate_date(keyed_df, 'BDATE')
    cleaned_df = normalize_gender(dated_df, 'GEN')

    # Rename columns for silver, one rename per bronze column.
    silver_names = {'CID': 'customer_id', 'BDATE': 'birth_date', 'GEN': 'gender'}
    silver_df = cleaned_df
    for bronze_name, silver_name in silver_names.items():
        silver_df = silver_df.withColumnRenamed(bronze_name, silver_name)

    # Keep only the silver columns, in a fixed order.
    silver_df = silver_df.select('customer_id', 'std_CID', 'birth_date', 'gender')

    silver_df.write.mode('overwrite').saveAsTable(silver_table)
    display(silver_df)

# Run the cleaning pipeline: read the bronze ERP customer table and overwrite its silver counterpart.
process_erp_cust_az12(
    bronze_table='workspace.bronze.erp_cust_az12',
    silver_table='workspace.silver.erp_cust_az12',
)