In [0]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StringType
from typing import Dict, Optional
from itertools import chain

# ==============================================================================
# Helper Functions (Infrastructure)
# ==============================================================================

def create_map_from_dict(mapping: Dict[str, str]):
    """Build a PySpark map column whose entries mirror a Python dict.

    The dict is flattened into alternating key/value literals
    (key1, value1, key2, value2, ...) in the dict's iteration order, and that
    flat list is handed to ``F.create_map``.

    Args:
        mapping: Python dict whose keys and values become map literals.

    Returns:
        The column expression produced by ``F.create_map``.
    """
    literals = []
    for key, value in mapping.items():
        literals.append(F.lit(key))
        literals.append(F.lit(value))
    return F.create_map(literals)

# ==============================================================================
# Core Transformation Logic
# ==============================================================================

def trim_all_string_columns(df: DataFrame) -> DataFrame:
    """Return ``df`` with every string-typed column passed through ``F.trim``.

    All columns are rebuilt in a single ``select``: string columns are trimmed
    and re-aliased to their original name, every other column is selected
    as-is, and the column order of ``df`` is kept.
    """
    # Names of the columns whose schema type is StringType.
    string_names = set()
    for field in df.schema.fields:
        if isinstance(field.dataType, StringType):
            string_names.add(field.name)

    projection = []
    for name in df.columns:
        column = F.col(name)
        if name in string_names:
            column = F.trim(column).alias(name)
        projection.append(column)

    return df.select(*projection)

def standardize_key(df: DataFrame, col: str, new_col: str) -> DataFrame:
    """Add ``new_col`` holding a cleaned, upper-cased copy of ``col``.

    Every character that is not an ASCII letter or digit is removed first,
    then the remainder is upper-cased. The source column is left untouched
    (unless ``new_col`` names the same column).
    """
    # Whitespace is non-alphanumeric as well, so the regex already strips it;
    # a separate trim step would be redundant.
    alnum_only = F.regexp_replace(F.col(col), '[^A-Za-z0-9]', '')
    standardized = F.upper(alnum_only)
    return df.withColumn(new_col, standardized)

def normalize_categorical(df: DataFrame, col: str, rules: Dict[str, str]) -> DataFrame:
    """
    Generic function to normalize any categorical column using a dictionary.
    Replaces specific 'normalize_gender' and 'normalize_marital_status' functions.

    The lookup is case-insensitive: the column value is upper-cased before the
    lookup, and the rule keys are upper-cased here as well so that callers do
    not have to remember to supply upper-case keys. Keys that differ only by
    case collapse into a single entry (the later one in dict order wins).

    Args:
        df: Input DataFrame.
        col: Name of the column to normalize; it is overwritten in place.
        rules: Mapping of raw value (string, any case) -> normalized value.

    Returns:
        ``df`` with ``col`` replaced by the mapped value where a rule matches,
        and by the original value otherwise. With empty/None ``rules`` the
        DataFrame is returned unchanged.
    """
    # No rule can match an empty rule set, so the column would be unchanged
    # anyway. Returning early also avoids building an empty map literal, whose
    # key type is not guaranteed to be string (depends on Spark version/config).
    if not rules:
        return df

    # The lookup key below is upper-cased, so the rule keys must be too;
    # otherwise any non-upper-case key silently never matches.
    upper_rules = {key.upper(): value for key, value in rules.items()}
    map_col = create_map_from_dict(upper_rules)

    lookup_key = F.upper(F.col(col))
    # Coalesce keeps the original value if the lookup fails (safe fallback)
    return df.withColumn(col, F.coalesce(map_col[lookup_key], F.col(col)))

def flag_invalid_numeric(df: DataFrame, col: str, min_val: Optional[int] = None, max_val: Optional[int] = None) -> DataFrame:
    """Add a boolean ``is_valid_<col>`` column describing whether ``col`` is usable as an int.

    A row is flagged valid when ``col`` cast to ``int`` is non-null and, for
    each bound that is supplied, lies within it (both bounds inclusive).

    Args:
        df: Input DataFrame.
        col: Name of the column to validate; the column itself is not modified.
        min_val: Optional inclusive lower bound; ``None`` disables the check.
        max_val: Optional inclusive upper bound; ``None`` disables the check.

    Returns:
        ``df`` with the extra column ``is_valid_<col>``.
    """
    as_int = F.col(col).cast('int')

    # Collect the individual checks, then AND them together left to right.
    checks = [as_int.isNotNull()]
    if min_val is not None:
        checks.append(as_int >= min_val)
    if max_val is not None:
        checks.append(as_int <= max_val)

    flag = checks[0]
    for check in checks[1:]:
        flag = flag & check

    return df.withColumn(f"is_valid_{col}", flag)

In [0]:
def process_crm_cust_info(bronze_table: str, silver_table: str, drop_invalid_ids: bool = True):
    """Load the CRM customer table from bronze, clean it, and overwrite the silver table.

    Steps: trim all string columns, derive ``customer_key`` from ``cst_key``,
    normalize gender and marital status through lookup dictionaries, flag
    ``cst_id`` values that are not integers in [0, 99999999], filter out
    unusable rows, project to the silver column names, and write as Delta.

    Relies on a ``spark`` session being available in the notebook's global
    scope (it is not defined in this function).

    Args:
        bronze_table: Fully qualified name of the source table.
        silver_table: Fully qualified name of the target table; it is
            overwritten on every run.
        drop_invalid_ids: When True (default), rows whose ``cst_id`` fails the
            validity check are excluded from the output. Pass False to keep
            them (the validity flag is then computed but not used).
    """
    print(f"Starting processing for {silver_table}...")

    # Define Business Rules (Dictionaries)
    gender_rules = {
        'F': 'Female', 'M': 'Male',
        'FEMALE': 'Female', 'MALE': 'Male', 'UNK': 'Unknown'
    }
    marital_rules = {
        'S': 'Single', 'M': 'Married',
        'SINGLE': 'Single', 'MARRIED': 'Married'
    }

    # 1. Read
    df_bronze = spark.table(bronze_table)

    # 2. Transform Pipeline
    df_silver = (df_bronze
        .transform(trim_all_string_columns)
        .transform(lambda df: standardize_key(df, 'cst_key', 'customer_key'))
        .transform(lambda df: normalize_categorical(df, 'cst_gndr', gender_rules))
        .transform(lambda df: normalize_categorical(df, 'cst_marital_status', marital_rules))
        .transform(lambda df: flag_invalid_numeric(df, 'cst_id', min_val=0, max_val=99999999))
    )

    # 3. Filter (Soft Delete)
    # Filtering happens BEFORE the projection because the validity flag is not
    # part of the output contract and would otherwise be dropped unused.
    # standardize_key turns blank / punctuation-only keys into '' rather than
    # NULL, so an empty key has to be rejected explicitly as well.
    has_key = F.col("customer_key").isNotNull() & (F.col("customer_key") != "")
    df_kept = df_silver.filter(has_key)
    if drop_invalid_ids:
        df_kept = df_kept.filter(F.col("is_valid_cst_id"))

    # 4. Final Schema Projection (The Contract)
    df_final = df_kept.select(
        F.col("cst_id").alias("customer_id"),
        F.col("customer_key"),
        F.col("cst_firstname").alias("first_name"),
        F.col("cst_lastname").alias("last_name"),
        F.col("cst_marital_status").alias("marital_status"),
        F.col("cst_gndr").alias("gender"),
        F.col("cst_create_date").alias("created_date")
    )

    # 5. Write
    df_final.write.format("delta").mode("overwrite").saveAsTable(silver_table)

    print(f"Successfully wrote to {silver_table}")

# ==============================================================================
# Execution
# ==============================================================================
# Run the bronze -> silver load for the CRM customer table.
# Note: this call executes whenever the cell is run and overwrites the silver table.
process_crm_cust_info(
    bronze_table='workspace.bronze.crm_cust_info', 
    silver_table='workspace.silver.crm_cust_info')

In [0]:
# Preview the silver table written by the previous cell (read back from the catalog, not from memory).
display(spark.table('workspace.silver.crm_cust_info'))