In [0]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StringType
from typing import Dict, Optional
from itertools import chain

# ==============================================================================
# Helper Functions (Infrastructure)
# ==============================================================================

def create_map_from_dict(mapping: Dict[str, str]):
    """Build a Spark map column out of a plain Python dictionary.

    Args:
        mapping: Lookup table; every key/value pair becomes one map entry.

    Returns:
        The column produced by ``F.create_map`` over the literal keys and values.
    """
    # --------------------------------------------------------------------------
    # WHAT: Emit the entries as one flat, alternating list of literals
    #       [key1, value1, key2, value2, ...] -- the layout F.create_map takes.
    # WHY:  Callers can keep lookup tables (e.g. code -> label) as ordinary
    #       dictionaries instead of hand-writing F.when().otherwise() chains.
    # --------------------------------------------------------------------------
    literal_cols = []
    for key, value in mapping.items():
        literal_cols.append(F.lit(key))
        literal_cols.append(F.lit(value))
    return F.create_map(literal_cols)

# ==============================================================================
# Core Transformation Logic
# ==============================================================================

def trim_all_string_columns(df: DataFrame) -> DataFrame:
    """Apply ``F.trim`` to every top-level string column in one projection.

    Args:
        df: Input DataFrame.

    Returns:
        A DataFrame with the same columns in the same order. Columns whose
        schema type is ``StringType`` are trimmed; all others pass through
        unchanged. Only top-level fields are inspected.
    """
    def _quoted(name: str):
        # Backtick-quote the name so that characters such as '.' are treated as
        # part of the column name rather than as struct-field access. A literal
        # backtick inside the name is escaped by doubling it.
        return F.col("`" + name.replace("`", "``") + "`")

    # --------------------------------------------------------------------------
    # WHAT: Collect the names of all top-level StringType columns.
    #       A set keeps the membership test below cheap.
    # --------------------------------------------------------------------------
    string_names = {
        field.name for field in df.schema.fields if isinstance(field.dataType, StringType)
    }

    # --------------------------------------------------------------------------
    # WHAT: Rebuild the DataFrame with ONE 'select'.
    #       - String column: F.trim(), aliased back to its original name.
    #       - Anything else: passed through as-is.
    # WHY:  A loop of df.withColumn() calls stacks one projection per column
    #       onto the plan; a single select keeps the plan flat on wide tables.
    # --------------------------------------------------------------------------
    projection = [
        F.trim(_quoted(name)).alias(name) if name in string_names else _quoted(name)
        for name in df.columns
    ]
    return df.select(*projection)

def standardize_key(df: DataFrame, col: str, new_col: str) -> DataFrame:
    """Derive an upper-cased, alphanumeric-only key column from ``col``.

    Args:
        df: Input DataFrame.
        col: Name of the column holding the raw key.
        new_col: Name of the column that receives the standardized key.

    Returns:
        ``df`` with ``new_col`` set to the standardized key.
    """
    # --------------------------------------------------------------------------
    # WHAT: Step 1 - delete every character that is not A-Z, a-z or 0-9
    #       (spaces, '-', '_', punctuation, ...).
    #       Step 2 - upper-case what is left.
    # WHY:  Values such as "Product-A " and "product_a" collapse to the same
    #       key ("PRODUCTA"), so joins on the new column are not defeated by
    #       formatting differences.
    # --------------------------------------------------------------------------
    alnum_only = F.regexp_replace(F.col(col), '[^A-Za-z0-9]', '')

    # --------------------------------------------------------------------------
    # WHAT: Write the result under 'new_col'.
    # WHY:  Using a separate column keeps the raw 'col' available for
    #       audit/debugging while 'new_col' is used for joining.
    # --------------------------------------------------------------------------
    return df.withColumn(new_col, F.upper(alnum_only))

def normalize_categorical(df: DataFrame, col: str, rules: Dict[str, str]) -> DataFrame:
    """Normalize a categorical column through a dictionary lookup.

    The column value is upper-cased and looked up in ``rules``; on a hit the
    mapped value replaces it, otherwise the original value is kept.

    Args:
        df: Input DataFrame.
        col: Name of the column to normalize (overwritten in place).
        rules: Raw code -> clean value. Keys must be UPPER-CASE, because the
            column value is upper-cased before the lookup; a lower- or
            mixed-case key can never match.

    Returns:
        ``df`` with ``col`` normalized. When ``rules`` is empty there is
        nothing to apply and ``df`` is returned unchanged.
    """
    # --------------------------------------------------------------------------
    # WHAT: Short-circuit on an empty rule set.
    # WHY:  With no rules every lookup would miss and the column would keep its
    #       value anyway; returning early avoids building and indexing an empty
    #       Spark map.
    # --------------------------------------------------------------------------
    if not rules:
        return df

    # --------------------------------------------------------------------------
    # WHAT: Convert the Python rules dictionary into a Spark Map column.
    # --------------------------------------------------------------------------
    lookup = create_map_from_dict(rules)

    # --------------------------------------------------------------------------
    # WHAT: Key-Value lookup with a fallback.
    #       lookup[F.upper(value)] tries to find the upper-cased value in the map.
    #       F.coalesce(..., value) is the "else": when the lookup yields NULL
    #       (key not found), the original value is kept.
    # WHY:  Safe standardization. An unknown code such as 'Z' is passed through
    #       rather than nulled out, so it stays visible and the dictionary can
    #       be extended later.
    # --------------------------------------------------------------------------
    original_value = F.col(col)
    return df.withColumn(col, F.coalesce(lookup[F.upper(original_value)], original_value))

def flag_invalid_numeric(df: DataFrame, col: str, min_val: Optional[int] = None, max_val: Optional[int] = None) -> DataFrame:
    """Add a boolean ``is_valid_<col>`` column marking integer-castable, in-range rows.

    Args:
        df: Input DataFrame.
        col: Name of the column to validate.
        min_val: Optional inclusive lower bound; skipped when None.
        max_val: Optional inclusive upper bound; skipped when None.

    Returns:
        ``df`` plus a column named ``is_valid_<col>`` holding the combined check.

    NOTE(review): the check relies on a failed cast producing NULL. With ANSI
    mode enabled Spark is documented to raise on invalid casts instead --
    confirm the cluster setting.
    """
    # --------------------------------------------------------------------------
    # WHAT: Cast the column to int once and reuse the expression in every check.
    # --------------------------------------------------------------------------
    as_int = F.col(col).cast('int')

    # --------------------------------------------------------------------------
    # WHAT: Gather the individual conditions.
    #       - Always: the cast produced a value (not NULL).
    #       - Optionally: value >= min_val and/or value <= max_val.
    #       Bounds are compared against None explicitly, so a bound of 0 is
    #       still applied.
    # --------------------------------------------------------------------------
    checks = [as_int.isNotNull()]
    if min_val is not None:
        checks.append(as_int >= min_val)
    if max_val is not None:
        checks.append(as_int <= max_val)

    # --------------------------------------------------------------------------
    # WHAT: Fold the conditions together with '&' (Spark's column-level AND).
    # --------------------------------------------------------------------------
    combined = checks[0]
    for extra in checks[1:]:
        combined = combined & extra

    # --------------------------------------------------------------------------
    # WHAT: Attach the result as a NEW boolean column instead of filtering.
    # WHY:  "Soft failure": bad rows are tagged, not dropped, so downstream
    #       layers can choose to exclude them or to report on them.
    # --------------------------------------------------------------------------
    return df.withColumn(f"is_valid_{col}", combined)

In [0]:
def process_crm_cust_info(bronze_table: str, silver_table: str):
    print(f"Starting processing for {silver_table}...")
    
    # ==========================================================================
    # STEP 0: DEFINE BUSINESS RULES (Configuration)
    # ==========================================================================
    # WHAT: Define dictionaries mapping raw codes to clean, human-readable values.
    # WHY:  Centralized Logic. Instead of burying these rules inside a complex SQL 
    #       'CASE WHEN' statement, we define them here. This handles dirty data 
    #       variations (e.g., 'F' and 'FEMALE' map to the same target).
    gender_rules = {
        'F': 'Female', 'M': 'Male', 
        'FEMALE': 'Female', 'MALE': 'Male', 'UNK': 'Unknown'
    }
    marital_rules = {
        'S': 'Single', 'M': 'Married',
        'SINGLE': 'Single', 'MARRIED': 'Married'
    }

    # ==========================================================================
    # STEP 1: READ (Ingestion)
    # ==========================================================================
    # WHAT: Lazy load the Bronze table into a DataFrame.
    # WHY:  'spark.table' connects to the Unity Catalog metastore, preserving 
    #       lineage and access controls defined in the platform.
    df_bronze = spark.table(bronze_table)
    
    # ==========================================================================
    # STEP 2: TRANSFORM PIPELINE (Cleaning)
    # ==========================================================================
    # WHAT: Chain operations.
    #       1. trim_all_string_columns: Removes leading/trailing whitespace.
    #       2. standardize_key: cleans 'cst_key' -> 'customer_key' (Upper + Regex).
    #       3. normalize_categorical: Applies the 'gender_rules' dict to 'cst_gndr'.
    #       4. normalize_categorical: Applies 'marital_rules' to 'cst_marital_status'.
    #       5. flag_invalid_numeric: Checks if 'cst_id' is a valid integer.
    # WHY:  Standardization. We ensure " Male " becomes "Male" (trim) and then 
    #       apply the dictionary lookup. This prepares the data for joining.
    df_silver = (df_bronze
        .transform(trim_all_string_columns)
        .transform(lambda df: standardize_key

In [0]:
# Show the contents of the table 'workspace.silver.crm_cust_info' for a visual check.
# NOTE(review): `spark` and `display` are not defined in the cells visible here --
# presumably injected by the notebook runtime; this also assumes the table has
# already been written. Confirm both.
display(spark.table('workspace.silver.crm_cust_info'))