In [0]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StringType
from typing import Dict, Optional
from itertools import chain

def create_map_from_dict(mapping: Dict[str, str]):
    """Build a Spark map column from a Python dictionary.

    Each key and value is wrapped in ``F.lit`` and the literals are laid out
    as ``[key1, value1, key2, value2, ...]`` in the dictionary's iteration
    order, which is the flat layout ``F.create_map`` takes.

    Args:
        mapping: Lookup pairs to embed in the map column.

    Returns:
        The result of ``F.create_map`` over the interleaved literals.
    """
    literals = []
    for key, value in mapping.items():
        literals.append(F.lit(key))
        literals.append(F.lit(value))
    return F.create_map(literals)

def trim_all_string_columns(df: DataFrame) -> DataFrame:
    """Trim spaces from both ends of every string-typed column.

    All columns are rebuilt in one ``select``, so column order and names are
    preserved; columns that are not ``StringType`` pass through unchanged.

    Args:
        df: Input DataFrame.

    Returns:
        A DataFrame with the same columns, where each ``StringType`` column
        has been passed through ``F.trim``.
    """
    string_names = set()
    for field in df.schema.fields:
        if isinstance(field.dataType, StringType):
            string_names.add(field.name)

    projection = []
    for name in df.columns:
        column = F.col(name)
        if name in string_names:
            column = F.trim(column).alias(name)
        projection.append(column)

    return df.select(*projection)

def standardize_key(df: DataFrame, col: str, new_col: str) -> DataFrame:
    """Add an upper-cased, alphanumeric-only copy of a key column.

    Every character outside ``A-Z``, ``a-z`` and ``0-9`` is removed from
    ``col`` first; the remainder is then upper-cased.

    Args:
        df: Input DataFrame.
        col: Name of the column holding the raw key.
        new_col: Name of the column that receives the standardized key.

    Returns:
        ``df`` with ``new_col`` set to the standardized key.
    """
    raw_key = F.col(col)
    alphanumeric_only = F.regexp_replace(raw_key, '[^A-Za-z0-9]', '')
    standardized = F.upper(alphanumeric_only)
    return df.withColumn(new_col, standardized)

def normalize_product_line(df: DataFrame, col: str) -> DataFrame:
    """Expand single-letter product-line codes into their full names.

    The value in ``col`` is upper-cased and looked up in a fixed code table
    (``R`` -> ``Road``, ``S`` -> ``Sport``, ``M`` -> ``Mountain``). When the
    lookup yields no match, the original value is kept as-is.

    Args:
        df: Input DataFrame.
        col: Name of the product-line column; it is overwritten in place.

    Returns:
        ``df`` with ``col`` replaced by the expanded product-line name.
    """
    code_to_name = {'R': 'Road', 'S': 'Sport', 'M': 'Mountain'}
    lookup = create_map_from_dict(code_to_name)

    original_value = F.col(col)
    expanded = lookup[F.upper(original_value)]

    # coalesce falls back to the untouched value when the lookup is NULL.
    return df.withColumn(col, F.coalesce(expanded, original_value))

def flag_invalid_numeric(df: DataFrame, col: str, min_val: Optional[float] = None, max_val: Optional[float] = None) -> DataFrame:
    """Add a boolean ``is_valid_<col>`` column instead of filtering rows out.

    A value is flagged valid when it casts to a number, is not NaN, and lies
    inside the inclusive ``[min_val, max_val]`` range (each bound is optional).

    The check is done on a ``double`` cast rather than an ``int`` cast. An
    integer cast truncates fractions toward zero before the comparison, so
    ``-0.5`` would become ``0`` and pass ``min_val=0``, and ``1000000.9``
    would pass ``max_val=1000000``. With ANSI mode off, integral values wider
    than 32 bits can also wrap around into the accepted range.

    Args:
        df: Input DataFrame.
        col: Name of the column to validate; the column itself is not modified.
        min_val: Inclusive lower bound, or ``None`` for no lower bound.
        max_val: Inclusive upper bound, or ``None`` for no upper bound.

    Returns:
        ``df`` with an extra boolean column named ``is_valid_<col>``.
    """
    numeric_val = F.col(col).cast('double')

    # NULL covers both missing values and (with ANSI mode off) values that do
    # not parse as a number. NaN is rejected explicitly because Spark orders
    # NaN above every other value, so it would satisfy a lone lower bound.
    is_valid = numeric_val.isNotNull() & ~F.isnan(numeric_val)

    if min_val is not None:
        is_valid = is_valid & (numeric_val >= min_val)
    if max_val is not None:
        is_valid = is_valid & (numeric_val <= max_val)

    return df.withColumn(f"is_valid_{col}", is_valid)

def flag_invalid_date(df: DataFrame, col: str) -> DataFrame:
    """Add a boolean ``is_valid_<col>`` column instead of filtering rows out.

    The flag is true when ``col`` is not NULL and false otherwise. No parsing
    or range check is performed here; only presence is tested.

    Args:
        df: Input DataFrame.
        col: Name of the date column to check; the column is not modified.

    Returns:
        ``df`` with an extra boolean column named ``is_valid_<col>``.
    """
    flag_name = f"is_valid_{col}"
    has_value = F.col(col).isNotNull()
    return df.withColumn(flag_name, has_value)

In [0]:
def process_crm_prd_info(bronze_table: str, silver_table: str) -> None:
    """Clean the bronze CRM product table and overwrite the silver table.

    Steps:
        1. Read ``bronze_table`` through the notebook's ``spark`` session.
        2. Trim string columns, derive ``std_prd_key``, expand product-line
           codes, and add ``is_valid_*`` flags for id, cost and start date.
        3. Project to the silver column names, keeping the validation flags.
        4. Drop rows whose standardized key is NULL.
        5. Overwrite ``silver_table`` as a Delta table and display it.

    The validation flags are part of the output. Rows with invalid values are
    kept rather than filtered, so the flags are the only way for a consumer
    of the silver table to tell them apart.

    Args:
        bronze_table: Fully qualified name of the source table.
        silver_table: Fully qualified name of the table to overwrite.
    """
    print(f"Starting processing for {silver_table}...")

    # 1. Read
    df_bronze = spark.table(bronze_table)

    # 2. Transform pipeline (rows are flagged, not filtered)
    df_silver = (df_bronze
        .transform(trim_all_string_columns)
        .transform(lambda df: standardize_key(df, 'prd_key', 'std_prd_key'))
        .transform(lambda df: normalize_product_line(df, 'prd_line'))
        .transform(lambda df: flag_invalid_numeric(df, 'prd_id', min_val=0, max_val=99999999))
        .transform(lambda df: flag_invalid_numeric(df, 'prd_cost', min_val=0, max_val=1000000))
        .transform(lambda df: flag_invalid_date(df, 'prd_start_dt'))
    )

    # 3. Final schema projection (the contract)
    df_final = df_silver.select(
        F.col("prd_id").alias("product_id"),
        F.col("prd_key").alias("product_key"),
        F.col("std_prd_key"),
        F.col("prd_nm").alias("product_name"),
        F.col("prd_cost").alias("product_cost"),
        F.col("prd_line").alias("product_line"),
        F.col("prd_start_dt").alias("start_date"),
        F.col("prd_end_dt").alias("end_date"),
        # Carry the flags from step 2 through; omitting them here would
        # throw the validation results away before the write.
        F.col("is_valid_prd_id").alias("is_valid_product_id"),
        F.col("is_valid_prd_cost").alias("is_valid_product_cost"),
        F.col("is_valid_prd_start_dt").alias("is_valid_start_date"),
    )

    # 4. Filter: rows without a standardized key are dropped
    df_final = df_final.filter(F.col("std_prd_key").isNotNull())

    # 5. Write
    # overwriteSchema lets the overwrite replace an existing table that was
    # created before the flag columns were part of the projection.
    (df_final.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(silver_table))

    print(f"Successfully wrote to {silver_table}")
    display(spark.table(silver_table))

# Run the cleaning pipeline: bronze CRM product info -> silver.
process_crm_prd_info(
    bronze_table='workspace.bronze.crm_prd_info',
    silver_table='workspace.silver.crm_prd_info',
)