In [0]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StringType
from typing import Dict
from itertools import chain

def create_map_from_dict(mapping: Dict[str, str]):
    """Build a Spark map column out of a Python dictionary.

    Args:
        mapping: Key/value pairs to embed as literal columns.

    Returns:
        The column produced by ``F.create_map`` over the literals laid out
        as key1, value1, key2, value2, ...
    """
    literals = []
    for key, value in mapping.items():
        # Keys and values are interleaved into one flat list, in dict order.
        literals.append(F.lit(key))
        literals.append(F.lit(value))
    return F.create_map(literals)

def trim_all_string_columns(df: DataFrame) -> DataFrame:
    """Return ``df`` with every ``StringType`` column passed through ``F.trim``.

    Columns of any other type are selected unchanged. Column names and their
    order are preserved.
    """
    string_names = set()
    for field in df.schema.fields:
        if isinstance(field.dataType, StringType):
            string_names.add(field.name)

    projected = []
    for name in df.columns:
        column = F.col(name)
        if name in string_names:
            # Re-alias so the trimmed column keeps its original name.
            column = F.trim(column).alias(name)
        projected.append(column)
    return df.select(*projected)

def standardize_key(df: DataFrame, col: str, new_col: str) -> DataFrame:
    """Write a normalized copy of ``col`` into ``new_col``.

    The normalized value has every character outside ``A-Z``, ``a-z`` and
    ``0-9`` removed and is then upper-cased. ``col`` itself is left untouched
    unless ``new_col`` names the same column.
    """
    alnum_only = F.regexp_replace(F.col(col), '[^A-Za-z0-9]', '')
    normalized = F.upper(alnum_only)
    return df.withColumn(new_col, normalized)

def validate_numeric(df: DataFrame, col: str, min_val: int = None, max_val: int = None) -> DataFrame:
    """Keep only the rows whose ``col`` is a non-null number within bounds.

    Args:
        df: Input DataFrame.
        col: Name of the column to check.
        min_val: Inclusive lower bound; ``None`` disables the lower check.
        max_val: Inclusive upper bound; ``None`` disables the upper check.

    Returns:
        ``df`` filtered to the rows that pass. The column itself is not cast
        or otherwise modified; the cast below is used only for the predicate.
    """
    # Compare as double rather than int: an int cast truncates toward zero,
    # so -0.5 would satisfy ``>= 0`` and 100000.9 would satisfy ``<= 100000``.
    col_val = F.col(col).cast('double')

    # Spark orders NaN above every other numeric value, so without this guard
    # a NaN would pass whenever only ``min_val`` is given.
    is_valid = col_val.isNotNull() & ~F.isnan(col_val)
    if min_val is not None:
        is_valid = is_valid & (col_val >= min_val)
    if max_val is not None:
        is_valid = is_valid & (col_val <= max_val)
    return df.filter(is_valid)

def validate_date(df: DataFrame, col: str) -> DataFrame:
    """Drop the rows in which ``col`` is null; no other check is performed."""
    has_value = F.col(col).isNotNull()
    return df.where(has_value)


In [0]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StringType
from typing import Dict
from itertools import chain

# Helper functions used below (validate_numeric, validate_date,
# trim_all_string_columns, standardize_key) are defined in the previous cell.

def process_crm_sales_details(bronze_table: str, silver_table: str):
    """Clean the bronze CRM sales-details table and overwrite the silver one.

    Steps, in order: trim string columns, derive a 4-character standardized
    product key, parse the three ``yyyyMMdd`` date fields, filter rows on
    numeric ranges, rename the ``sls_*`` columns, then write and display the
    result.

    Args:
        bronze_table: Name of the source table, read through ``spark.table``.
        silver_table: Name of the destination table, written in overwrite mode.
    """
    print(f"Processing {silver_table}...")

    # 1. Load the source and trim every string column.
    sales = trim_all_string_columns(spark.table(bronze_table))

    # 2. Standardized product key: strip non-alphanumerics and upper-case,
    #    then keep only the first 4 characters (e.g. "BIKE-123" -> "BIKE").
    #    NOTE(review): the truncation is meant to line up with 'std_prd_key'
    #    in the product dimension — confirm against that table's logic.
    sales = standardize_key(sales, 'sls_prd_key', 'std_sls_prd_key')
    sales = sales.withColumn('std_sls_prd_key', F.col('std_sls_prd_key').substr(1, 4))

    # 3. Parse the raw date fields; try_to_date yields NULL instead of raising
    #    when a value does not match the format.
    date_expressions = {
        'order_date': "try_to_date(CAST(sls_order_dt AS STRING), 'yyyyMMdd')",
        'ship_date': "try_to_date(CAST(sls_ship_dt AS STRING), 'yyyyMMdd')",
        'due_date': "try_to_date(CAST(sls_due_dt AS STRING), 'yyyyMMdd')",
    }
    for target, sql in date_expressions.items():
        sales = sales.withColumn(target, F.expr(sql))

    # 4. Drop rows whose numeric fields are null or outside [0, upper].
    upper_bounds = (
        ('sls_sales', 1000000),
        ('sls_quantity', 10000),
        ('sls_price', 100000),
    )
    for source, upper in upper_bounds:
        sales = validate_numeric(sales, source, min_val=0, max_val=upper)

    # 5. Rename the source-prefixed columns to their silver-layer names.
    renames = {
        'sls_ord_num': 'order_number',
        'sls_prd_key': 'product_key',
        'sls_cust_id': 'customer_id',
        'sls_sales': 'sales_amount',
        'sls_quantity': 'quantity',
        'sls_price': 'price',
    }
    for old_name, new_name in renames.items():
        sales = sales.withColumnRenamed(old_name, new_name)

    sales = sales.drop('sls_order_dt', 'sls_ship_dt', 'sls_due_dt')

    # 'std_sls_prd_key' is kept in the output so it can serve as a join key.
    output_columns = [
        'order_number',
        'product_key',
        'std_sls_prd_key',
        'customer_id',
        'order_date',
        'ship_date',
        'due_date',
        'sales_amount',
        'quantity',
        'price',
    ]
    sales = sales.select(*output_columns)

    # 6. Persist, then show the result in the notebook.
    sales.write.mode('overwrite').saveAsTable(silver_table)
    display(sales)

# Run the bronze -> silver load for the CRM sales-details table.
process_crm_sales_details(
    bronze_table='workspace.bronze.crm_sales_details',
    silver_table='workspace.silver.crm_sales_details',
)