In [0]:
from itertools import chain
from typing import Dict, Optional

from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StringType

def create_map_from_dict(mapping: Dict[str, str]):
    """Turn a Python dictionary into a Spark map column expression.

    The dictionary is flattened into a key, value, key, value, ... sequence,
    each element is wrapped with ``F.lit`` and the resulting literals are
    handed to ``F.create_map``.

    Args:
        mapping: Lookup table to convert.

    Returns:
        The column expression produced by ``F.create_map``.
    """
    flattened = chain.from_iterable(mapping.items())
    literal_columns = list(map(F.lit, flattened))
    return F.create_map(literal_columns)

def trim_all_string_columns(df: DataFrame) -> DataFrame:
    """Return ``df`` with every top-level ``StringType`` column trimmed.

    Columns keep their original order and names; columns whose data type is
    not ``StringType`` are selected unchanged.

    Column names are backtick-quoted before being passed to ``F.col`` so that
    a top-level column whose name contains a dot (e.g. ``"addr.city"``) is
    resolved as that column rather than being parsed as a nested-field path.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the same columns, string columns trimmed via
        ``F.trim``.
    """

    def _quoted(name: str) -> str:
        # Inside a backtick-quoted identifier a literal backtick is written
        # as two backticks.
        return "`" + name.replace("`", "``") + "`"

    string_names = {
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, StringType)
    }
    return df.select(
        *[
            F.trim(F.col(_quoted(name))).alias(name)
            if name in string_names
            else F.col(_quoted(name))
            for name in df.columns
        ]
    )

def standardize_key(df: DataFrame, col: str, new_col: str) -> DataFrame:
    """Add a normalized copy of a key column.

    The value of ``col`` has every character outside ``A-Z``, ``a-z`` and
    ``0-9`` removed and is then upper-cased; the result is stored in
    ``new_col``. The source column itself is left as it is.

    Args:
        df: Input DataFrame.
        col: Name of the column holding the raw key.
        new_col: Name of the column that receives the normalized key.

    Returns:
        ``df`` with ``new_col`` added (or replaced, if it already exists).
    """
    alphanumeric_only = F.regexp_replace(F.col(col), '[^A-Za-z0-9]', '')
    normalized = F.upper(alphanumeric_only)
    return df.withColumn(new_col, normalized)

def normalize_country(
    df: DataFrame, col: str, mapping: Optional[Dict[str, str]] = None
) -> DataFrame:
    """Replace country codes in ``col`` with their full names.

    The column value is upper-cased and looked up in the mapping. When a
    match is found the mapped name replaces the value; otherwise the
    original value is kept as-is.

    Args:
        df: Input DataFrame.
        col: Name of the column to normalize in place.
        mapping: Optional code -> country-name lookup. Keys are upper-cased
            before use so they can match the upper-cased column value. When
            omitted, a small built-in example mapping (US, CA, UK) is used.

    Returns:
        ``df`` with ``col`` rewritten. If ``mapping`` is empty there is
        nothing to look up and ``df`` is returned unchanged.
    """
    if mapping is None:
        # Example mapping, expand as needed
        rules = {'US': 'United States', 'CA': 'Canada', 'UK': 'United Kingdom'}
    else:
        # The lookup key below is upper-cased, so the mapping keys must be too;
        # otherwise lower- or mixed-case keys could never match.
        rules = {code.upper(): name for code, name in mapping.items()}

    if not rules:
        return df

    lookup = create_map_from_dict(rules)
    source = F.col(col)
    # coalesce falls back to the original value when the lookup yields null.
    return df.withColumn(col, F.coalesce(lookup[F.upper(source)], source))


In [0]:
def process_erp_loc_a101(bronze_table: str, silver_table: str) -> None:
    """Clean the ERP location table and publish it to the silver layer.

    Steps, in order: read ``bronze_table``, trim all string columns, derive
    ``std_CID`` from ``CID``, normalize ``CNTRY``, rename ``CID`` and
    ``CNTRY`` to their silver names, keep only the three silver columns,
    overwrite ``silver_table`` with the result and display it.

    NOTE(review): ``spark`` and ``display`` are not defined in this file;
    presumably they are globals provided by the notebook runtime.

    Args:
        bronze_table: Name of the source table to read.
        silver_table: Name of the table to overwrite with the cleaned data.
    """
    raw = spark.table(bronze_table)

    # Cleaning: whitespace, standardized key, country names.
    trimmed = trim_all_string_columns(raw)
    keyed = standardize_key(trimmed, 'CID', 'std_CID')
    cleaned = normalize_country(keyed, 'CNTRY')

    # Rename columns for silver
    silver_names = (('CID', 'customer_id'), ('CNTRY', 'country'))
    for bronze_name, silver_name in silver_names:
        cleaned = cleaned.withColumnRenamed(bronze_name, silver_name)

    # Keep only the silver columns, in a fixed order
    silver_df = cleaned.select('customer_id', 'std_CID', 'country')

    silver_df.write.mode('overwrite').saveAsTable(silver_table)
    display(silver_df)

# Run the cleaning pipeline: bronze ERP location table -> silver table
process_erp_loc_a101(
    bronze_table='workspace.bronze.erp_loc_a101',
    silver_table='workspace.silver.erp_loc_a101',
)