In [0]:
from itertools import chain
from typing import Dict, Optional

from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StringType

def create_map_from_dict(mapping: Dict[str, str]):
    """Build a Spark map-literal column from a Python dict.

    Each key and value is wrapped with ``F.lit`` and the literals are passed to
    ``F.create_map`` in alternating key, value order, following the dict's
    iteration order.

    Args:
        mapping: Python dict whose entries become the map's key/value pairs.

    Returns:
        The column expression produced by ``F.create_map``.
    """
    literal_cols = []
    for key, value in mapping.items():
        # create_map expects a flat sequence: key1, value1, key2, value2, ...
        literal_cols.append(F.lit(key))
        literal_cols.append(F.lit(value))
    return F.create_map(literal_cols)

def trim_all_string_columns(df: DataFrame) -> DataFrame:
    """Return ``df`` with ``F.trim`` applied to every top-level string column.

    Column order and names are preserved; columns whose type is not
    ``StringType`` are passed through untouched.

    Column names are backtick-quoted before being handed to ``F.col``. Without
    quoting, a top-level column whose name contains a dot (e.g. ``"a.b"``) is
    parsed as struct-field access and fails to resolve.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the same columns, string columns trimmed.
    """
    def _quoted(name: str):
        # A literal backtick inside a quoted identifier is escaped by doubling it.
        return F.col('`' + name.replace('`', '``') + '`')

    str_cols = {f.name for f in df.schema.fields if isinstance(f.dataType, StringType)}
    return df.select(
        *[F.trim(_quoted(c)).alias(c) if c in str_cols else _quoted(c) for c in df.columns]
    )

def standardize_key(df: DataFrame, col: str, new_col: str) -> DataFrame:
    """Add ``new_col`` holding an upper-cased, alphanumeric-only copy of ``col``.

    Every character outside ``A-Z``, ``a-z`` and ``0-9`` is removed from the
    source value, and the remainder is upper-cased.

    Args:
        df: Input DataFrame.
        col: Name of the source column.
        new_col: Name of the column that receives the standardized value.

    Returns:
        ``df`` with ``new_col`` added (or replaced, per ``withColumn``).
    """
    alnum_only = F.regexp_replace(F.col(col), '[^A-Za-z0-9]', '')
    return df.withColumn(new_col, F.upper(alnum_only))

def normalize_category(
    df: DataFrame, col: str, mapping: Optional[Dict[str, str]] = None
) -> DataFrame:
    """Replace known category codes in ``col`` with their display labels.

    The column value is upper-cased and looked up in the mapping. A value with
    no matching entry keeps its original content, because the lookup result is
    coalesced with the original column.

    Args:
        df: Input DataFrame.
        col: Name of the column to normalize; it is overwritten in the result.
        mapping: Optional code -> label rules. Keys are upper-cased before use,
            so matching is case-insensitive. When omitted, the built-in example
            rules (``BIKE``, ``COMP``, ``ACC``) are applied.

    Returns:
        DataFrame with ``col`` replaced by the normalized value. If ``mapping``
        is an empty dict, ``df`` is returned unchanged.
    """
    if mapping is None:
        # Example mapping, expand as needed
        mapping = {'BIKE': 'Bicycle', 'COMP': 'Component', 'ACC': 'Accessory'}
    if not mapping:
        # Nothing to map; skip building an empty map literal.
        return df

    # The lookup key is upper-cased below, so the rule keys must be upper-cased too.
    cat_rules = {code.upper(): label for code, label in mapping.items()}
    map_col = create_map_from_dict(cat_rules)
    return df.withColumn(col, F.coalesce(map_col[F.upper(F.col(col))], F.col(col)))


In [0]:
def process_erp_px_cat_g1v2(bronze_table: str, silver_table: str):
    """Clean the bronze ERP product-category table and write it to silver.

    Steps: trim all string columns, derive ``std_ID`` from ``ID``, normalize
    ``CAT``, rename the source columns to their silver names, select them in a
    fixed order, overwrite ``silver_table`` and display the result.

    ``spark`` and ``display`` are not defined in this notebook's visible cells;
    presumably they are supplied by the notebook runtime.

    Args:
        bronze_table: Name of the table to read.
        silver_table: Name of the table to overwrite with the cleaned data.
    """
    # Source column -> silver column
    silver_names = {
        'ID': 'product_id',
        'CAT': 'category',
        'SUBCAT': 'subcategory',
        'MAINTENANCE': 'maintenance',
    }

    bronze_df = spark.table(bronze_table)
    trimmed_df = trim_all_string_columns(bronze_df)
    keyed_df = standardize_key(trimmed_df, 'ID', 'std_ID')
    silver_df = normalize_category(keyed_df, 'CAT')

    for source_name, target_name in silver_names.items():
        silver_df = silver_df.withColumnRenamed(source_name, target_name)

    # Reorder columns for clarity
    silver_df = silver_df.select(
        'product_id', 'std_ID', 'category', 'subcategory', 'maintenance'
    )

    silver_df.write.mode('overwrite').saveAsTable(silver_table)
    display(silver_df)

# Run the cleaning pipeline: bronze ERP category table -> silver
process_erp_px_cat_g1v2(
    bronze_table='workspace.bronze.erp_px_cat_g1v2',
    silver_table='workspace.silver.erp_px_cat_g1v2',
)