# Read from bronze table

In [0]:
# Load the source customer records from the bronze layer and preview them.
# NOTE(review): the bronze table is spelled "epr_cust" while the silver table
# written later is "erp_customers" — confirm the bronze name is intentional.
bronze_table = "workspace.bronze.epr_cust"
df = spark.read.table(bronze_table)
display(df)

# Transformation

## Remove unwanted spaces

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, trim
from pyspark.sql.types import StringType

In [0]:


# Trim leading/trailing whitespace from every string-typed column.
# All trims go into a single withColumns projection (instead of one
# withColumn call per column) so the query plan stays flat, and the result
# is displayed once rather than once per string column.
string_columns = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
]
if string_columns:  # nothing to project when the table has no string columns
    df = df.withColumns({name: trim(col(name)) for name in string_columns})
df.display()

## Clean id

In [0]:

# Remove a leading "NAS" prefix from cid; values without the prefix (and nulls)
# pass through unchanged.
# The pattern is anchored with ^ so only a prefix is stripped, never an inner
# "NAS". regexp_replace is used instead of F.substring(col, 4, F.length(col))
# because F.substring only accepts integer pos/len on classic PySpark sessions
# before 4.0 and fails there when given a Column as the length.
df = df.withColumn("cid", F.regexp_replace(col("cid"), "^NAS", ""))
df.display()

## Birthdate validation

In [0]:
# Blank out birth dates that lie after today's date; every other value
# (including nulls) is kept exactly as it is.
bdate_in_future = F.col("bdate") > F.current_date()
df = df.withColumn(
    "bdate",
    F.when(bdate_in_future, F.lit(None)).otherwise(F.col("bdate")),
)
df.display()

## Gender normalization

In [0]:

# Map gender codes onto "Female" / "Male", comparing case-insensitively.
# Any other value — nulls included, since they match neither branch —
# falls through to "n/a".
gen_upper = F.upper(F.col("gen"))
df = df.withColumn(
    "gen",
    F.when(gen_upper.isin("F", "FEMALE"), "Female")
    .when(gen_upper.isin("M", "MALE"), "Male")
    .otherwise("n/a"),
)

## Rename columns

In [0]:
# Replace the terse source column names with descriptive ones.
# Each pair is (current name, new name); withColumnRenamed is a no-op for a
# name that is not present in the DataFrame.
column_renames = [
    ("cid", "customer_number"),
    ("bdate", "birth_date"),
    ("gen", "gender"),
]
for current_name, renamed_to in column_renames:
    df = df.withColumnRenamed(current_name, renamed_to)

# Writing silver table

In [0]:

# Save the transformed customers as a Delta table, replacing any existing
# contents of the target table.
silver_table = "workspace.silver.erp_customers"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(silver_table)
)

# Check table

In [0]:
%sql
-- Inspect the contents of the silver customers table.
SELECT *
FROM workspace.silver.erp_customers