# Loading Raw Data from the Bronze Layer (CRM)

## Init


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import StringType

## Importing Sales Data

In [0]:
# Read the CRM sales rows from the bronze layer table and preview them.
df = spark.read.table("workspace.bronze.crm_sales_details")
df.display()

### Cleaning Up Sales Data

In [0]:
# Query the bronze sales table through SQL and convert the three raw date
# columns (sls_order_dt, sls_ship_dt, sls_due_dt) into DATE values.
# For each of them the CASE yields NULL when the raw value equals 0 or its
# length is not 8; otherwise the value is parsed with the 'yyyyMMdd' pattern.
# NOTE(review): the raw date columns look like integer-encoded dates (they are
# compared with 0) -- confirm against the bronze schema.
# All other columns are selected unchanged. The due-date alias keeps the
# `sls_` prefix (sls_due_date), unlike the order/ship aliases (sales_*).
# The result is bound to `df`, replacing whatever `df` referred to before.
df=spark.sql("""
                SELECT
                sls_ord_num,
                sls_prd_key,
                sls_cust_id,
                CASE
                WHEN sls_order_dt=0 OR LEN(sls_order_dt)!=8 THEN NULL
                ELSE to_date(sls_order_dt,'yyyyMMdd')
                END AS sales_order_date,
                CASE
                WHEN sls_ship_dt=0 OR LEN(sls_ship_dt)!=8 THEN NULL
                ELSE to_date(sls_ship_dt,'yyyyMMdd')
                END AS sales_ship_date,
                CASE
                WHEN sls_due_dt=0 OR LEN(sls_due_dt)!=8 THEN NULL
                ELSE to_date(sls_due_dt,'yyyyMMdd')
                END AS sls_due_date,
                sls_sales,
                sls_quantity,
                sls_price
                FROM
                workspace.bronze.crm_sales_details
             """)

In [0]:
# Cast the numeric sales columns to INT and repair inconsistent unit prices.
#
# The relation enforced is  sls_sales = sls_quantity * sls_price
# (sls_sales is the line total -- it is renamed to `total_sales` further down).
# The previous version compared sls_sales * sls_quantity with sls_price and
# overwrote the price with that product, which replaced the unit price of
# virtually every consistent row with a meaningless value.
#
# Price rule:
#   * price missing, or quantity * |price| disagrees with sales
#       -> derive the price as |sales / quantity|
#          (NULL when quantity is 0 or when sales/quantity are unknown)
#   * otherwise keep |price|
# All results stay INT so the schema of the target table does not change;
# the derived price is therefore truncated towards zero by the cast.
df = (
    df.withColumn("sls_cust_id", expr("CAST(sls_cust_id AS INT)"))
    .withColumn("sls_sales", expr("CAST(sls_sales AS INT)"))
    .withColumn("sls_quantity", expr("CAST(sls_quantity AS INT)"))
    # sls_sales and sls_quantity are already INT at this point of the chain.
    .withColumn(
        "sls_price",
        expr("""
            CASE
                WHEN sls_price IS NULL
                  OR sls_quantity * ABS(CAST(sls_price AS INT)) != sls_sales
                THEN CAST(ABS(sls_sales / NULLIF(sls_quantity, 0)) AS INT)
                ELSE CAST(ABS(sls_price) AS INT)
            END
        """),
    )
)

### Trimming Leading and Trailing Spaces

In [0]:
# Trim every string-typed column in a single projection; columns of any other
# type are passed through untouched and the column order stays the same.
df = df.select(
    *[
        trim(col(f.name)).alias(f.name) if isinstance(f.dataType, StringType) else col(f.name)
        for f in df.schema.fields
    ]
)

### Renaming Columns

In [0]:
# Mapping from the cleaned column names to their final, descriptive names.
# Entries that map a name to itself leave that column as it is.
RENAME_COLUMNS = {
    "sls_ord_num": "sales_order_number",
    "sls_prd_key": "sales_product_key",
    "sls_cust_id": "sales_customer_id",
    "sales_order_date": "sales_order_date",
    "sales_ship_date": "sales_ship_date",
    "sls_due_date": "sales_due_date",
    "sls_sales": "total_sales",
    "sls_quantity": "sales_quantity",
    "sls_price": "sales_price",
}

# Apply all renames at once; columns not listed in the mapping keep their name.
df = df.toDF(*[RENAME_COLUMNS.get(name, name) for name in df.columns])
df.display()

### Loading Sales Data into the Silver Layer

In [0]:
# Persist the cleaned DataFrame as a Delta table, overwriting existing contents.
df.write.saveAsTable(
    "workspace.silver.crm_sales_details",
    format="delta",
    mode="overwrite",
)