# Loading raw data from the bronze layer (CRM)

## Init


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import StringType

## Importing product data

In [0]:
# Read the CRM product table from the bronze schema and preview it.
bronze_products_table = "workspace.bronze.crm_prd_info"
df = spark.table(bronze_products_table)
df.display()

### Cleaning up product data

In [0]:
# Rebuild the product rows from bronze with typed, sanity-checked validity dates.
#   * prd_start_dt / prd_end_dt are cast to DATE once and explicitly aliased, so
#     the final SELECT does not depend on how Spark names un-aliased CAST
#     expressions.
#   * new_dt is the day before the next start date recorded for the same
#     prd_key; it is NULL for the last row of each key.
#   * product_end_date keeps prd_end_dt unless it is earlier than prd_start_dt,
#     in which case new_dt is used instead.
df = spark.sql("""
    WITH typed AS (
        SELECT
            prd_id,
            prd_key,
            prd_nm,
            prd_cost,
            prd_line,
            CAST(prd_start_dt AS DATE) AS prd_start_dt,
            CAST(prd_end_dt AS DATE) AS prd_end_dt
        FROM workspace.bronze.crm_prd_info
    ),
    sequenced AS (
        SELECT
            typed.*,
            LEAD(prd_start_dt) OVER (
                PARTITION BY prd_key
                ORDER BY prd_start_dt
            ) - 1 AS new_dt
        FROM typed
    )
    SELECT
        prd_id,
        prd_key,
        prd_nm,
        prd_cost,
        prd_line,
        prd_start_dt,
        CASE
            WHEN prd_end_dt < prd_start_dt THEN new_dt
            ELSE prd_end_dt
        END AS product_end_date
    FROM sequenced
""")


### Trimming leading and trailing spaces

In [0]:
# Strip leading and trailing spaces from every string-typed column.
# A single select() is used instead of one withColumn() call per column:
# each withColumn() adds another projection to the query plan, so calling it
# in a loop grows the plan with the number of columns. Non-string columns
# pass through untouched and the column order is preserved.
df = df.select(
    *[
        trim(col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(field.name)
        for field in df.schema.fields
    ]
)

In [0]:
# Derive the cleaned product attributes, one step at a time.
# The order below matters: prd_cat_id is cut from the original prd_key
# before prd_key itself is shortened.

# Expands the one-letter product line code (case-insensitive) to a label;
# anything else, including NULL, becomes 'n/a'.
product_line_sql = """
                                CASE 
                                WHEN UPPER(prd_line)='M' THEN 'Mountain'
                                WHEN UPPER(prd_line)='R' THEN 'Road'
                                WHEN UPPER(prd_line)='S' THEN 'Other Sales'
                                WHEN UPPER(prd_line)='T' THEN 'Trekking'
                                ELSE 'n/a'
                                END 
                                  """
# First five characters of prd_key, with '-' swapped for '_'.
category_id_sql = "REPLACE(SUBSTRING(prd_key,1,5),'-','_')"
# Everything in prd_key from the seventh character onward.
short_key_sql = "SUBSTRING(prd_key,7,LEN(prd_key))"
# A missing cost counts as 0; the result is stored as an integer.
integer_cost_sql = "CAST(COALESCE(prd_cost,0) AS INT)"

df = df.withColumn("prd_line", expr(product_line_sql))
df = df.withColumn("prd_cat_id", expr(category_id_sql))
df = df.withColumn("prd_key", expr(short_key_sql))
df = df.withColumn("prd_cost", expr(integer_cost_sql))

### Renaming columns

In [0]:
# Mapping from the working column names to the names written downstream.
RENAME_COLUMNS = {
    "prd_id": "product_id",
    "prd_key": "product_key",
    "prd_nm": "product_name",
    "prd_cost": "product_cost",
    "prd_line": "product_line",
    "prd_start_dt": "product_start_date",
    "product_end_date": "product_end_date",  # identity entry: name is already final
    "prd_cat_id": "product_category_id",
}

# Apply every rename in one positional pass; a column that is not listed in
# the mapping keeps its current name.
renamed_columns = [RENAME_COLUMNS.get(current, current) for current in df.columns]
df = df.toDF(*renamed_columns)
df.display()

### Loading product data into the silver layer

In [0]:
# Save the cleaned product frame as a Delta table in the silver schema,
# replacing whatever the table currently holds.
silver_writer = df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("workspace.silver.crm_prd_info")