In [0]:
%run "../0.includes/configuration"

In [0]:
%run "../0.includes/utils"

In [0]:
%sql
-- Look at the raw bronze.sales rows before any cleansing is applied.
SELECT * FROM bronze.sales

### General constraint check

In [0]:
%sql
-- Surface bronze rows that violate basic data-quality constraints.
-- NULLs are tested explicitly: predicates such as `sold < 0` or
-- `LENGTH(name) = 0` evaluate to NULL (not TRUE) when the column is NULL,
-- so without the IS NULL checks those rows would silently pass.
SELECT *
FROM bronze.sales
WHERE
  sku IS NULL                            -- missing key
  OR name IS NULL                        -- missing name
  OR LENGTH(TRIM(name)) = 0              -- empty or whitespace-only name
  OR price IS NULL                       -- missing price
  OR price < 0                           -- range check
  OR sold IS NULL                        -- missing quantity
  OR sold < 0                            -- range check


### Check for duplicate `sku` values

In [0]:
%sql
-- List every sku that appears more than once in bronze.sales, with its row count.
SELECT sku, COUNT(*)
FROM bronze.sales
GROUP BY sku
HAVING COUNT(*) > 1

-- PySpark alternative (kept for reference) that drops the duplicates instead of listing them:
-- # sales_df = spark.read.format("delta").load(f"{BRONZE_FOLDER_PATH}/sales").dropDuplicates(["sku"])
-- # display(sales_df)


### List all categories to replace the description

In [0]:
%sql
-- Enumerate the category values present in bronze.sales; each one gets its
-- own description text in the cleansing queries further down.
SELECT DISTINCT category FROM bronze.sales

In [0]:
%sql
-- Preview of the description rewrite: the boilerplate suffix
-- ' - A high quality and popular choice.' is replaced by a category-specific
-- phrase. A category that is not listed (or is NULL) yields a NULL description.
SELECT
  name AS product_name,
  CASE category
    WHEN 'Bakery'     THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' yummy bread'))
    WHEN 'Pantry'     THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' tasty pasta'))
    WHEN 'Meat'       THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' juicy meat'))
    WHEN 'Condiments' THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' extra taste'))
    WHEN 'Beverages'  THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' fizzy drinks'))
    WHEN 'Dairy'      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' creamy cheese'))
    WHEN 'Frozen'     THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' frozen food'))
    WHEN 'Produce'    THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' fresh fruits'))
    WHEN 'Snacks'     THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' crunchy snacks'))
  END AS updated_description
FROM bronze.sales;


### Adjust the `name` column

In [0]:
%sql
-- Split `name` into its numeric and its textual part.
--   product_type : the digits found in `name`, or '0' when it contains none.
--                  The '[^0-9 ]' pattern keeps spaces, so the result is TRIMmed,
--                  matching how the combined query and the silver load derive it.
--   updated_name : `name` with every digit removed.
SELECT
  name,
  CASE
    WHEN regexp_replace(name, '[^0-9]', '') = '' THEN '0'
    ELSE TRIM(regexp_replace(name, '[^0-9 ]', ''))
  END AS product_type,
  regexp_replace(name, '[0-9]', '') AS updated_name
FROM bronze.sales;


### Add the `total_revenue` column for each `record`

In [0]:
%sql
-- Revenue per record: unit price multiplied by units sold, rounded to 2 decimals.
SELECT
  name, price, sold,
  round(price * sold, 2) AS total_revenue
FROM bronze.sales

### Combine all cleansed data

In [0]:
%sql
-- Preview of the fully cleansed record, combining the individual steps above:
--   name          : digits stripped from the raw name
--   product_type  : digits extracted from the raw name ('0' when there are none)
--   total_revenue : price * sold, rounded to 2 decimals
--   description   : boilerplate suffix replaced by a category-specific phrase
--                   (NULL for a category that is not listed)
--   sale_date     : taken from ingestion_date, exactly as the silver load does,
--                   so this preview shows the rows that will actually be written
--                   and merged on (sku, sale_date)
SELECT
  sku, 
  category, 
  regexp_replace(name, '[0-9]', '') AS name,
  CASE
    WHEN regexp_replace(name, '[^0-9]', '') = '' THEN '0'
    ELSE TRIM(regexp_replace(name, '[^0-9 ]', ''))
  END AS product_type,
  price, 
  sold,
  round(price * sold, 2) AS total_revenue, 
  CASE 
    WHEN category = 'Bakery' 
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' yummy bread'))
    WHEN category = 'Pantry' 
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' tasty pasta'))
    WHEN category = 'Meat' 
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' juicy meat'))
    WHEN category = 'Condiments' 
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' extra taste'))
    WHEN category = 'Beverages' 
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' fizzy drinks'))
    WHEN category = 'Dairy' 
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' creamy cheese'))
    WHEN category = 'Frozen'
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' frozen food')) 
    WHEN category = 'Produce'
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' fresh fruits'))
    WHEN category = 'Snacks'
      THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' crunchy snacks'))
  END AS description, 
  ingestion_date AS sale_date
FROM bronze.sales;


In [0]:
%sql
-- List the available schemas (SCHEMAS and DATABASES are interchangeable keywords).
SHOW SCHEMAS

### Drop table (if needed)

In [0]:
# Manual reset helper, left commented out on purpose. Uncommenting it drops
# silver.cleansed_sales so that the load cell below recreates the table.
# NOTE(review): the last line removes SILVER_FOLDER_PATH recursively, not only
# this table's files -- presumably the whole silver folder; confirm before running.
# table_name = "silver.cleansed_sales"

# spark.sql(f"DROP TABLE IF EXISTS {table_name}")
# dbutils.fs.rm(SILVER_FOLDER_PATH, recurse=True)

### Load into the silver layer (create on first run, MERGE afterwards)

In [0]:
# Load the cleansed sales data into the silver layer.
#
# * Table missing  -> create silver.cleansed_sales from the cleansed bronze rows.
# * Table present  -> MERGE the cleansed bronze rows into it, keyed on
#                     (sku, sale_date): matching rows are overwritten, the
#                     others are inserted.
#
# The cleansing SELECT is defined once and shared by both branches so the
# create path and the incremental path cannot drift apart.
target_table = "silver.cleansed_sales"

# Cleansing projection over bronze.sales:
#   name          : digits stripped from the raw name
#   product_type  : digits extracted from the raw name ('0' when there are none)
#   total_revenue : price * sold, rounded to 2 decimals
#   description   : boilerplate suffix replaced by a category-specific phrase
#   sale_date     : the bronze ingestion_date
# NOTE(review): the description CASE has no ELSE, so a category that is not
# listed here ends up with a NULL description -- confirm that is intended.
cleansed_sales_select = """
    SELECT
        sku,
        category,
        regexp_replace(name, '[0-9]', '') AS name,
        CASE
            WHEN regexp_replace(name, '[^0-9]', '') = '' THEN '0'
            ELSE TRIM(regexp_replace(name, '[^0-9 ]', ''))
        END AS product_type,
        price,
        sold,
        round(price * sold, 2) AS total_revenue,
        CASE
            WHEN category = 'Bakery'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' yummy bread'))
            WHEN category = 'Pantry'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' tasty pasta'))
            WHEN category = 'Meat'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' juicy meat'))
            WHEN category = 'Condiments'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' extra taste'))
            WHEN category = 'Beverages'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' fizzy drinks'))
            WHEN category = 'Dairy'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' creamy cheese'))
            WHEN category = 'Frozen'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' frozen food'))
            WHEN category = 'Produce'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' fresh fruits'))
            WHEN category = 'Snacks'
                THEN TRIM(REPLACE(description, ' - A high quality and popular choice.', ' crunchy snacks'))
        END AS description,
        ingestion_date AS sale_date
    FROM bronze.sales
"""

# NOTE(review): this goes through the private py4j session handle.
# spark.catalog.tableExists is the public counterpart on newer PySpark
# releases -- confirm the runtime version before switching.
if spark._jsparkSession.catalog().tableExists(target_table):
    print(">>> Start incremental loading ...")
    # The cleansed rows are used directly as the MERGE source (sub-query
    # instead of a leading CTE); (sku, sale_date) is the merge condition.
    query = f"""
        MERGE INTO {target_table} AS tgt
        USING (
            {cleansed_sales_select}
        ) AS src
        ON tgt.sku = src.sku AND tgt.sale_date = src.sale_date
        WHEN MATCHED
            THEN UPDATE SET *
        WHEN NOT MATCHED
            THEN INSERT *
    """
else:
    print(">>> Table does not exist. Init a new one")
    query = f"""
        CREATE TABLE {target_table}
        AS
        (
            {cleansed_sales_select}
        )
    """

spark.sql(query)

In [0]:
%sql
-- Inspect the rows now stored in the silver table.
SELECT * FROM silver.cleansed_sales