In [0]:
# Switch the session's current database so the unqualified table names used
# by the statements below resolve inside md_globalretail.
spark.sql("use md_globalretail")

# DDL for the md_products Delta table. IF NOT EXISTS turns the statement into
# a no-op when the table is already present, so this cell can be re-run.
md_products_ddl = """
CREATE TABLE IF NOT EXISTS md_products (
    product_id STRING,
    name STRING,
    category STRING,
    brand STRING,
    price DOUBLE,
    stock_quantity INT,
    rating DOUBLE,
    is_active BOOLEAN,
    price_category STRING,
    stock_status STRING,
    last_updated TIMESTAMP
)
USING DELTA
"""
spark.sql(md_products_ddl)

Out[1]: DataFrame[]

In [0]:
# Watermark for the incremental load: the newest last_updated value already
# stored in md_products. MAX() over an empty table yields a single row holding
# NULL, which arrives in Python as None.
last_processed_df = spark.sql("SELECT MAX(last_updated) as last_processed FROM md_products")
last_processed_row = last_processed_df.first()

# With nothing processed yet, fall back to a date far in the past so the
# timestamp filter built from this value lets every raw row through.
last_processed_timestamp = (
    '1900-01-01T00:00:00.000+00:00'
    if last_processed_row['last_processed'] is None
    else last_processed_row['last_processed']
)

In [0]:
# Temp view restricted to raw product rows whose ingestion_timestamp is
# strictly later than the watermark computed from md_products.
# NOTE(review): the watermark is MAX(last_updated) of the target table, i.e. a
# processing time rather than a raw ingestion time — confirm that no raw row
# can become visible with an ingestion_timestamp older than the previous run.
incremental_view_sql = f"""
CREATE OR REPLACE TEMPORARY VIEW rw_products_incremental AS
SELECT *
FROM rw_globalretail.rw_products WHERE ingestion_timestamp > '{last_processed_timestamp}'
"""
spark.sql(incremental_view_sql)

Out[11]: DataFrame[]

In [0]:
%sql
-- Preview the raw rows selected for this incremental run.
SELECT *
FROM rw_products_incremental

brand,category,is_active,name,price,product_id,rating,stock_quantity,ingestion_timestamp
BeautyGlow,Toys,True,Product 1,995.73,1,3.5,989,2024-11-10T22:02:37.530+0000
GardenMaster,Garden,True,Product 2,497.76,2,3.8,495,2024-11-10T22:02:37.530+0000
BeautyGlow,Electronics,True,Product 3,331.63,3,4.6,10,2024-11-10T22:02:37.530+0000
TechPro,Beauty,False,Product 4,798.83,4,4.7,683,2024-11-10T22:02:37.530+0000
HomeSmart,Automotive,False,Product 5,-454.98,5,4.4,719,2024-11-10T22:02:37.530+0000
BookWorm,Electronics,False,Product 6,645.3,6,2.2,823,2024-11-10T22:02:37.530+0000
FashionX,Automotive,False,Product 7,549.08,7,1.1,999,2024-11-10T22:02:37.530+0000
TechPro,Books,False,Product 8,982.36,8,2.4,542,2024-11-10T22:02:37.530+0000
FashionX,Toys,True,Product 9,307.14,9,1.0,671,2024-11-10T22:02:37.530+0000
BeautyGlow,Garden,False,Product 10,871.38,10,3.4,975,2024-11-10T22:02:37.530+0000


In [0]:
# Documentation-only cell: the string below is a bare expression (it is never
# assigned), so the notebook simply echoes it back as the cell output. It lists
# the cleaning rules described for the md_products_incremental view.
'''
Data Transformations:
   - Price normalization (setting negative prices to 0)
   - Stock quantity normalization (setting negative stock to 0)
   - Rating normalization (clamping between 0 and 5)
   - Price categorization (Premium, Standard, Budget)
   - Stock status calculation (Out of Stock, Low Stock, Moderate Stock, Sufficient Stock)
   '''

Out[6]: '\nData Transformations:\n   - Price normalization (setting negative prices to 0)\n   - Stock quantity normalization (setting negative stock to 0)\n   - Rating normalization (clamping between 0 and 5)\n   - Price categorization (Premium, Standard, Budget)\n   - Stock status calculation (Out of Stock, Low Stock, Moderate Stock, Sufficient Stock)\n   '

In [0]:
# Build the cleaned view that feeds the MERGE into md_products.
#
# Per-column rules:
#   - price / stock_quantity: negative values are replaced by 0.
#   - rating: clamped to the range [0, 5].
#   - price_category: 'Premium' above 1000, 'Standard' above 100, else 'Budget'.
#   - stock_status: 'Out of Stock' / 'Low Stock' (< 10) / 'Moderate Stock' (< 50)
#     / 'Sufficient Stock'.
#   - last_updated: the timestamp at which the view is evaluated.
# Rows with a NULL name or category are dropped.
#
# Inside the CASE expressions, `price` and `stock_quantity` refer to the RAW
# source columns, not to the normalised aliases produced in the same SELECT
# list. The stock_status test therefore uses `<= 0` rather than `= 0`:
# a negative raw stock is normalised to 0 above and must be reported as
# 'Out of Stock', not fall through to 'Low Stock'. (price_category needs no
# such change: a negative raw price already lands in 'Budget', same as 0.)
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW md_products_incremental AS
SELECT
    product_id,
    name,
    category,
    brand,
    CASE
        WHEN price < 0 THEN 0
        ELSE price
    END AS price,
    CASE
        WHEN stock_quantity < 0 THEN 0
        ELSE stock_quantity
    END AS stock_quantity,
    CASE
        WHEN rating < 0 THEN 0
        WHEN rating > 5 THEN 5
        ELSE rating
    END AS rating,
    is_active,
    CASE
        WHEN price > 1000 THEN 'Premium'
        WHEN price > 100 THEN 'Standard'
        ELSE 'Budget'
    END AS price_category,
    CASE
        WHEN stock_quantity <= 0 THEN 'Out of Stock'
        WHEN stock_quantity < 10 THEN 'Low Stock'
        WHEN stock_quantity < 50 THEN 'Moderate Stock'
        ELSE 'Sufficient Stock'
    END AS stock_status,
    CURRENT_TIMESTAMP() AS last_updated
FROM rw_products_incremental
WHERE name IS NOT NULL AND category IS NOT NULL
""")

Out[13]: DataFrame[]

In [0]:
%sql
-- Preview the cleaned rows that will be merged into md_products.
SELECT *
FROM md_products_incremental

product_id,name,category,brand,price,stock_quantity,rating,is_active,price_category,stock_status,last_updated
1,Product 1,Toys,BeautyGlow,995.73,989,3.5,True,Standard,Sufficient Stock,2024-11-10T22:37:46.900+0000
2,Product 2,Garden,GardenMaster,497.76,495,3.8,True,Standard,Sufficient Stock,2024-11-10T22:37:46.900+0000
3,Product 3,Electronics,BeautyGlow,331.63,10,4.6,True,Standard,Moderate Stock,2024-11-10T22:37:46.900+0000
4,Product 4,Beauty,TechPro,798.83,683,4.7,False,Standard,Sufficient Stock,2024-11-10T22:37:46.900+0000
5,Product 5,Automotive,HomeSmart,0.0,719,4.4,False,Budget,Sufficient Stock,2024-11-10T22:37:46.900+0000
6,Product 6,Electronics,BookWorm,645.3,823,2.2,False,Standard,Sufficient Stock,2024-11-10T22:37:46.900+0000
7,Product 7,Automotive,FashionX,549.08,999,1.1,False,Standard,Sufficient Stock,2024-11-10T22:37:46.900+0000
8,Product 8,Books,TechPro,982.36,542,2.4,False,Standard,Sufficient Stock,2024-11-10T22:37:46.900+0000
9,Product 9,Toys,FashionX,307.14,671,1.0,True,Standard,Sufficient Stock,2024-11-10T22:37:46.900+0000
10,Product 10,Garden,BeautyGlow,871.38,975,3.4,False,Standard,Sufficient Stock,2024-11-10T22:37:46.900+0000


In [0]:
# Upsert the cleaned incremental rows into md_products, keyed on product_id:
# a row whose product_id already exists in the target is overwritten column
# by column, any other row is inserted.
# NOTE(review): this presumes md_products_incremental holds at most one row
# per product_id — confirm against the raw feed.
merge_products_sql = """
MERGE INTO md_products target
USING md_products_incremental source
ON target.product_id = source.product_id
WHEN MATCHED THEN
    UPDATE SET *
WHEN NOT MATCHED THEN
    INSERT *
"""
spark.sql(merge_products_sql)

Out[15]: DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
%sql
-- Inspect the md_products table after the merge.
SELECT *
FROM md_products

product_id,name,category,brand,price,stock_quantity,rating,is_active,price_category,stock_status,last_updated
1,Product 1,Toys,BeautyGlow,995.73,989,3.5,True,Standard,Sufficient Stock,2024-11-10T22:38:28.429+0000
2,Product 2,Garden,GardenMaster,497.76,495,3.8,True,Standard,Sufficient Stock,2024-11-10T22:38:28.429+0000
3,Product 3,Electronics,BeautyGlow,331.63,10,4.6,True,Standard,Moderate Stock,2024-11-10T22:38:28.429+0000
4,Product 4,Beauty,TechPro,798.83,683,4.7,False,Standard,Sufficient Stock,2024-11-10T22:38:28.429+0000
5,Product 5,Automotive,HomeSmart,0.0,719,4.4,False,Budget,Sufficient Stock,2024-11-10T22:38:28.429+0000
6,Product 6,Electronics,BookWorm,645.3,823,2.2,False,Standard,Sufficient Stock,2024-11-10T22:38:28.429+0000
7,Product 7,Automotive,FashionX,549.08,999,1.1,False,Standard,Sufficient Stock,2024-11-10T22:38:28.429+0000
8,Product 8,Books,TechPro,982.36,542,2.4,False,Standard,Sufficient Stock,2024-11-10T22:38:28.429+0000
9,Product 9,Toys,FashionX,307.14,671,1.0,True,Standard,Sufficient Stock,2024-11-10T22:38:28.429+0000
10,Product 10,Garden,BeautyGlow,871.38,975,3.4,False,Standard,Sufficient Stock,2024-11-10T22:38:28.429+0000
