In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
# Load every column of the raw (bronze) products table and preview it.
bronze_products_query = "select * from fmcg.bronze.products"
df = spark.sql(bronze_products_query)
display(df)

In [0]:
# Remove exact duplicate rows from the bronze products table.
# DISTINCT is evaluated over all six selected columns, including read_timestamp
# and the file metadata columns, so rows that differ only in those columns are
# all kept.
dedup_query = """
    select distinct product_name,product_id,category,read_timestamp,_metadata_filename,_metadata_file_size from fmcg.bronze.products;
    """
df2 = spark.sql(dedup_query)

# Expose the de-duplicated rows to SQL cells under the name "remove_duplicates".
df2.createOrReplaceTempView("remove_duplicates")
display(df2)

In [0]:
%sql
-- Total row count of the bronze products table (before any de-duplication).
select count(*)
from fmcg.bronze.products

In [0]:
# Derive "variant" from product_name: capture group 1 of the pattern below,
# i.e. the text between the first "(" and the next ")", with surrounding
# whitespace trimmed. product_name itself is passed through unchanged (the
# parenthesised suffix is not stripped from it).
variant_pattern = r"\(([^)]+)\)"
variant_col = F.trim(F.regexp_extract(F.col("product_name"), variant_pattern, 1))

df3 = df2.select(
    "product_name",
    variant_col.alias("variant"),
    "product_id",
    "category",
    "read_timestamp",
    "_metadata_filename",
    "_metadata_file_size",
)
display(df3)

In [0]:
# product_code: SHA-256 digest of product_name, appended as a new column.
# NOTE(review): the hash is taken from product_name as it is at this point,
# i.e. before the "protien" spelling correction applied in the next cell --
# confirm that keying on the uncorrected name is intended.
product_code_col = F.sha2(F.col("product_name"), 256)

# product_id: replace the value "XYZ123" with "99999999"; every other value
# is kept as-is.
patched_product_id_col = (
    F.when(F.col("product_id") == "XYZ123", "99999999")
    .otherwise(F.col("product_id"))
)

df4 = df3.withColumn("product_code", product_code_col)
df4 = df4.withColumn("product_id", patched_product_id_col)
display(df4)

In [0]:
# Correct the misspelling "protien" wherever it appears as a whole word.
# (?i) makes the match case-insensitive; \b anchors it to word boundaries.
# product_name gets the capitalised replacement, category the lowercase one.
typo_pattern = r"(?i)\bprotien\b"

df5 = (
    df4
    .withColumn(
        "product_name",
        F.regexp_replace(F.col("product_name"), typo_pattern, "Protein"),
    )
    .withColumn(
        "category",
        F.regexp_replace(F.col("category"), typo_pattern, "protein"),
    )
)
display(df5)

In [0]:
# Map each category to its division. Matching is an exact (case-sensitive)
# equality test on the category string, evaluated in the order listed below;
# anything that matches none of the entries falls back to "Other".
division_by_category = [
    ("energy bars",       "Nutrition Bars"),
    ("protein bars",      "Nutrition Bars"),
    ("granola & cereals", "Breakfast Foods"),
    ("recovery dairy",    "Dairy & Recovery"),
    ("healthy snacks",    "Healthy Snacks"),
    ("electrolyte mix",   "Hydration & Electrolytes"),
]

# Build the WHEN ... WHEN ... OTHERWISE expression one branch at a time.
first_category, first_division = division_by_category[0]
division_col = F.when(F.col("category") == first_category, first_division)
for category_value, division_name in division_by_category[1:]:
    division_col = division_col.when(F.col("category") == category_value, division_name)
division_col = division_col.otherwise("Other")

df6 = df5.withColumn("division", division_col)

# Fix the column order for the silver table.
silver_columns = [
    "product_name",
    "variant",
    "product_id",
    "product_code",
    "category",
    "division",
    "read_timestamp",
    "_metadata_filename",
    "_metadata_file_size",
]
df7 = df6.select(*silver_columns)
display(df7)

In [0]:
# Persist the cleaned products as the silver Delta table. "overwrite" mode
# replaces the table's existing contents on every run; the writer is also
# given delta.enableChangeDataFeed=true.
silver_writer = (
    df7.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
)
silver_writer.saveAsTable("fmcg.silver.s_products")

In [0]:
# Read the silver products table back from the catalog and preview it.
silver_products_query = "select * from fmcg.silver.s_products"
df8 = spark.sql(silver_products_query)
display(df8)

In [0]:
# Shape the silver rows for the gold layer: product_name is exposed as
# "product", and only the five columns listed here are kept, in this order.
gold_columns = ["product_code", "division", "category", "product", "variant"]

df8 = df8.withColumnRenamed("product_name", "product")
df_final = df8.select(*gold_columns)
display(df_final)


In [0]:
# Persist the gold-shaped products as a Delta table. "overwrite" mode replaces
# the table's existing contents on every run; the writer is also given
# delta.enableChangeDataFeed=true.
gold_writer = (
    df_final.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
)
gold_writer.saveAsTable("fmcg.gold.sb_products")

In [0]:
# Read the gold products table back from the catalog; this DataFrame is the
# source for the merge into fmcg.gold.dim_products below.
gold_products_query = "select * from fmcg.gold.sb_products"
df_final_gold = spark.sql(gold_products_query)
display(df_final_gold)

In [0]:
# Upsert the gold products into the fmcg.gold.dim_products Delta table, keyed
# on product_code: matching rows are updated with all source columns, and
# source rows with no match are inserted.
target_table = DeltaTable.forName(spark, "fmcg.gold.dim_products")

# The source must be unique on the merge key. Upstream de-duplication is done
# over columns that include read_timestamp and file metadata, and the gold
# table drops those columns without de-duplicating again, so the same
# product_code can appear more than once here. Delta MERGE fails when several
# source rows match one target row, and would insert every duplicate when no
# target row matches yet. Keep one row per product_code; if rows sharing a
# product_code differ in other columns, which one survives is not
# deterministic.
merge_source = df_final_gold.dropDuplicates(["product_code"])

(
    target_table.alias("target")
    .merge(
        merge_source.alias("source"),
        "target.product_code = source.product_code",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)