In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%sql
-- Delete four specific (product_id, gross_price) rows from the bronze table.
-- NOTE(review): presumably placeholder/test records — confirm before re-running.
DELETE FROM fmcg.bronze.gross_price
WHERE (
        product_id = 88888888
        AND (gross_price = -440 OR gross_price = -83 OR gross_price = 138)
      )
   OR (product_id = 99999999 AND gross_price = 132)

In [0]:
# Load the bronze gross-price table and register it as the temp view
# `main_df` so SQL cells can query the same snapshot.
df = spark.table("fmcg.bronze.gross_price")
df.createOrReplaceTempView("main_df")
display(df)

In [0]:
# Input layouts accepted for the raw `month` string, tried in this order.
# The first layout that parses wins, so an ambiguous value such as
# "03/04/2024" is read as dd/MM/yyyy (listed before MM/dd/yyyy).
month_formats = [
    "yyyy/MM/dd",
    "yyyy-MM-dd",
    "dd/MM/yyyy",
    "dd-MM-yyyy",
    "MM/dd/yyyy",
    "MM-dd-yyyy",
]

# Re-emit every parseable month as a "dd-MM-yyyy" string; values that match
# none of the layouts become NULL.
df1 = df.withColumn(
    "month",
    F.coalesce(
        *[
            F.date_format(F.try_to_date(F.col("month"), fmt), "dd-MM-yyyy")
            for fmt in month_formats
        ]
    ),
)
display(df1)

In [0]:
# Normalise `gross_price`:
#   * values that look numeric are made non-negative (negatives are flipped),
#   * everything else becomes 0 — this includes NULL, because rlike on NULL
#     yields NULL and therefore falls through to the outer otherwise().
#
# The pattern requires at least one digit. The earlier `-?\d*(\.\d+)?` form
# also matched "" and a lone "-", which passed the numeric check and were then
# kept as-is instead of being replaced by 0.
numeric_pattern = "^-?(\\d+(\\.\\d+)?|\\.\\d+)$"

df2 = df1.withColumn(
    "gross_price",
    F.when(
        F.col("gross_price").rlike(numeric_pattern),
        F.when(F.col("gross_price") < 0, F.col("gross_price") * -1)
         .otherwise(F.col("gross_price")),
    ).otherwise(0),
)
display(df2)

In [0]:
# Silver product table; joined to the price rows below to look up product_code.
df_product_code = spark.table("fmcg.silver.s_products")
display(df_product_code)

In [0]:
# Left join keeps every price row, even when its product_id has no match in
# the product table (product_code is NULL for those rows).
df3 = df2.join(
    df_product_code,
    on=df2["product_id"] == df_product_code["product_id"],
    how="left",
)

# Project the price-side columns plus the looked-up product_code. Columns that
# exist on both sides are taken explicitly from df2 to avoid ambiguity.
df4 = df3.select(
    df2["product_id"],
    "product_code",
    "month",
    "gross_price",
    df2["read_timestamp"],
    df2["_metadata_filename"],
    df2["_metadata_file_size"],
)
display(df4)

# Expose the joined result to SQL as `merged_table`.
df4.createOrReplaceTempView("merged_table")



In [0]:
# Keep exactly one row per product_id: the one with the earliest month.
#
# * `month` is a "dd-MM-yyyy" string at this point, so ordering by the raw
#   column would sort day-first ("01-12-2024" after "01-02-2025"). It is parsed
#   back to a date so the ordering is chronological.
# * row_number() is used instead of rank(): rank() gives every tied row the
#   value 1, which leaves several rows per product and later breaks the merge
#   keyed on product_code. Ties on month are broken by the most recent
#   read_timestamp.
# NOTE(review): rows whose month is NULL sort first under the default ascending
# order and would be the ones kept — confirm that is intended.
df5 = spark.sql("""
    select *
    from (
        select *,
               row_number() over (
                   partition by product_id
                   order by to_date(month, 'dd-MM-yyyy'), read_timestamp desc
               ) as rank_a
        from merged_table
    )
    where rank_a < 2
""")
display(df5)

In [0]:
# Fall back to the product_id (as a string) when the product lookup found no
# product_code for the row.
df6 = df5.withColumn(
    "product_code",
    F.coalesce(F.col("product_code"), F.col("product_id").cast("string")),
)
display(df6)

In [0]:
# Persist the cleaned prices as the silver Delta table, replacing any existing
# contents; the change-data-feed option is passed through to the writer.
(
    df6.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable("fmcg.silver.s_gross_price")
)

In [0]:
# Read the silver table back so the gold steps start from what was persisted.
df7 = spark.table("fmcg.silver.s_gross_price")
display(df7)


In [0]:
# Keep only the columns needed for gold and turn the "dd-MM-yyyy" month string
# back into a real date.
df8 = df7.select(
    "product_code",
    "gross_price",
    F.to_date("month", "dd-MM-yyyy").alias("month"),
)
display(df8)

In [0]:
# Gold shape: product_code, the price under the name `price_inr`, and the
# calendar year extracted from the month date.
df_final = df8.select(
    "product_code",
    F.col("gross_price").alias("price_inr"),
    F.year(F.col("month")).alias("year"),
)
display(df_final)

In [0]:
# Persist the gold-shaped prices as a Delta table, replacing any existing
# contents; the change-data-feed option is passed through to the writer.
(
    df_final.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable("fmcg.gold.sb_gross_price")
)

In [0]:
# Read the just-written gold table back; this is the source side of the merge.
df_final_gold = spark.table("fmcg.gold.sb_gross_price")
display(df_final_gold)

In [0]:
# Upsert the gold prices into the dimension table, matching on product_code:
# matched rows have all columns overwritten from the source, unmatched source
# rows are inserted.
target_table = DeltaTable.forName(spark, "fmcg.gold.dim_gross_price")

(
    target_table.alias("target")
    .merge(
        df_final_gold.alias("source"),
        "target.product_code = source.product_code",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
%sql
-- Duplicate check: list every product_code that occurs more than once in the
-- gold dimension table. An empty result means product_code is unique.
SELECT
  product_code,
  COUNT(*) AS total_records
FROM `fmcg`.`gold`.`dim_gross_price`
GROUP BY product_code
HAVING COUNT(*) > 1