In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Customer dimension: read it from the catalog table and preview it.
customer = spark.table("workspace.default.customers")
display(customer)

# Hand-written sample orders, one tuple per order in the column order below.
# OrderDate starts out as a "yyyy-MM-dd" string and is converted to a date
# right after the DataFrame is created.
orders_columns = ["OrderId", "CustomerId", "Amount", "OrderDate"]
orders_data = [
    (101, 1, 120.0, "2025-12-01"),
    (102, 1, 75.0, "2025-12-02"),
    (103, 2, 50.0, "2025-12-03"),
    (104, 4, 200.0, "2025-12-05"),
    (106, 4, 210.0, "2025-12-07"),  # newer order for customer 4
    (105, 99, 60.0, "2025-12-06"),
]

orders_raw = spark.createDataFrame(orders_data, orders_columns)
orders = orders_raw.withColumn("OrderDate", F.to_date("OrderDate"))
display(orders)
#out = "/Volumes/workspace/default/mainvol/deepika/week1/orders_parquet"
#orders.write.mode("overwrite").parquet(out)
#display(spark.read.parquet(out))


In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Rank each customer's orders newest-first. OrderId is a secondary sort key so
# that row_number() stays deterministic when two orders of the same customer
# share an OrderDate (without it, the "latest" row could differ between runs).
w = (
    Window
    .partitionBy(F.col("CustomerId"))
    .orderBy(F.col("OrderDate").desc(), F.col("OrderId").desc())
)
ranked = orders.withColumn("rn", F.row_number().over(w))

# rn == 1 is the most recent order of each customer.
latest = ranked.filter(F.col("rn") == 1)
display(latest)

# Left join keeps every customer; customers without any order get NULLs in
# the order columns. The helper column "rn" is dropped by the select.
cust_latest = (
    customer
    .join(latest, on="CustomerId", how="left")
    .select("CustomerId", "CustomerName", "Country", "OrderId", "Amount", "OrderDate")
    .orderBy("CustomerId")
)
display(cust_latest)

# Customers whose latest order is at least 100. Rows with a NULL Amount
# (customers with no order) do not satisfy the predicate and are filtered out.
big_latest = cust_latest.filter(F.col("Amount") >= 100)
display(big_latest.orderBy(F.col("Amount").desc()))




In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
# Sample order history: (CustomerId, OrderId, Amount, OrderDate, UpdatedAt).
# Both date columns are given as "yyyy-MM-dd" strings and converted below.
orders_hist_data = [
    (1, 101, 120.0, "2025-12-01", "2025-12-01"),
    (1, 102,  75.0, "2025-12-02", "2025-12-02"),
    (1, 103,  50.0, "2025-12-03", "2025-12-03"),
    (1, 107, 130.0, "2025-12-08", "2025-12-08"),
    (1, 108,  50.0, "2025-12-10", "2025-12-10"),
    (2, 103,  50.0, "2025-12-03", "2025-12-03"),
    (2, 108,  50.0, "2025-12-10", "2025-12-10"),
    (4, 104, 200.0, "2025-12-05", "2025-12-05"),
    (4, 106, 210.0, "2025-12-07", "2025-12-07")
]

orders_hist = (spark.createDataFrame(orders_hist_data, ["CustomerId","OrderId","Amount","OrderDate","UpdatedAt"])
               .withColumn("OrderDate", F.to_date("OrderDate"))
               .withColumn("UpdatedAt", F.to_date("UpdatedAt"))
)
display(orders_hist)

# lag() returns the value from the preceding row *in window order*, so the
# window must be sorted oldest-first for "previousamount" to be the amount of
# the chronologically previous order. (Sorting descending would make lag()
# return the NEXT order's amount and flip the sign of Delta.)
# OrderId breaks ties between orders placed on the same date.
w = (
    Window
    .partitionBy("CustomerId")
    .orderBy(F.col("OrderDate").asc(), F.col("OrderId").asc())
)

changes = (orders_hist
           .withColumn("previousamount", F.lag("Amount").over(w))
           # Delta > 0 means the customer spent more than on their previous order.
           .withColumn("Delta", F.col("Amount") - F.col("previousamount"))
           # A customer's first order has no predecessor and is flagged as unchanged.
           .withColumn("Ischanged",
                       F.when(F.col("previousamount").isNull(), F.lit(0))
                       .when(F.col("Amount") != F.col("previousamount"), F.lit(1))
                       .otherwise(F.lit(0))
           ))
display(changes.filter(F.col("Ischanged") == 1))

In [0]:
import pyspark.sql.functions as F

# Seed the silver Delta table from orders_hist (the initial snapshot built in
# the previous cell). mode("overwrite") replaces the table contents on every
# run, so re-running this cell starts again from the same baseline.
orders_hist.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable("workspace.default.orders_silver")
display(spark.table("workspace.default.orders_silver"))

# Incremental batch: (CustomerId, OrderId, Amount, OrderDate).
incremental_data = [
    (1, 107, 150.0, "2025-12-08"),  # updated amount (was 130)
    (2, 109,  90.0, "2025-12-12")   # new order
]

incr = (spark.createDataFrame(incremental_data, ["CustomerId","OrderId","Amount","OrderDate"])
        .withColumn("OrderDate", F.to_date("OrderDate"))
        # The target's UpdatedAt column is a DATE (orders_hist builds it with
        # to_date), so stamp the batch with current_date() rather than
        # current_timestamp(); this keeps source and target types identical
        # instead of relying on an implicit timestamp -> date cast in MERGE.
        .withColumn("UpdatedAt", F.current_date())
)
display(incr)

# Expose the batch to SQL under the name used by the MERGE statement below.
incr.createOrReplaceTempView("src")

# Upsert keyed on (CustomerId, OrderId): matching rows get the new Amount and
# UpdatedAt, unmatched source rows are inserted as-is.
spark.sql("""
          MERGE into workspace.default.orders_silver tgt
          using src
          on tgt.CustomerId=src.CustomerId and tgt.OrderId=src.OrderId
          when matched then update set tgt.Amount=src.Amount, tgt.UpdatedAt=src.UpdatedAt
          when not matched then insert *            
          """)
display(spark.table("workspace.default.orders_silver"))

