In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import concat_ws, collect_list, split, explode, row_number

# Sample purchase records: one tuple per purchase, grouped here by employee.
# Field order in each tuple matches `cols` below; purchase_date is kept as a
# "YYYY-MM-DD" string.
data = [
    # emp 1 - John, Engineering
    (1, "John", "Engineering", "Laptop", 1200, "2023-01-10"),
    (1, "John", "Engineering", "Mouse", 40, "2023-01-11"),
    (1, "John", "Engineering", "Keyboard", 80, "2023-01-12"),
    # emp 2 - Jane, HR
    (2, "Jane", "HR", "Laptop", 1100, "2023-02-01"),
    (2, "Jane", "HR", "Mouse", 35, "2023-02-03"),
    # emp 3 - Sam, Engineering
    (3, "Sam", "Engineering", "Laptop", 1300, "2023-03-05"),
    (3, "Sam", "Engineering", "Monitor", 300, "2023-03-06"),
    (3, "Sam", "Engineering", "Mouse", 45, "2023-03-07"),
    # emp 4 - Alice, Finance
    (4, "Alice", "Finance", "Laptop", 1250, "2023-04-01"),
    (4, "Alice", "Finance", "Keyboard", 90, "2023-04-02"),
    # emp 5 - Bob, HR
    (5, "Bob", "HR", "Laptop", 1000, "2023-04-10"),
]

# Column names, positionally aligned with the tuples in `data`.
cols = [
    "emp_id",
    "emp_name",
    "department",
    "product",
    "amount",
    "purchase_date",
]

# Only names are supplied as the schema, so Spark infers the column types
# from the Python values.
df = spark.createDataFrame(data=data, schema=cols)

# Scenario 1: List all products per employee (string + aggregation)
# collect_list() on its own gives no ordering guarantee after the groupBy
# shuffle, so the joined string could differ from run to run. Each product is
# therefore collected together with its purchase_date, the structs are sorted
# (date first, product name as tie-breaker), and only the product field is
# kept. The result is a stable, purchase-ordered "a|b|c" string per employee.
products_in_purchase_order = F.sort_array(
    F.collect_list(F.struct("purchase_date", "product"))
)["product"]
df_collect_list = df.groupBy("emp_id", "emp_name").agg(
    F.concat_ws("|", products_in_purchase_order).alias("Products_list")
)
display(df_collect_list)

# Scenario 2: Convert product list back to individual rows
# split() takes a regex, so the pipe delimiter is escaped. explode() then
# emits one row per array element into a new "products" column, while the
# original Products_list column is kept alongside it.
df_explode = df_collect_list.withColumn(
    "products",
    F.explode(F.split(F.col("Products_list"), r"\|")),
)
display(df_explode)

# Scenario 3: Find total spend per employee + rank within department
# Step 1: collapse to one row per (emp_id, department) with the summed amount.
df_total_sum = df.groupBy("emp_id", "department").agg(
    F.sum("amount").alias("total_amount")
)

# Step 2: dense-rank employees inside each department by total_amount in
# ascending order, i.e. rank 1 is the smallest total in the department.
windows_spec = Window.partitionBy("department").orderBy(
    F.col("total_amount").asc()
)
df_total_sum = df_total_sum.withColumn(
    "rank", F.dense_rank().over(windows_spec)
)
display(df_total_sum)

# Scenario 4: Get latest purchase per employee
# Number each employee's purchases from newest to oldest and keep row 1.
# The ordering column is referenced by its exact schema name
# ("purchase_date"), so the lookup does not rely on Spark's case-insensitive
# column resolution and still works when spark.sql.caseSensitive is enabled.
# purchase_date holds "YYYY-MM-DD" strings in this dataset, so descending
# string order is the same as newest-first date order.
w = Window.partitionBy("emp_id").orderBy(F.desc("purchase_date"))
df_Latest_purchase = (
    df.withColumn("RowNum", F.row_number().over(w))
    .filter(F.col("RowNum") == 1)
    .drop("RowNum")  # helper column is only needed for the filter
)
display(df_Latest_purchase)

#Scenario 5: Employees who purchased more than 2 distinct products