In [0]:
# Objective: derive two insights
# 1. Find the 5 best-selling items in each province
# 2. Measure promotion effectiveness across all supermarkets

In [0]:
import pandas as pd
from pyspark.sql.functions import sum, desc, row_number, when, col
from pyspark.sql.window import Window

1. Top 5 items sold in each province

In [0]:
# Step 1: Load the four source CSV files, treating the first row of each as the header.
# NOTE(review): no schema / inferSchema is passed, so Spark reads every column as a
# string and later sums rely on implicit casting — confirm this is intended.
sales_df = spark.read.option("header", True).csv("/FileStore/tables/sales.csv")
item_df = spark.read.option("header", True).csv("/FileStore/tables/item.csv")
promotion_df = spark.read.option("header", True).csv("/FileStore/tables/promotion.csv")
supermrkt_df = spark.read.option("header", True).csv("/FileStore/tables/supermarket.csv")

In [0]:
# Correct the misspelled column name in the item table so later cells can
# refer to it as "description".
item_df = item_df.withColumnRenamed(existing="descrption", new="description")

In [0]:
# Attach the item description to every sales row.
# A left join keeps sales rows even when their code has no match in item_df.
sales_items = (
    sales_df
    .join(item_df, on=(sales_df.code == item_df.code), how="left")
    .select(
        sales_df.code,
        sales_df.province,
        sales_df.units,
        item_df.description,
    )
)

In [0]:
# Total units sold for each (code, province, description) combination.
# `sum` here is pyspark.sql.functions.sum (imported above), not the Python builtin.
province_sales = (
    sales_items
    .groupBy("code", "province", "description")
    .agg(sum(col("units")).alias("total_units"))
)

In [0]:
# 2. Define a window to rank items per province.
# Rows are ordered by total units sold, highest first. Because row_number()
# assigns distinct consecutive numbers, items with equal total_units would
# otherwise be ranked in an arbitrary order that can change between runs and
# alter which rows make the top-5 cut. "code" and "description" (the remaining
# grouping keys of province_sales) are added as tie-breakers to make the
# ranking deterministic.
windowSpec = (
    Window
    .partitionBy("province")
    .orderBy(desc("total_units"), col("code"), col("description"))
)

In [0]:
# 3. Number the rows inside each province window; the position is stored
# in a new "rank" column.
ranked_df = province_sales.withColumn(
    colName="rank",
    col=row_number().over(windowSpec),
)

In [0]:
# 4. Keep only the rows whose rank within their province is 5 or lower.
top5_per_province = ranked_df.where(col("rank") <= 5)

2. Promotion Effectiveness

In [0]:
# 1. Left-join sales with promotions on code, supermarket, province and week.
# Sales rows without a matching promotion row are kept.
sales_promo_df = sales_df.join(
    promotion_df,
    ["code", "supermarket", "province", "week"],
    "left",
)

In [0]:
# Keep only the columns used by the promotion analysis.
sales_promo_df = sales_promo_df.select(
    [
        "code", "supermarket", "province",
        "amount", "units",
        "time", "basket", "day",
        "voucher", "feature", "display",
        "week",
    ]
)

In [0]:
#2. Adding a column to mark if the item was promoted
# "week" is one of the keys of the left join that built sales_promo_df, so it
# is filled from the sales side on every row and cannot distinguish promoted
# from non-promoted sales (testing it would flag every row as promoted).
# Instead, test non-key columns that are only populated when a promotion row
# matched.
# NOTE(review): assumes "feature" and "display" originate from promotion_df —
# confirm against the promotion.csv schema.
sales_promo_df = sales_promo_df.withColumn(
    "is_promoted",
    when(col("feature").isNotNull() | col("display").isNotNull(), 1).otherwise(0)
)

In [0]:
# Summarise sales per promotion flag: total units and total amount for
# each value of is_promoted.
promo_effectiveness = (
    sales_promo_df
    .groupBy("is_promoted")
    .agg(
        sum(col("units")).alias("total_units"),
        sum(col("amount")).alias("total_sales"),
    )
)