In [None]:
import pyspark.sql.functions as f
from pyspark.sql import Window

Let's get back to our orders and products datasets and practice window functions!

**NOTE:** The orders and products datasets were downloaded and preprocessed in module 2. If you haven't run all the notebooks (including the exercise solutions) from module 2, please download the data and save it to the FileStore by running the cells below.

In [None]:
%sh

# Download the preprocessed datasets into the shell's working directory.
# -O pins the output file name so that re-running this cell overwrites the
# existing file instead of saving a second copy as "<name>.csv.1".
wget -O orders-preprocessed.csv https://raw.githubusercontent.com/inesmcm26/lp-big-data-mercedes/refs/heads/main/data/orders-preprocessed.csv
wget -O products-preprocessed.csv https://raw.githubusercontent.com/inesmcm26/lp-big-data-mercedes/refs/heads/main/data/products-preprocessed.csv

In [None]:
# Copy the downloaded orders file from the driver's local disk to the DBFS FileStore
dbutils.fs.cp(
    "file:/databricks/driver/orders-preprocessed.csv",
    "dbfs:/FileStore/lp-big-data/orders-data/orders-preprocessed.csv",
)

In [None]:
# Copy the downloaded products file from the driver's local disk to the DBFS FileStore
dbutils.fs.cp(
    "file:/databricks/driver/products-preprocessed.csv",
    "dbfs:/FileStore/lp-big-data/orders-data/products-preprocessed.csv",
)

Run the code below to load and join the datasets and take a look at the final dataset.

Next, answer the questions below.

In [None]:
def _read_preprocessed_csv(path):
    """Read a comma-separated file with a header row, letting Spark infer the column types."""
    reader = spark.read.format("csv").options(header="true", inferSchema="true", sep=",")
    return reader.load(path)


# Load the two preprocessed datasets from the FileStore
df_orders = _read_preprocessed_csv("/FileStore/lp-big-data/orders-data/orders-preprocessed.csv")
df_products = _read_preprocessed_csv("/FileStore/lp-big-data/orders-data/products-preprocessed.csv")

# Left join: keep every row of the orders dataset and attach the columns of the matching product
df_orders_products = df_orders.join(df_products, on=['product_id'], how='left')

df_orders_products.display()

1. Rank each customer based on the total amount of ordered products.

The ranking should have no gaps.

In [None]:
# Step 1: sum the 'amount' column per customer
total_amount_expr = f.sum('amount').alias('total_amount')
df_total_amount = df_orders_products.groupBy('customer_id').agg(total_amount_expr)

df_total_amount.display()

In [None]:
# Step 2: a single, unpartitioned window over all customers, highest 'total_amount' first
window = Window.orderBy(f.col('total_amount').desc())

# dense_rank() gives tied customers the same rank and leaves no gaps in the ranking
customer_rank = f.dense_rank().over(window).alias('customer_rank')

df_total_amount.select('*', customer_rank).display()

Here is the whole solution:

In [None]:
# Global window: customers sorted from the highest to the lowest 'total_amount'
window = Window.orderBy(f.col('total_amount').desc())

(
    df_orders_products
    # Total amount of ordered products per customer
    .groupBy('customer_id')
    .agg(f.sum('amount').alias('total_amount'))
    # Gap-free rank of each customer by that total
    .select('*', f.dense_rank().over(window).alias('customer_rank'))
).display()

2. For each order placed by a customer, what is the difference in days between the order's placing date and the placing date of their previous order?

In [None]:
# One window per customer, with that customer's rows ordered by 'placing_date' (ascending)
window = Window.partitionBy('customer_id').orderBy('placing_date')

# Placing date of the row right before the current one in the customer's window
# (null for the customer's first row)
prev_order_date = f.lag('placing_date').over(window)

# Number of days from the previous placing date to the current one
days_diff = f.datediff(f.col('placing_date'), prev_order_date)

df_orders_products.select(
    'customer_id',
    'order_id',
    'placing_date',
    prev_order_date.alias('prev_order_date'),
    days_diff.alias('days_diff'),
).display()

3. For each order placed by a customer, what is the difference in revenue between the current order and the average revenue of the customer's previous three orders?

In [None]:
# Frame = the three rows right before the current one (current row excluded),
# within each customer and ordered by 'placing_date'.
# The frame holds fewer than three rows for a customer's earliest rows.
window = (
    Window
    .partitionBy('customer_id')
    .orderBy('placing_date')
    .rowsBetween(-3, -1)
)

# Average revenue over that frame; null when the customer has no earlier row
avg_revenue = f.avg('revenue').over(window)

df_orders_products.select(
    'customer_id',
    'order_id',
    'placing_date',
    'revenue',
    avg_revenue.alias('avg_revenue'),
    # Current revenue minus the average revenue of the preceding rows
    (f.col('revenue') - avg_revenue).alias('revenue_diff'),
).display()

4. What is the yearly average profit increase or decrease (difference between year Y and year Y-1) for each supplier?

The average profit for a supplier is determined by calculating the average profit from all orders of products supplied by that supplier.

In [None]:
# Step 1: average profit per (year, supplier) pair
avg_profit_expr = f.avg('profit').alias('avg_profit')
df_avg_profit = df_orders_products.groupBy('order_year', 'supplier_id').agg(avg_profit_expr)

df_avg_profit.display()

In [None]:
# Step 2: one window per supplier, with its yearly rows ordered by 'order_year'
window = Window.partitionBy('supplier_id').orderBy('order_year')

# Average profit of the supplier's preceding row in the window (null for its first year).
# NOTE: lag() looks at the previous row, so this is year Y-1 only when the supplier
# has rows for consecutive years.
prev_year_avg_profit = f.lag('avg_profit').over(window)

df_avg_profit.select(
    '*',
    prev_year_avg_profit.alias('prev_year_avg_profit'),
    # Change in average profit compared with the preceding row
    (f.col('avg_profit') - prev_year_avg_profit).alias('profit_diff'),
).display()

Here is the whole solution:

In [None]:
# Define the window inside this cell so the solution is self-contained and does not
# depend on whatever value the shared `window` variable holds from a previously run cell:
# partition by 'supplier_id', order by 'order_year'
window = Window.partitionBy('supplier_id').orderBy('order_year')

(
    df_orders_products
    # Get the average profit for each supplier in each year
    .groupBy(['order_year', 'supplier_id'])
    .agg(f.avg('profit').alias('avg_profit'))
    # Get the previous year average profit for each supplier
    # (null for the first year in which a supplier appears)
    .withColumn('prev_year_avg_profit', f.lag('avg_profit').over(window))
    # Calculate the difference between the average profit and the previous year average profit
    .withColumn('profit_diff', f.col('avg_profit') - f.col('prev_year_avg_profit'))
).display()

5. **BONUS** Calculate the cumulative sum of average revenue generated by each customer over the years

Hint: Check the `rowsBetween()` documentation [here](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Window.rowsBetween.html#pyspark.sql.Window.rowsBetween)

In [None]:
# Running frame per customer: every row from the first year up to and including the current one
window = (
    Window.partitionBy('customer_id')
    .orderBy('order_year')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Average revenue of each customer in each year
df_yearly_avg_revenue = (
    df_orders_products
    .groupBy('customer_id', 'order_year')
    .agg(f.avg('revenue').alias('avg_revenue'))
)

# Running sum of those yearly averages for each customer
df_yearly_avg_revenue.withColumn('cum_revenue', f.sum('avg_revenue').over(window)).display()