In [None]:
import pyspark.sql.functions as f
from pyspark.sql import Window

Let's get back to our orders dataset and practice window functions!

Start by running the code below to load and join the datasets and take a look at the final dataset.

Next, answer the questions below.

In [None]:
# Read the preprocessed orders CSV (header row, inferred schema, comma-separated)
df_orders = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true", sep=",")
    .load("/FileStore/lp-big-data/preprocessed-data/orders-data/orders_preprocessed.csv")
)

# Read the preprocessed products CSV with the same reader options
df_products = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true", sep=",")
    .load("/FileStore/lp-big-data/preprocessed-data/orders-data/products_preprocessed.csv")
)

# Left join: keep every order row and attach the matching product columns
df_orders_products = df_orders.join(df_products, on='product_id', how='left')

df_orders_products.display()

1. Rank each customer based on the total amount of ordered products.

The ranking should have no gaps.

In [None]:
# Step 1: sum the ordered 'amount' per customer
df_total_amount = (
    df_orders_products
    .groupBy(f.col('customer_id'))
    .agg(
        f.sum(f.col('amount')).alias('total_amount')
    )
)

df_total_amount.display()

In [None]:
# Window over all customers, largest 'total_amount' first
window = Window.orderBy(f.col('total_amount').desc())

# dense_rank() gives tied customers the same rank and leaves no gaps afterwards
(
    df_total_amount
    .withColumn(
        'customer_rank',
        f.dense_rank().over(window)
    )
).display()

Here is the whole solution:

In [None]:
# Window over all customers, ordered by 'total_amount' in descending order
window = Window.orderBy(f.desc('total_amount'))

(
    df_orders_products
    # Get the total amount of ordered products by customer
    .groupBy('customer_id')
    .agg(f.sum('amount').alias('total_amount'))
    # Rank the customers by their total amount of ordered products
    # (dense_rank leaves no gaps in the ranking)
    .withColumn('customer_rank', f.dense_rank().over(window))
    # Render the result table, like the other solution cells do
).display()


2. For each customer, what is the difference in days between each order's placing date and the previous order's placing date?

In [None]:
# Per-customer window, rows ordered by 'placing_date'
window = (
    Window
    .partitionBy('customer_id')
    .orderBy('placing_date')
)

(
    df_orders_products
    # Placing date of the row one position earlier in the customer's window
    # (null when there is no earlier row)
    .withColumn('prev_order_date', f.lag('placing_date', 1).over(window))
    # Days elapsed from the previous placing date to the current one
    .withColumn('days_diff', f.datediff('placing_date', 'prev_order_date'))
    # Keep only the columns needed to read the answer
    .select(
        'customer_id', 'order_id', 'placing_date', 'prev_order_date', 'days_diff'
    )
).display()

3. For each customer, what is the difference in revenue between each of his orders and the average revenue of the 3 previously placed orders?

In [None]:
# Per-customer window ordered by 'placing_date'; the frame covers the
# three rows before the current one and excludes the current row
window = (
    Window
    .partitionBy('customer_id')
    .orderBy('placing_date')
    .rowsBetween(-3, -1)
)

(
    df_orders_products
    # Average revenue over the (up to) three preceding rows
    .withColumn('avg_revenue', f.avg(f.col('revenue')).over(window))
    # Current row's revenue minus that trailing average
    .withColumn('revenue_diff', f.col('revenue') - f.col('avg_revenue'))
).display()

4. For each customer, what is the absolute difference in days between each order's delivery date and the next order's placing date, where the next order's placing date is before the current order's delivery date?

Let's break the question down:
- We want to calculate the difference in days between each order delivery date and the next order placing date
- The next order placing date should be before the current order delivery date

In [None]:
# Per-customer window, rows ordered by 'placing_date'
window = (
    Window
    .partitionBy('customer_id')
    .orderBy('placing_date')
)

(
    df_orders_products
    # Placing date of the row one position later in the customer's window
    .withColumn('next_placing_date', f.lead('placing_date', 1).over(window))
    # Days from the next placing date to the current delivery date
    .withColumn('days_diff', f.datediff('delivery_date', 'next_placing_date'))
    # Keep only rows whose next order was placed before this one was delivered
    .where(f.col('next_placing_date') < f.col('delivery_date'))
    .select(
        'customer_id', 'placing_date', 'delivery_date', 'next_placing_date', 'days_diff'
    )
).display()

5. What is the yearly average profit increase or decrease (difference between year Y and year Y-1) for each supplier?

In [None]:
# First, get the average profit for each supplier in each year
df_avg_profit = (
    df_orders_products
    .groupBy(['order_year', 'supplier_id'])
    .agg(f.avg('profit').alias('avg_profit'))
)

df_avg_profit.display()

In [None]:
# Step 2: per-supplier window with the years in ascending order
window = (
    Window
    .partitionBy('supplier_id')
    .orderBy('order_year')
)

(
    df_avg_profit
    # Average profit from the supplier's preceding row (null for the first row)
    .withColumn(
        'prev_year_avg_profit',
        f.lag('avg_profit', 1).over(window)
    )
    # Change in average profit relative to that preceding row
    .withColumn(
        'profit_diff',
        f.col('avg_profit') - f.col('prev_year_avg_profit')
    )
).display()

Here is the whole solution:

In [None]:
# Define the window here so this cell is self-contained and does not depend on
# whichever cell last assigned `window`: partition by 'supplier_id', order by 'order_year'
window = Window.partitionBy('supplier_id').orderBy('order_year')

(
    df_orders_products
    # Get the average profit for each supplier in each year
    .groupBy(['order_year', 'supplier_id'])
    .agg(f.avg('profit').alias('avg_profit'))
    # Get the previous year average profit for each supplier
    .withColumn('prev_year_avg_profit', f.lag('avg_profit').over(window))
    # Calculate the difference between the average profit and the previous year average profit
    .withColumn('profit_diff', f.col('avg_profit') - f.col('prev_year_avg_profit'))
).display()

6. Determine the top 3 products with the highest profit margin, for each supplier country.

Profit margin refers to the ratio between profit and revenue:

$profit\_margin_{product} = profit_{product} / revenue_{product}$

where

$profit_{product} = revenue_{product} - cost_{product}$

Let's break this question down:
- To calculate the profit of each product in each supplier country, we need to calculate its total revenue and cost for each supplier country
- To calculate the total cost of each product, we need to multiply the cost per unit by the quantity ordered in each order
- Next, the profit is calculated as the difference between total revenue and total cost
- The profit margin is calculated as the ratio between profit and total revenue
- Finally, the products are ranked by profit margin and the top 3 are selected

In [None]:

# Step 1: cost of each row = ordered amount x cost per unit
df_cost = df_orders_products.withColumn(
    'cost',
    f.col('amount') * f.col('cost_per_unit'),
)

df_cost.display()

In [None]:
# Step 2: total cost and total revenue per (supplier country, product) pair
df_grouped = (
    df_cost
    .groupBy('supplier_country', 'product_id')
    .agg(
        f.sum(f.col('cost')).alias('total_cost'),
        f.sum(f.col('revenue')).alias('total_revenue'),
    )
)

df_grouped.display()

In [None]:
# Step 3: derive profit and profit margin from the grouped totals
df_profit_margin = (
    df_grouped
    # profit = total revenue - total cost
    .withColumn(
        'profit',
        f.col('total_revenue') - f.col('total_cost')
    )
    # profit margin = profit / total revenue
    .withColumn(
        'profit_margin',
        f.col('profit') / f.col('total_revenue')
    )
)

df_profit_margin.display()

In [None]:
# Step 4: rank the products by profit margin within each supplier country

# Per-country window, highest 'profit_margin' first
window = (
    Window
    .partitionBy('supplier_country')
    .orderBy(f.col('profit_margin').desc())
)

(
    df_profit_margin
    # rank() gives tied products the same rank, so ties can yield more than 3 rows
    .withColumn('rank', f.rank().over(window))
    # Keep ranks 1 to 3 in each country
    .where(f.col('rank') <= 3)
).display()

Here is the whole solution:

In [None]:
# Define the window here so this cell is self-contained and does not depend on
# whichever cell last assigned `window`: partition by 'supplier_country',
# order by 'profit_margin' in descending order
window = Window.partitionBy('supplier_country').orderBy(f.desc('profit_margin'))

(
    df_orders_products
    # Calculate cost per order
    .withColumn('cost', f.col('amount') * f.col('cost_per_unit'))
    # Get total cost and revenue of each product, for each supplier country
    .groupBy(['supplier_country', 'product_id'])
    .agg(
        f.sum('cost').alias('total_cost'),
        f.sum('revenue').alias('total_revenue')
    )
    # Calculate the profit
    .withColumn('profit', f.col('total_revenue') - f.col('total_cost'))
    # Calculate the profit margin
    .withColumn('profit_margin', f.col('profit') / f.col('total_revenue'))
    # Rank the products by their profit margin
    .withColumn('rank', f.rank().over(window))
    # Filter the top 3 products by profit margin
    .filter(f.col('rank') <= 3)
).display()

7. **BONUS** Calculate the cumulative sum of average revenue generated by each customer over the years

Hint: Check the `rowsBetween()` documentation [here](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Window.rowsBetween.html#pyspark.sql.Window.rowsBetween)

In [None]:
# Per-customer window ordered by year; the frame runs from the customer's
# first row up to and including the current row
window = (
    Window
    .partitionBy('customer_id')
    .orderBy('order_year')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

(
    df_orders_products
    # Average revenue for each customer in each year
    .groupBy(['customer_id', 'order_year'])
    .agg(f.avg(f.col('revenue')).alias('avg_revenue'))
    # Running sum of the yearly averages for each customer
    .withColumn('cum_revenue', f.sum(f.col('avg_revenue')).over(window))
).display()