In [None]:
import pyspark.sql.functions as f
from pyspark.sql import Window

Let's answer some questions about our FruitShop orders data.

Load the data and recall the schema.

In [None]:
# Read the FruitShop orders from Parquet; later cells all start from this DataFrame.
df_fruitshop = (
    spark.read
    .format('parquet')
    .load('/FileStore/lp-big-data/fruitshop.parquet')
)

# Print the column names and types as a reminder of the nested structure.
df_fruitshop.printSchema()

Answer the following questions. Good luck!

1. What is the general average number of different items sold per order?

In [None]:
# Per order: count the distinct item names found in `items`.
# Then average that count over all orders.
(
    df_fruitshop
    .select(
        f.size(
            f.array_distinct(
                f.transform(f.col('items'), lambda item: item['name'])
            )
        ).alias("num_items")
    )
    .agg(f.avg("num_items").alias("average_items_per_order"))
).display()

2. What is the total amount of each fruit sold?

***Hint:*** Start by transforming the data into a more usable format.

In [None]:
# Flatten `items` to one row per item, then add up the quantity for each fruit name.
(
    df_fruitshop
    # One row per element of the `items` array, kept as a struct column
    .select(f.explode('items').alias('item'))
    # Grouping on the nested field yields an output column called `name`
    .groupBy('item.name')
    .agg(f.sum('item.quantity').alias('total_quantity'))
).display()

3. What was the total amount of `Peach` sold in orders where `Peach` was on discount?

In [None]:
# Total Peach quantity, counting only orders whose `items_discount` array contains 'Peach'.
(
    df_fruitshop
    # Order-level condition first: keep orders that list Peach in `items_discount`
    .where(f.array_contains(f.col('items_discount'), 'Peach'))
    # One row per item of the remaining orders
    .select(f.explode(f.col('items')).alias('item'))
    # Item-level condition: keep only the Peach rows
    .where(f.col('item.name') == 'Peach')
    .agg(f.sum('item.quantity').alias('total_amount'))
).display()

4. What is the price of the most expensive item in each order?

In [None]:
# For every order, add the array of its item prices and the largest value in that array.
(
    df_fruitshop
    # Pull the `price` field of each item into a separate array column
    .withColumn('items_price', f.transform('items', lambda item: item['price']))
    # The largest element of that array is the order's most expensive item price
    .withColumn('max_price', f.array_max('items_price'))
).display()

5. What is the name of the most expensive item in each order?

***Hint:*** Use a window function.

In [None]:
# Window spanning all item rows that belong to the same order; used to attach
# the order's highest item price to each of its rows.
window = Window.partitionBy('order_id')

(
    df_fruitshop
    # One row per item, with the item struct's fields as top-level columns
    .select(
        'order_id',
        f.inline('items')
    )
    # Highest item price within the order, repeated on every row of that order
    .withColumn(
        'max_price',
        f.max(f.col('price')).over(window)
    )
    # Keep the item(s) priced at the order maximum; ties keep every tied name
    .filter(
        f.col('price') == f.col('max_price')
    )
    .select(
        'order_id',
        'name'
    )
    # De-duplicate AFTER projecting. Doing it on the full item row would keep
    # the same (order_id, name) pair twice when the fruit appears more than
    # once in an order with different quantities.
    .dropDuplicates()
).display()

6. What is the total amount of each fruit that is sold in orders where the number of unique items is greater than the general average?

Let's break down the question:
- Calculate the general average number of unique items sold per order.
- Filter the orders where the number of unique items is greater than the general average.
- Calculate the total amount of each fruit sold in the filtered orders.

In [None]:
# Step 1: average number of distinct item names per order, fetched as a plain
# Python value so it can be used as a literal threshold below.
avg_items_per_order = (
    df_fruitshop
    .select(
        f.size(
            f.array_distinct(
                f.transform(f.col('items'), lambda item: item['name'])
            )
        ).alias("unique_num_items")
    )
    .agg(f.avg("unique_num_items").alias("average_items_per_order"))
).first()[0]

# Step 2: keep orders whose distinct-item count is strictly above that average.
# Step 3: flatten their items and total the quantity per fruit name.
(
    df_fruitshop
    .where(
        f.size(
            f.array_distinct(
                f.transform(f.col('items'), lambda item: item['name'])
            )
        ) > f.lit(avg_items_per_order)
    )
    .select(f.explode('items').alias('item'))
    .groupBy('item.name')
    .agg(f.sum('item.quantity').alias('total_quantity'))
).display()