In [None]:
import pyspark.sql.functions as f

In the PySpark DataFrames - Part 2 module, you learned how to join dataframes, perform aggregations, and create pivot tables to answer business questions.

In this notebook there are some more of those questions that you can answer using PySpark DataFrames methods and SQL functions.

Some of them will require you to look into the documentation to find the right function to use, which I think is the best way to learn how to use PySpark.

So, let's get started!

First, let's get the preprocessed orders and products dataframes like in the theoretical notebook.

In [None]:
# Load the two preprocessed CSV files. For both, the first line is read as the
# header, column types are inferred from the data, and fields are separated
# by commas.
df_orders = spark.read.csv(
    "/FileStore/lp-big-data/preprocessed-data/orders-data/orders_preprocessed.csv",
    header="true",
    inferSchema="true",
    sep=",",
)

df_products = spark.read.csv(
    "/FileStore/lp-big-data/preprocessed-data/orders-data/products_preprocessed.csv",
    header="true",
    inferSchema="true",
    sep=",",
)

Join the tables on the product_id column and store the result in a new dataframe called df_orders_products.

For the exercises, ignore all orders that don't have a matching product in the products dataframe.

Then, show the first 5 rows of the df_orders_products dataframe.

In [None]:
# Inner join on product_id: orders without a matching row in df_products are
# dropped, which is what the exercises ask for.
df_orders_products = df_orders.join(df_products, on='product_id', how='inner')

# Preview the first 5 joined rows.
df_orders_products.show(5)

1. Are fast deliveries cheaper or more expensive for customers than other deliveries? And are they more or less profitable for suppliers?

In [None]:
# Average total price and average profit for each delivery speed, so the
# speeds can be compared side by side.
(
    df_orders_products
    .groupBy('delivery_speed')
    .agg({
        # Dict form of agg: column name -> name of the aggregate to apply.
        'total_price': 'avg',
        'profit': 'avg',
    })
).display()

2. What is the average lead time of each product considering only the 100 most profitable products?

***Hint:*** Break down the problem into smaller steps.

First, calculate the average profit and lead time for each product.
Then select only the 100 most profitable products and select the average lead time for them.

In [None]:
# Per product: average profit and average lead time. Products are then ranked
# by average profit (highest first), cut to the top 100, and only the product
# id and its average lead time are kept for display.
(
    df_orders_products
    .groupBy('product_id')
    .agg(
        f.avg('profit').alias('avg_profit'),
        f.avg('lead_time').alias('avg_lead_time'),
    )
    .sort(f.col('avg_profit').desc())
    .limit(100)
    .select('product_id', 'avg_lead_time')
).display()

3. What is the variance of total revenue generated per month in each country?

In [None]:
# Variance of the revenue values within each (order month, supplier country)
# group.
(
    df_orders_products
    .groupBy('order_month', 'supplier_country')
    .agg(
        f.variance(f.col('revenue')).alias('revenue_variance')
    )
).display()

4. How many orders of each product category were placed, considering only orders whose number of days to delivery is greater than the median (i.e., slower-than-median deliveries)?

***Hint:*** Start by calculating the median delivery speed separately. Look into the [documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html) to find the right function to use.

In [None]:
# approxQuantile returns one value per requested probability; 0.5 asks for the
# median. The third argument (0.01) is the allowed relative error, so the
# result is an approximation of the median rather than the exact value.
median_delivery_speed = df_orders_products.approxQuantile(
    'days_to_delivery', [0.5], 0.01
)[0]

# Count, per product category, the orders that took longer than that median.
(
    df_orders_products
    .where(f.col('days_to_delivery') > median_delivery_speed)
    .groupBy('product_category')
    .count()
).display()

5. How many orders were placed on each day of the week, considering only orders placed in the first half of each year?

In [None]:
# Keep orders whose order_month is at most 6 (the first half of the year,
# assuming months are numbered 1-12 -- confirm against the preprocessing),
# then count them per day of the week.
(
    df_orders_products
    .where(f.col('order_month') <= 6)
    .groupBy('order_day_of_week')
    .count()
).display()

6. How many unique products were announced in each supplier continent before 15-05-2014?

In [None]:
# Keep rows announced strictly before 2014-05-15, then count the distinct
# product ids per supplier continent.
(
    df_orders_products
    .where(
        f.col('announcement_date') < f.to_date(f.lit('2014-05-15'))
    )
    .groupBy('supplier_continent')
    .agg(
        f.countDistinct(f.col('product_id')).alias('number_of_products')
    )
).display()

7. How many orders in each supplier country had a delivery delay greater than the global average delivery delay?

In [None]:
# Global average of days_to_delivery over all joined rows, collected to the
# driver as a single value. NOTE: despite its name, this variable holds an
# average number of days, not a date.
avg_delivery_date = (
    df_orders_products
    .agg(f.avg('days_to_delivery'))
    .first()
)[0]

# Count, per supplier country, the orders that took longer than that average.
(
    df_orders_products
    .where(f.col('days_to_delivery') > avg_delivery_date)
    .groupBy('supplier_country')
    .count()
).display()

8. What is the average number of products in each order per customer status, considering only customers who placed more than 5 orders?

In [None]:
# Customers with more than 5 rows in the joined data (each row is treated as
# one order here -- confirm that the data has one row per order).
df_customer_orders = (
    df_orders_products
    .groupBy('customer_id')
    .count()
    .where(f.col('count') > 5)
)

# Restrict the orders to those customers via an inner join, then average
# `amount` per customer status (presumably `amount` is the number of products
# in the order -- verify against the preprocessing notebook).
(
    df_orders_products
    .join(df_customer_orders, on=['customer_id'], how='inner')
    .groupBy('customer_status')
    .agg(f.avg(f.col('amount')).alias('avg_amount'))
).display()

9. For each product line, what is the ratio of the number of unique products to the total number of orders?

In [None]:
# Per product line: number of distinct product ids, number of non-null order
# ids, and the ratio of the former to the latter.
product_line_counts = (
    df_orders_products
    .groupBy('product_line')
    .agg(
        f.countDistinct('product_id').alias('unique_products'),
        f.count('order_id').alias('total_nr_orders'),
    )
)

product_line_counts.withColumn(
    'ratio',
    product_line_counts['unique_products'] / product_line_counts['total_nr_orders'],
).display()

10. Which supplier has the highest average delivery delay among the suppliers who have delivered products to at least 150 different customers?

In [None]:
# Per supplier: average days to delivery and number of distinct customers.
# The question asks for suppliers with "at least 150" different customers, so
# the threshold is inclusive (>= 150); a strict > would wrongly drop suppliers
# with exactly 150 customers. The supplier with the highest average delay is
# then shown.
(
    df_orders_products
    .groupBy(['supplier_id', 'supplier_name'])
    .agg(
        f.avg('days_to_delivery').alias('avg_delivery_delay'),
        f.countDistinct('customer_id').alias('nr_unique_customers')
    )
    .filter(f.col('nr_unique_customers') >= 150)
    .orderBy(f.desc('avg_delivery_delay'))
).show(1)