To perform total aggregations, we can also leverage the `agg` function.

In [0]:
# Load the order items data set from its JSON files into a DataFrame.
order_items = (
    spark
    .read
    .json('/public/retail_db_json/order_items')
)

In [0]:
# Inspect the column names and data types of the loaded data
# (no explicit schema was passed to the reader above).
order_items.dtypes

* Get revenue using `order_item_subtotal` for a given `order_item_order_id` (e.g., 2)

In [0]:
# Keep only the items that belong to order 2; the resulting DataFrame is the
# cell's value (use show() to print the matching rows).
(
    order_items
    .filter('order_item_order_id = 2')
)

In [0]:
# Print the items that belong to order 2.
(
    order_items
    .filter('order_item_order_id = 2')
    .show()
)

In [0]:
# NOTE: this rebinds the name `sum` in the notebook namespace, so Python's
# built-in sum() is shadowed by the Spark column function from here on.
from pyspark.sql.functions import sum

In [0]:
# Show the documentation of the `sum` imported from pyspark.sql.functions
# (not Python's built-in sum).
help(sum)

In [0]:
# Show the documentation of `agg` as exposed directly on the DataFrame
# (aggregation without a preceding groupBy).
help(order_items.agg)

In [0]:
# For comparison, show the documentation of `agg` on the object returned by groupBy.
help(order_items.groupBy('order_item_order_id').agg)

In [0]:
# Total revenue of order 2; the result column keeps its auto-generated name.
(
    order_items
    .filter('order_item_order_id = 2')
    .agg(sum('order_item_subtotal'))
    .show()
)

In [0]:
# Same aggregation, with the result column given a readable name via alias.
(
    order_items
    .filter('order_item_order_id = 2')
    .agg(
        sum('order_item_subtotal').alias('order_revenue')
    )
    .show()
)

* Get number of items, total quantity as well as revenue for a given `order_item_order_id` (e.g., 2)
  * Number of items can be computed using `count` on `order_item_quantity`.
  * Total quantity can be computed using `sum` on `order_item_quantity`.
  * Total Revenue can be computed using `sum` on `order_item_subtotal`.

In [0]:
from pyspark.sql.functions import count

In [0]:
# Preview the rows of order 2 that the aggregations below operate on.
(
    order_items
    .filter('order_item_order_id = 2')
    .show()
)

In [0]:
# Compute item count, total quantity and total revenue for order 2 in a
# single agg call, naming each result column explicitly.
(
    order_items
    .filter('order_item_order_id = 2')
    .agg(
        count('order_item_quantity').alias('order_item_count'),
        sum('order_item_quantity').alias('order_quantity'),
        sum('order_item_subtotal').alias('order_revenue'),
    )
    .show()
)

In [0]:
# Dict form of agg: each key is a column name and each value is the name of
# the aggregate function to apply to it. Because dict keys are unique, we can
# only perform one aggregation per column using this approach.
#
# The result columns are named `<function>(<column>)`. Rename them by name
# rather than positionally with toDF: the order of the generated columns is
# not guaranteed to follow the order of the dict entries, so positional
# renaming could attach the wrong label to a column. The final select pins
# the output column order.
order_items. \
    filter('order_item_order_id = 2'). \
    agg(
        {'order_item_quantity': 'count', 'order_item_subtotal': 'sum'}
    ). \
    withColumnRenamed('count(order_item_quantity)', 'order_item_count'). \
    withColumnRenamed('sum(order_item_subtotal)', 'order_revenue'). \
    select('order_item_count', 'order_revenue'). \
    show()