In [3]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *

# Make PySpark's worker and driver processes use the same Python interpreter
# that is running this kernel.
os.environ.update({
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

# Create the session for this notebook (or reuse one that already exists):
# local master "local[3]", with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Spark Aggregate Function Demo")
    .master("local[3]")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Load the invoice data from CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
invoice_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("invoices.csv")
)

# Preview the first five rows.
invoice_df.show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|     NULL|WHITE HANGING HEA...|       6|01-12-2010 8.26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|01-12-2010 8.26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|01-12-2010 8.26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|01-12-2010 8.26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|01-12-2010 8.26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [11]:
# Whole-table aggregates built from column functions: row count, total
# quantity, mean unit price, and the number of distinct invoice numbers.
# Only the row count is aliased; the other columns keep Spark's generated names.
invoice_df.select(
    f.count("*").alias("Count *"),
    f.sum("Quantity"),
    f.avg("UnitPrice"),
    f.count_distinct("InvoiceNo"),
).show()

+-------+-------------+-----------------+-------------------------+
|Count *|sum(Quantity)|   avg(UnitPrice)|count(DISTINCT InvoiceNo)|
+-------+-------------+-----------------+-------------------------+
| 541909|      5176450|4.611113626088481|                    25900|
+-------+-------------+-----------------+-------------------------+



In [12]:
# Whole-table aggregates written as SQL expression strings.
# count(1) counts every row, whereas count(StockCode) only counts rows whose
# StockCode is not null, so the two counts can differ.
invoice_df.selectExpr([
    "count(1) as `count 1`",
    "count(StockCode) as `count field`",
    "sum(Quantity) as TotalQuantity",
    "avg(UnitPrice) as AvgPrice",
]).show()

+-------+-----------+-------------+-----------------+
|count 1|count field|TotalQuantity|         AvgPrice|
+-------+-----------+-------------+-----------------+
| 541909|     541908|      5176450|4.611113626086849|
+-------+-----------+-------------+-----------------+



In [13]:
# Per-invoice summary with Spark SQL: expose the DataFrame as the temporary
# view "sales", then group by country and invoice number to get each invoice's
# total quantity and its value (quantity * unit price, rounded to 2 decimals).
invoice_df.createOrReplaceTempView("sales")

summary_sql = spark.sql(
    """
          SELECT Country, InvoiceNo,
                sum(Quantity) as TotalQuantity,
                round(sum(Quantity*UnitPrice),2) as InvoiceValue
          FROM sales
          GROUP BY Country, InvoiceNo"""
)

summary_sql.show()

+--------------+---------+-------------+------------+
|       Country|InvoiceNo|TotalQuantity|InvoiceValue|
+--------------+---------+-------------+------------+
|United Kingdom|   536446|          329|      440.89|
|United Kingdom|   536508|          216|      155.52|
|United Kingdom|   537018|           -3|         0.0|
|United Kingdom|   537401|          -24|         0.0|
|United Kingdom|   537811|           74|      268.86|
|United Kingdom|  C537824|           -2|       -14.9|
|United Kingdom|   538895|          370|      247.38|
|United Kingdom|   540453|          341|      302.45|
|United Kingdom|   541291|          217|      305.81|
|United Kingdom|   542551|           -1|         0.0|
|United Kingdom|   542576|           -1|         0.0|
|United Kingdom|   542628|            9|      132.35|
|United Kingdom|   542886|          199|      320.51|
|United Kingdom|   542907|           75|      313.85|
|United Kingdom|   543131|          134|       164.1|
|United Kingdom|   543189|  

In [14]:
# The same per-invoice grouping, expressed with the DataFrame API.
# InvoiceValue is composed from column functions, while InvoiceValueExpr is a
# single SQL expression string; both compute the same rounded invoice total.
summary_df = (
    invoice_df
    .groupBy("Country", "InvoiceNo")
    .agg(
        f.sum("Quantity").alias("TotalQuantity"),
        f.round(f.sum(f.expr("Quantity * UnitPrice")), 2).alias("InvoiceValue"),
        f.expr("round(sum(Quantity * UnitPrice),2) as InvoiceValueExpr"),
    )
)

summary_df.show()

+--------------+---------+-------------+------------+----------------+
|       Country|InvoiceNo|TotalQuantity|InvoiceValue|InvoiceValueExpr|
+--------------+---------+-------------+------------+----------------+
|United Kingdom|   536446|          329|      440.89|          440.89|
|United Kingdom|   536508|          216|      155.52|          155.52|
|United Kingdom|   537018|           -3|         0.0|             0.0|
|United Kingdom|   537401|          -24|         0.0|             0.0|
|United Kingdom|   537811|           74|      268.86|          268.86|
|United Kingdom|  C537824|           -2|       -14.9|           -14.9|
|United Kingdom|   538895|          370|      247.38|          247.38|
|United Kingdom|   540453|          341|      302.45|          302.45|
|United Kingdom|   541291|          217|      305.81|          305.81|
|United Kingdom|   542551|           -1|         0.0|             0.0|
|United Kingdom|   542576|           -1|         0.0|             0.0|
|Unite