In [0]:
PySpark Groupby Explained with Example
Similar to the SQL GROUP BY clause, the PySpark groupBy() function collects identical data into groups on a DataFrame so that aggregate functions such as count, sum, avg, min, and max can be applied to the grouped data. In this article, I will explain several groupBy() examples using PySpark (Spark with Python).
# Sample employee records; each tuple follows the column order given in `schema`.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Column names only; Spark infers the column types from the data.
schema = [
    "employee_name",
    "department",
    "state",
    "salary",
    "age",
    "bonus",
]

# `spark` is not created in this snippet; it must already exist in the session.
df = spark.createDataFrame(simpleData, schema)

# Print the inferred schema, then the rows without truncating cell values.
df.printSchema()
df.show(truncate=False)
2. PySpark groupBy on DataFrame Columns
Let’s do the groupBy() on the department column of the DataFrame and then find the sum of salary for each department using the sum() function.
df.groupBy("department").sum("salary").show(truncate=False)
Similarly, we can calculate the number of employees in each department using count().
df.groupBy("department").count()
Calculate the minimum salary of each department using min()
df.groupBy("department").min("salary")
Calculate the maximum salary of each department using max()
df.groupBy("department").max("salary")
Calculate the average salary of each department using avg()
df.groupBy("department").avg( "salary")
Calculate the mean salary of each department using mean()
df.groupBy("department").mean( "salary") 

In [0]:
import pyspark

# Sample employee records; each tuple follows the column order given in `schema`.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]
schema = ["employee_name", "department", "state", "salary", "age", "bonus"]

# `spark` is not created in this cell; it must already exist in the session.
df200 = spark.createDataFrame(simpleData, schema)
df200.printSchema()
df200.show()

# Group once by department and reuse the grouping for every aggregate below.
by_department = df200.groupBy("department")

# Display order: sum, row count, min, max, avg, mean of salary per department.
for summary in (
    by_department.sum("salary"),
    by_department.count(),
    by_department.min("salary"),
    by_department.max("salary"),
    by_department.avg("salary"),
    by_department.mean("salary"),
):
    summary.show()


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|     Sales|     257000|
|   Finance|   

In [0]:
3. Using Multiple columns
Similarly, we can also run groupBy and aggregate on two or more DataFrame columns. The example below groups by department and state, and applies sum() to the salary and bonus columns.
# GroupBy on multiple columns: one output row per (department, state) pair,
# with the summed salary and summed bonus for that pair.
# Python comments start with '#', and show() takes truncate=False here;
# '//' and a bare `false` are Scala syntax and fail in Python.
df.groupBy("department","state") \
    .sum("salary","bonus") \
    .show(truncate=False)

4. Running more aggregates at a time
Using the agg() aggregate function, we can calculate many aggregations at a time in a single statement using SQL functions such as sum(), avg(), min(), max(), mean(), etc. In order to use these, we should import "from pyspark.sql.functions import sum,avg,max,min,mean,count"
# NOTE: this import rebinds the names `sum` and `max` to the Spark column functions.
from pyspark.sql.functions import avg, max, sum

# A single agg() call produces all four per-department summaries.
(
    df.groupBy("department")
    .agg(
        sum("salary").alias("sum_salary"),
        avg("salary").alias("avg_salary"),
        sum("bonus").alias("sum_bonus"),
        max("bonus").alias("max_bonus"),
    )
    .show(truncate=False)
)
This example groups on the department column and calculates the sum() and avg() of salary, and the sum() and max() of bonus, for each department.
5. Using filter on aggregate data
Similar to the SQL “HAVING” clause, on a PySpark DataFrame we can use either the where() or the filter() function to filter the rows of aggregated data.
# `col` is imported as well: the where() clause below refers to the aggregated
# column through col("sum_bonus"), which raises NameError without this import.
# NOTE: this import rebinds the names `sum` and `max` to the Spark column functions.
from pyspark.sql.functions import sum,avg,max,col

# Aggregate per department, then keep only the departments whose total bonus
# is at least 50000 (the filter runs on the aggregated rows).
df.groupBy("department") \
    .agg(sum("salary").alias("sum_salary"), \
      avg("salary").alias("avg_salary"), \
      sum("bonus").alias("sum_bonus"), \
      max("bonus").alias("max_bonus")) \
    .where(col("sum_bonus") >= 50000) \
    .show(truncate=False)

In [0]:
# NOTE: this import rebinds the names `sum`, `min` and `max` to the Spark column functions.
from pyspark.sql.functions import avg, col, max, min, sum

# Total salary and total bonus for every (department, state) pair.
(
    df200.groupBy("department", "state")
    .sum("salary", "bonus")
    .show(truncate=False)
)

# Salary statistics per department.
# NOTE(review): the average is aliased as "salary" while its siblings are
# "sum"/"min"/"max" -- possibly meant to be "avg"; confirm before renaming.
(
    df200.groupBy("department")
    .agg(
        sum("salary").alias("sum"),
        avg("salary").alias("salary"),
        min("salary").alias("min"),
        max("salary").alias("max"),
    )
    .show()
)

# Salary and bonus summaries per department, keeping only departments whose
# total bonus is at least 50000.
(
    df200.groupBy("department")
    .agg(
        sum("salary").alias("sum_salary"),
        avg("salary").alias("avg_salary"),
        sum("bonus").alias("bonus_sum"),
        max("bonus").alias("max_bonus"),
    )
    .where(col("bonus_sum") >= 50000)
    .show()
)

+----------+-----+-----------+----------+
|department|state|sum(salary)|sum(bonus)|
+----------+-----+-----------+----------+
|Sales     |NY   |176000     |30000     |
|Sales     |CA   |81000      |23000     |
|Finance   |CA   |189000     |47000     |
|Finance   |NY   |162000     |34000     |
|Marketing |NY   |91000      |21000     |
|Marketing |CA   |80000      |18000     |
+----------+-----+-----------+----------+

+----------+------+-----------------+-----+-----+
|department|   sum|           salary|  min|  max|
+----------+------+-----------------+-----+-----+
|     Sales|257000|85666.66666666667|81000|90000|
|   Finance|351000|          87750.0|79000|99000|
| Marketing|171000|          85500.0|80000|91000|
+----------+------+-----------------+-----+-----+

+----------+----------+-----------------+---------+---------+
|department|sum_salary|       avg_salary|bonus_sum|max_bonus|
+----------+----------+-----------------+---------+---------+
|     Sales|    257000|85666.66666666667| 