In [None]:
# Display the active SparkSession. `spark` is not created anywhere in this
# notebook, so it is presumably injected by the environment (e.g. a PySpark
# kernel) — TODO confirm.
spark

# Data Frame GroupBy

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [None]:
# Sample employee records; each tuple is
# (employee_name, department, state, salary, age, bonus).
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Column names, listed in the same order as the fields of each tuple above.
columns = ["employee_name", "department", "state", "salary", "age", "bonus"]

# Only column names are supplied, so Spark infers each column's type from the data.
df = spark.createDataFrame(simpleData, schema=columns)

In [None]:
# Print the schema Spark inferred for `df` (column names and types).
df.printSchema()

In [None]:
# Preview the rows. truncate=True shortens long string values in the output.
df.show(truncate=True)

To aggregate a DataFrame, use `groupBy()`. `groupBy` has an alias, `groupby`; use whichever you prefer.

In [None]:
# camelCase spelling; nothing is computed until an aggregation is applied.
df.groupBy("department")

In [None]:
# Lowercase alias of `groupBy`; behaves the same.
df.groupby("department")

Notice that the return type of `groupBy` is `GroupedData`, not `DataFrame`.

In [None]:
# Inspect the class of the object returned by `groupBy`.
type(df.groupBy("department"))

We can use aggregate methods of `GroupedData` instance directly.

In [None]:
# Total salary per department, using the shortcut method on GroupedData.
(
    df.groupBy("department")
    .sum("salary")
    .show(truncate=False)
)

In [None]:
# Number of rows (employees) in each department.
(
    df.groupBy("department")
    .count()
    .show(truncate=False)
)

The aggregate methods of `GroupedData` accept multiple column names, so several columns can be aggregated in a single call.

In [None]:
# Group on the (department, state) pair and total two columns in one call.
df.groupBy("department", "state").sum("salary", "bonus").show(truncate=False)

To alias an aggregate column, use the aggregate functions in `pyspark.sql.functions` and chain them with `alias()`.

In [None]:
# Several aggregations per department, each given a readable column name.
(
    df.groupBy("department")
    .agg(
        F.sum("salary").alias("sum_salary"),
        F.avg("salary").alias("avg_salary"),
        F.sum("bonus").alias("sum_bonus"),
        F.avg("bonus").alias("avg_bonus"),
    )
    .show(truncate=False)
)

## GroupBy and Where

Calling `where` after a `groupBy` aggregation filters the aggregated rows, not the input rows (the equivalent of SQL `HAVING`).

In [None]:
# Aggregate per department, then keep only the departments whose total bonus
# is at least 50,000. The filter refers to the aliased aggregate column, so it
# runs on the aggregated result.
(
    df.groupBy("department")
    .agg(
        F.sum("salary").alias("sum_salary"),
        F.avg("salary").alias("avg_salary"),
        F.sum("bonus").alias("sum_bonus"),
        F.max("bonus").alias("max_bonus"),
    )
    .where(F.col("sum_bonus") >= 50_000)
    .show(truncate=False)
)

## GroupBy and Sort

The result of an aggregation is an ordinary DataFrame, so we can call `sort()` on it to order the rows by an aggregate column.

In [None]:
# Total salary per state, kept as a DataFrame for the cells that follow.
dfGroup = df.groupBy("state").agg(F.sum("salary").alias("sum_salary"))

In [None]:
# Show the per-state salary totals without truncating cell values.
dfGroup.show(truncate=False)

In [None]:
# Keep only the states whose total salary exceeds 100,000.
# With the sample data both NY and CA are above the threshold, so no row is
# dropped here.
dfFilter = dfGroup.filter(dfGroup["sum_salary"] > 100_000)
dfFilter.show(truncate=False)

In [None]:
# Sort by the aggregate column; `sort` orders ascending by default.
(
    dfFilter
    .sort("sum_salary")
    .show(truncate=False)
)

In [None]:
# Same sort, highest total first.
(
    dfFilter
    .sort(F.col("sum_salary").desc())
    .show(truncate=False)
)

In summary, all of these operations on the grouped DataFrame can be chained into a single statement, as shown below.

In [None]:
# Group, aggregate, filter and sort in one chained expression.
(
    df.groupBy("state")
    .agg(F.sum("salary").alias("sum_salary"))
    .filter(F.col("sum_salary") > 100_000)
    .sort(F.col("sum_salary").desc())
    .show(truncate=False)
)

## More about aggregate functions

### approx_count_distinct

In [None]:
# Approximate number of distinct salary values across the whole DataFrame.
(
    df.select(F.approx_count_distinct("salary"))
    .show(truncate=False)
)

### collect_list

In [None]:
# Gather every salary value (duplicates included) into a single array column.
(
    df.select(F.collect_list("salary").alias("salaries"))
    .show(truncate=False)
)

In [None]:
# Same idea per group: one array of salaries for each state.
(
    df.groupBy("state")
    .agg(F.collect_list("salary").alias("salaries"))
    .show(truncate=False)
)

### collect_set

In [None]:
# Like collect_list, but duplicate salary values are removed.
(
    df.select(F.collect_set("salary"))
    .show(truncate=False)
)

### distinct

In [None]:
# Unique (department, salary) combinations.
(
    df.select("department", "salary")
    .distinct()
    .show(truncate=False)
)

### countDistinct

In [None]:
# Number of distinct (department, salary) combinations.
# `count_distinct` is the snake_case name that the legacy `countDistinct`
# alias forwards to; using it here matches `F.sum_distinct` used later in this
# notebook.
(
    df.select(F.count_distinct("department", "salary"))
    .show(truncate=False)
)

### count

In [None]:
# Count the salary values in the DataFrame.
(
    df.select(F.count("salary"))
    .show(truncate=False)
)

### first

In [None]:
# Show the rows first so the value picked by `first` can be compared by eye.
df.show(truncate=False)

# NOTE(review): `first` depends on row order, which Spark does not guarantee
# in general — the result happens to be stable for this small local dataset.
(
    df.select(F.first("salary"))
    .show(truncate=False)
)

### last

In [None]:
# Show the rows first so the value picked by `last` can be compared by eye.
df.show(truncate=False)

# NOTE(review): `last` depends on row order, which Spark does not guarantee
# in general — the result happens to be stable for this small local dataset.
(
    df.select(F.last("salary"))
    .show(truncate=False)
)

### kurtosis, skewness

In [None]:
# Shape and spread statistics of the salary column, computed in one pass.
df.select(
    F.kurtosis("salary").alias("kurtosis_salary"),
    F.skewness("salary").alias("skewness_salary"),
    F.stddev("salary").alias("stddev_salary"),          # sample standard deviation
    F.stddev_pop("salary").alias("stddev_pop_salary"),  # population standard deviation
    F.variance("salary").alias("variance_salary"),      # sample variance
).show(truncate=False)

### max, min, avg

In [None]:
# Highest, average and lowest salary over the whole DataFrame.
(
    df.select(
        F.max("salary").alias("max_salary"),
        F.avg("salary").alias("avg_salary"),
        F.min("salary").alias("min_salary"),
    )
    .show(truncate=False)
)

### sum, sum_distinct

In [None]:
# Plain total vs. total of the distinct values only.
# The salary 90000 appears twice in the sample data, so the two results differ.
(
    df.select(
        F.sum("salary"),
        F.sum_distinct("salary"),
    )
    .show(truncate=False)
)