## Aggregate Functions
Let’s create a DataFrame to work with PySpark aggregate functions.

In [82]:
from pyspark.sql import SparkSession
# NOTE: sum, max, min and round shadow the Python builtins of the same name
# for the rest of the notebook.
from pyspark.sql.functions import (
    approx_count_distinct,
    avg,
    collect_list,
    collect_set,
    count,
    countDistinct,
    first,
    kurtosis,
    last,
    max,
    mean,
    min,
    round,
    skewness,
    stddev,
    stddev_pop,
    stddev_samp,
    sum,
    sumDistinct,
    var_pop,
    var_samp,
    variance,
)

# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("aggregate")
    .getOrCreate()
)

# Sample rows of (employee_name, department, salary). Several rows share the
# same salary, and "James" in "Sales" appears twice with different salaries.
simpleData = [
    ("Greg", "Sales", 13000),
    ("Michael", "Sales", 14600),
    ("Robert", "Sales", 14100),
    ("Martha", "HR", 13000),
    ("James", "Sales", 13000),
    ("Scott", "Finance", 13300),
    ("Jen", "Marketing", 13900),
    ("Jeff", "Marketing", 13000),
    ("Manny", "Finance", 12000),
    ("James", "Sales", 14100),
]
schema = ["employee_name", "department", "salary"]

# Only column names are supplied, so the column types come from the row values.
df = spark.createDataFrame(simpleData, schema)

df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Greg         |Sales     |13000 |
|Michael      |Sales     |14600 |
|Robert       |Sales     |14100 |
|Martha       |HR        |13000 |
|James        |Sales     |13000 |
|Scott        |Finance   |13300 |
|Jen          |Marketing |13900 |
|Jeff         |Marketing |13000 |
|Manny        |Finance   |12000 |
|James        |Sales     |14100 |
+-------------+----------+------+



### approx_count_distinct()
`approx_count_distinct()` function returns the approximate count of distinct items in a group.

In [83]:
# The aggregate yields one row with one column; pull out that scalar.
approx_distinct_salaries = df.select(approx_count_distinct("salary")).collect()[0][0]
print(f"approx_count_distinct:  {approx_distinct_salaries}")

approx_count_distinct:  6


### avg (average)
`avg()` function returns the average of values in the input column

In [84]:
# Single-row, single-column result: take the row first, then its only value.
avg_salary_row = df.select(avg("salary")).collect()[0]
print(f"avg: {avg_salary_row[0]}")

avg: 13400.0


### collect_list
`collect_list()` function returns all values from an input column with duplicates.

In [85]:
# Gather every salary, duplicates included, into one array-valued column.
salary_list_df = df.select(collect_list("salary"))
salary_list_df.show(truncate=False)

+----------------------------------------------------------------------+
|collect_list(salary)                                                  |
+----------------------------------------------------------------------+
|[13000, 14600, 14100, 13000, 13000, 13300, 13900, 13000, 12000, 14100]|
+----------------------------------------------------------------------+



### collect_set
`collect_set()` function returns all values from an input column with duplicate values eliminated.

In [86]:
# Same as collect_list, but duplicate salaries are dropped.
(
    df.select(collect_set("salary"))
    .show(truncate=False)
)

+------------------------------------------+
|collect_set(salary)                       |
+------------------------------------------+
|[12000, 13300, 14100, 13000, 14600, 13900]|
+------------------------------------------+



### countDistinct
`countDistinct()` function returns the number of distinct elements in a column, or the number of distinct combinations when several columns are given.

In [87]:
# Count the distinct (department, salary) pairs.
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)

# The result has one row and one column; extract the scalar for the message.
distinct_pair_count = df2.collect()[0][0]
print(f"Distinct Count of Department & Salary: {distinct_pair_count}")

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+

Distinct Count of Department & Salary: 8


### count
`count()` function returns the number of non-null elements in a column.

In [88]:
# Index into the single result row so only the scalar count is printed,
# matching how the approx_count_distinct and avg cells report their values.
print(f"count: {df.select(count('salary')).collect()[0][0]}")

count: 10


### first
`first()` function returns the first element in a column. When `ignorenulls` is set to `True`, it returns the first non-null element.

In [89]:
# First salary value encountered in the DataFrame.
first_salary_df = df.select(first("salary"))
first_salary_df.show(truncate=False)

+-------------+
|first(salary)|
+-------------+
|13000        |
+-------------+



### last 
`last()` function returns the last element in a column. When `ignorenulls` is set to `True`, it returns the last non-null element.

In [90]:
# Last salary value encountered in the DataFrame.
last_salary_df = df.select(last("salary"))
last_salary_df.show(truncate=False)

+------------+
|last(salary)|
+------------+
|14100       |
+------------+



### kurtosis 
`kurtosis()` function returns the kurtosis of the values in a group.

In [91]:
# Kurtosis of the salary column over the whole DataFrame.
(
    df.select(kurtosis("salary"))
    .show(truncate=False)
)

+-------------------+
|kurtosis(salary)   |
+-------------------+
|-0.6467803030303032|
+-------------------+



### max 
`max()` function returns the maximum value in a column.

In [92]:
# `max` here is pyspark.sql.functions.max, not the Python builtin.
max_salary_df = df.select(max("salary"))
max_salary_df.show(truncate=False)

+-----------+
|max(salary)|
+-----------+
|14600      |
+-----------+



### min 
`min()` function returns the minimum value in a column.

In [93]:
# `min` here is pyspark.sql.functions.min, not the Python builtin.
min_salary_df = df.select(min("salary"))
min_salary_df.show(truncate=False)

+-----------+
|min(salary)|
+-----------+
|12000      |
+-----------+



### mean 
`mean()` function returns the average of the values in a column. It is an alias for `avg()`.

In [94]:
# mean is an alias of avg, so the result column is labelled avg(salary).
mean_salary_df = df.select(mean("salary"))
mean_salary_df.show(truncate=False)

+-----------+
|avg(salary)|
+-----------+
|13400.0    |
+-----------+



### skewness 
`skewness()` function returns the skewness of the values in a group.

In [95]:
# Skewness of the salary column over the whole DataFrame.
(
    df.select(skewness("salary"))
    .show(truncate=False)
)

+--------------------+
|skewness(salary)    |
+--------------------+
|-0.12041791181069571|
+--------------------+



### stddev(), stddev_samp() and stddev_pop()
`stddev()` is an alias for `stddev_samp()`.

`stddev_samp()` function returns the sample standard deviation of values in a column.

`stddev_pop()` function returns the population standard deviation of the values in a column

In [96]:
# Sample (stddev / stddev_samp) and population (stddev_pop) standard
# deviation of salary, side by side for comparison.
salary_stddev_df = df.select(
    stddev("salary"),
    stddev_samp("salary"),
    stddev_pop("salary"),
)
salary_stddev_df.show(truncate=False)

+-----------------+-------------------+------------------+
|stddev(salary)   |stddev_samp(salary)|stddev_pop(salary)|
+-----------------+-------------------+------------------+
|765.9416862050705|765.9416862050705  |726.636084983398  |
+-----------------+-------------------+------------------+



### sum 
`sum()` function returns the sum of all values in a column.

In [97]:
# `sum` here is pyspark.sql.functions.sum, not the Python builtin.
total_salary_df = df.select(sum("salary"))
total_salary_df.show(truncate=False)

+-----------+
|sum(salary)|
+-----------+
|134000     |
+-----------+



### sumDistinct
`sumDistinct()` function returns the sum of all distinct values in a column.

In [98]:
# Sum each distinct salary value once.
# NOTE(review): sumDistinct is documented as deprecated since Spark 3.2 in
# favour of sum_distinct -- consider migrating once the target Spark version
# is confirmed.
distinct_salary_sum_df = df.select(sumDistinct("salary"))
distinct_salary_sum_df.show(truncate=False)



+--------------------+
|sum(DISTINCT salary)|
+--------------------+
|80900               |
+--------------------+



### variance(), var_samp(), var_pop()
`variance()` is an alias for `var_samp()`.

`var_samp()` function returns the unbiased variance of the values in a column.

`var_pop()` function returns the population variance of the values in a column.



In [99]:
# `round` is pyspark.sql.functions.round (not the Python builtin); it rounds
# the aggregated column values to 2 decimal places. var_pop is left unrounded.
salary_variance_df = df.select(
    round(variance("salary"), 2).alias("salary_variance"),
    round(var_samp("salary"), 2).alias("unbiased_var_salary"),
    var_pop("salary"),
)
salary_variance_df.show(truncate=False)

+---------------+-------------------+---------------+
|salary_variance|unbiased_var_salary|var_pop(salary)|
+---------------+-------------------+---------------+
|586666.67      |586666.67          |528000.0       |
+---------------+-------------------+---------------+

