In [None]:
Aggregate functions in PySpark are essential for summarizing data across distributed datasets. They allow computations like 
sum, average, count, maximum, and minimum to be performed efficiently in parallel across multiple nodes in a cluster.


In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct,collect_list
from pyspark.sql.functions import collect_set,sum,avg,max,countDistinct,count
from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness 
from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
from pyspark.sql.functions import variance,var_samp,  var_pop

# Create (or reuse) a Spark session that runs locally with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Only column names are supplied here; Spark infers the column types from the data.
schema = ["employee_name", "department", "salary"]

# Sample rows in (employee_name, department, salary) order.
# Note that ("James", "Sales", 3000) occurs twice and several salaries repeat,
# which is what makes the duplicate-aware aggregates below return different
# results from their plain counterparts.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Build the DataFrame, then display its inferred schema and full contents.
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

# approx_count_distinct() returns an *approximate* number of distinct values in a column.
approx_distinct_salaries = df.select(approx_count_distinct("salary")).collect()[0][0]
print(f"approx_count_distinct: {approx_distinct_salaries}")

# avg() returns the average of the values in the input column.
average_salary = df.select(avg("salary")).collect()[0][0]
print(f"avg: {average_salary}")

# collect_list() gathers all values of the input column into one array, duplicates included.
salary_list_df = df.select(collect_list("salary"))
salary_list_df.show(truncate=False)

# collect_set() gathers the values of the input column into one array with duplicates removed.
salary_set_df = df.select(collect_set("salary"))
salary_set_df.show(truncate=False)

# countDistinct() returns the number of distinct value combinations across the given columns
# (here: distinct (department, salary) pairs).
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
# The label uses a literal "&"; the HTML entity "&amp;" would be printed verbatim.
print("Distinct Count of Department & Salary: "+str(df2.collect()[0][0]))

# count() returns the number of non-null values in a column.
# collect()[0] is a Row object; index it again with [0] so the bare number is printed
# rather than the Row repr (e.g. "Row(count(salary)=10)"), matching the prints above.
print("count: "+str(df.select(count("salary")).collect()[0][0]))

# first() returns the first value in a column; with ignorenulls=True it returns the
# first non-null value instead.
# NOTE: no ordering is applied to df here, so "first" and "last" simply follow the
# row order Spark happens to produce.
first_salary_df = df.select(first("salary"))
first_salary_df.show(truncate=False)

# last() returns the last value in a column; with ignorenulls=True it returns the
# last non-null value instead.
last_salary_df = df.select(last("salary"))
last_salary_df.show(truncate=False)

# kurtosis() returns the kurtosis of the values in a group.
# Skewness vs. kurtosis:
#   - skewness measures the asymmetry of a distribution, i.e. whether the data leans
#     more towards one side than the other;
#   - kurtosis measures the "tailedness" of a distribution, i.e. how concentrated the
#     data is around the mean and how heavy the tails are compared to a normal
#     distribution.
# In short: skewness looks at symmetry, kurtosis looks at peakedness / tail weight.
kurtosis_df = df.select(kurtosis("salary"))
kurtosis_df.show(truncate=False)

# max() / min() return the largest / smallest value in a column. These are the
# pyspark.sql.functions versions imported above, not Python's builtins.
max_salary_df = df.select(max("salary"))
max_salary_df.show(truncate=False)
min_salary_df = df.select(min("salary"))
min_salary_df.show(truncate=False)

# mean() returns the average of the values in a column (alias for avg()).
mean_salary_df = df.select(mean("salary"))
mean_salary_df.show(truncate=False)

# skewness() returns the skewness of the values in a group.
skewness_df = df.select(skewness("salary"))
skewness_df.show(truncate=False)

# Standard deviation, shown side by side:
#   stddev()      - alias for stddev_samp()
#   stddev_samp() - sample standard deviation of the values in a column
#   stddev_pop()  - population standard deviation of the values in a column
stddev_df = df.select(
    stddev("salary"),
    stddev_samp("salary"),
    stddev_pop("salary"),
)
stddev_df.show(truncate=False)

# sum() returns the total of all values in a column (the pyspark.sql.functions
# version imported above, not Python's builtin).
salary_total_df = df.select(sum("salary"))
salary_total_df.show(truncate=False)

# sumDistinct() returns the sum of the distinct values in a column.
# NOTE(review): sumDistinct is deprecated since Spark 3.2 in favour of sum_distinct();
# kept as-is so the cell still runs on older Spark versions.
distinct_salary_total_df = df.select(sumDistinct("salary"))
distinct_salary_total_df.show(truncate=False)

# Variance, shown side by side:
#   variance() - alias for var_samp()
#   var_samp() - unbiased (sample) variance of the values in a column
#   var_pop()  - population variance of the values in a column
variance_df = df.select(
    variance("salary"),
    var_samp("salary"),
    var_pop("salary"),
)
variance_df.show(truncate=False)

# Variance vs. standard deviation:
#   - variance measures how spread out the data is around its mean; it is the average
#     of the squared differences from the mean;
#   - standard deviation is the square root of the variance, so it expresses the
#     typical deviation of data points from the mean in the same units as the data.

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+

approx_count_distinct: 6
avg: 3400.0
+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+

+------------------------------------+
|c



+-----------------+-----------------+---------------+
|var_samp(salary) |var_samp(salary) |var_pop(salary)|
+-----------------+-----------------+---------------+
|586666.6666666666|586666.6666666666|528000.0       |
+-----------------+-----------------+---------------+

