# GroupBy and Aggregate Functions in PySpark

In [1]:
import pandas as pd
# Read the salary CSV with pandas for a quick look at the raw data.
data = pd.read_csv("salary.csv")
# head() displays only the first five rows, so the table below is a partial view.
data.head()

Unnamed: 0,Name,Dept,Sal
0,A,D1,10000
1,B,D3,5000
2,C,D1,4000
3,C,D2,4000
4,D,D2,20000


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session for this notebook under the application name "Agg".
# getOrCreate() hands back an already-active session when one exists instead
# of starting a second one.
spark = (
    SparkSession.builder
    .appName("Agg")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [4]:
# Load the salary CSV into a Spark DataFrame. The first line of the file is
# used as the header row, and column types are inferred from the data rather
# than every column being read as a string.
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("salary.csv")
)
# Print the loaded rows as a text table.
df_pyspark.show()

+----+----+-----+
|Name|Dept|  Sal|
+----+----+-----+
|   A|  D1|10000|
|   B|  D3| 5000|
|   C|  D1| 4000|
|   C|  D2| 4000|
|   D|  D2|20000|
|   C|  D1| 3000|
|   B|  D2| 1000|
|   A|  D1| 2000|
|   C|  D3| 7000|
+----+----+-----+



In [5]:
# Print each column's name, inferred type, and nullability as a tree.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Dept: string (nullable = true)
 |-- Sal: integer (nullable = true)



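**A GroupBy operation is always combined with an aggregation.** `groupBy` on its own only defines the groups; an aggregate such as `sum`, `mean`, or `count` must follow it to produce a DataFrame that can be shown.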

In [6]:
# Total salary per employee name. sum() with no column arguments aggregates
# every numeric column of the DataFrame; Sal is the only numeric column here.
df_pyspark.groupBy("Name").sum().show()

+----+--------+
|Name|sum(Sal)|
+----+--------+
|   B|    6000|
|   D|   20000|
|   C|   18000|
|   A|   12000|
+----+--------+



In [7]:
# Total salary per department (again summing the only numeric column, Sal).
df_pyspark.groupBy("Dept").sum().show()

+----+--------+
|Dept|sum(Sal)|
+----+--------+
|  D1|   19000|
|  D3|   12000|
|  D2|   25000|
+----+--------+



In [8]:
# Average salary per department. Spark labels the result column avg(Sal)
# even though the method called is mean().
df_pyspark.groupBy("Dept").mean().show()

+----+-----------------+
|Dept|         avg(Sal)|
+----+-----------------+
|  D1|           4750.0|
|  D3|           6000.0|
|  D2|8333.333333333334|
+----+-----------------+



In [9]:
# Number of rows in each department; the result column is named "count".
df_pyspark.groupBy("Dept").count().show()

+----+-----+
|Dept|count|
+----+-----+
|  D1|    4|
|  D3|    2|
|  D2|    3|
+----+-----+



In [11]:
# Aggregate over the whole DataFrame without grouping. The dict maps a column
# name to the name of the aggregate function to apply, so this yields a single
# row holding the overall salary total.
df_pyspark.agg({"Sal":"sum"}).show()

+--------+
|sum(Sal)|
+--------+
|   56000|
+--------+

