## Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max,min,count,approx_count_distinct
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

## Create SparkSession object

In [2]:
spark = (SparkSession.builder
         .appName("aggregations-in-pyspark")
         .master("spark://spark-master:7077")
         .config("spark.executor.memory", "512m")
         .getOrCreate()
        )

spark.sparkContext.setLogLevel("ERROR")

    
    

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/17 10:56:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the data

In [3]:
df = (spark.read.format("csv")
      .option("header","true")
      .option("nullValue", "null")
      .option("dateFormat", "LLLL d, y")
      .load("../../data/netflix_titles.csv")
     )



In [4]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



## Perform Aggregations

In [5]:
grouped_df = df.groupBy("country")

## Peforming Actions

### Count the number of rows in each group

In [6]:
count_df = grouped_df.count()

count_df.show()

+--------------------+-----+
|             country|count|
+--------------------+-----+
|Peru, United Stat...|    1|
|India, United Kin...|    1|
|Japan, Canada, Un...|    1|
|United Kingdom, C...|    1|
|      India, Germany|    2|
|South Africa, Uni...|    1|
|              Russia|   15|
|United Kingdom, G...|    1|
|Chile, United Sta...|    1|
|South Africa, Angola|    1|
|United States, Po...|    1|
|  Philippines, Qatar|    1|
|Hong Kong, China,...|    1|
|  Germany, Sri Lanka|    1|
|Denmark, France, ...|    1|
|United States, Fr...|    1|
|United Kingdom, N...|    2|
|Australia, United...|    2|
|Brazil, France, G...|    1|
|United States, Ir...|    3|
+--------------------+-----+
only showing top 20 rows



### Applying custom aggregation functions using the agg function

In [7]:
max_release_df = grouped_df.agg(max(col("date_added")))

max_release_df.show()

+--------------------+--------------------+
|             country|     max(date_added)|
+--------------------+--------------------+
|                null|   September 9, 2021|
|     Ama K. Abebrese|  Kobina Amissah Sam|
|         Aziz Ansari|         Carla Gallo|
|            Chuck D.|     Desiree Densiti|
|       Dominic Costa|        Nick Ferraro|
|          Doug Plaut|    Cheyenne Jackson|
|     Francesc Orella|        Imanol Arias|
|  Henri-Noël Tabary"|              France|
|       James Toback"|       United States|
| Justin ""Alyssa ...|         Molly Ryman|
|  Lachion Buckingham|       Chasity Moore|
|  Leonardo Sbaraglia|     Francesc Orella|
|   Michael Cavalieri|     Walton Goggins"|
| Remilekun ""Remi...| Charles  ""Charl...|
|          Rob Morgan|     Shakira Barrera|
|       Sophia Loren"|       United States|
|     Tantoo Cardinal|   Robert Pastorelli|
|      Theo Campbell"|       United States|
| Tobechukwu ""iLL...|          Toni Tones|
| plus Whitney Cum...|          

## Applying multiple aggregations

In [8]:
release_date_grouped_df = (df.groupBy("country")
                           .agg(
                                count("show_id").alias("NumberOfReleases"),
                                max("date_added").alias("LastReleaseDate"),
                                min("date_added").alias("FirstReleaseDate")
                           ))

release_date_grouped_df.show()
                               
                           
                           
                                

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+----------------+--------------------+--------------------+
|             country|NumberOfReleases|     LastReleaseDate|    FirstReleaseDate|
+--------------------+----------------+--------------------+--------------------+
|                null|             830|   September 9, 2021|   December 14, 2018|
|     Ama K. Abebrese|               1|  Kobina Amissah Sam|  Kobina Amissah Sam|
|         Aziz Ansari|               1|         Carla Gallo|         Carla Gallo|
|            Chuck D.|               1|     Desiree Densiti|     Desiree Densiti|
|       Dominic Costa|               1|        Nick Ferraro|        Nick Ferraro|
|          Doug Plaut|               1|    Cheyenne Jackson|    Cheyenne Jackson|
|     Francesc Orella|               1|        Imanol Arias|        Imanol Arias|
|  Henri-Noël Tabary"|               1|              France|              France|
|       James Toback"|               1|       United States|       United States|
| Justin ""Alyss

                                                                                

In [9]:
spark.stop()