## Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Create SparkSession object

In [2]:
spark = (SparkSession.builder
         .appName("aggregations-in-pyspark")
         .master("spark://spark-master:7077")
         .config("spark.executor.memory", "512m")
         .getOrCreate()
        )

spark.sparkContext.setLogLevel("ERROR")

    
    

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/17 12:14:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the data

In [3]:
df = (spark.read.format("csv")
      .option("header","true")
      .option("nullValue", "null")
      .option("dateFormat", "LLLL d, y")
      .load("../../data/netflix_titles.csv")
     )


In [4]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [5]:
df.select(F.col("country")).show(truncate=False)

+---------------------------------------------------------------------+
|country                                                              |
+---------------------------------------------------------------------+
|United States                                                        |
|South Africa                                                         |
|null                                                                 |
|null                                                                 |
|India                                                                |
|null                                                                 |
|null                                                                 |
|United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia|
|United Kingdom                                                       |
|United States                                                        |
|null                                                           

In [6]:
grouped_df = df.groupBy("country")

In [7]:
count_df = grouped_df.count()

count_df.show(truncate=False)

+----------------------------------------------+-----+
|country                                       |count|
+----------------------------------------------+-----+
|Peru, United States, United Kingdom           |1    |
|India, United Kingdom, France, Qatar          |1    |
|Japan, Canada, United States                  |1    |
|United Kingdom, China                         |1    |
|India, Germany                                |2    |
|South Africa, United States, Germany          |1    |
|Russia                                        |15   |
|United Kingdom, Germany, United States, France|1    |
|Chile, United States, France                  |1    |
|South Africa, Angola                          |1    |
|United States, Poland                         |1    |
|Philippines, Qatar                            |1    |
|Hong Kong, China, Singapore                   |1    |
|Germany, Sri Lanka                            |1    |
|Denmark, France, United States, Sweden        |1    |
|United St

## Using Split and Explode and get one row per country

In [8]:


# 1. Turn “Peru, United States, …” into ["Peru", " United States", …]
# 2. Explode into one row per country
# 3. Trim leading/trailing spaces
exploded_df = (
    df
    .withColumn("country", F.explode(F.split(F.col("country"), ",")))
    .withColumn("country", F.trim(F.col("country")))
)



In [9]:
exploded_df.show(5)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s5|TV Show|        Kota Factory|           null|Mayur More, Jiten...|        India|Septem

## Aggregation- Count of movies in each country

In [10]:
# 4. Now you can groupBy and aggregate. For example, count how many rows mention each country:
grouped_df_final = (
    exploded_df
    .groupBy("country")
    .count()
    .orderBy(F.desc("count"))
)

grouped_df_final.show()

+--------------+-----+
|       country|count|
+--------------+-----+
| United States| 3676|
|         India| 1046|
|United Kingdom|  805|
|        Canada|  445|
|        France|  392|
|         Japan|  318|
|   South Korea|  231|
|         Spain|  230|
|       Germany|  224|
|        Mexico|  169|
|         China|  162|
|     Australia|  160|
|         Egypt|  117|
|        Turkey|  113|
|     Hong Kong|  105|
|       Nigeria|  101|
|         Italy|  100|
|        Brazil|   97|
|     Argentina|   91|
|       Belgium|   90|
+--------------+-----+
only showing top 20 rows



## Unique values of date_added column

In [11]:
exploded_df.select(F.col("date_added")).distinct().show()

+------------------+
|        date_added|
+------------------+
|      May 21, 2021|
|     March 2, 2021|
|September 23, 2020|
| September 8, 2020|
|    April 14, 2020|
| December 30, 2019|
|   August 12, 2019|
|     June 22, 2019|
|      May 30, 2017|
|    April 29, 2016|
|    March 25, 2016|
|  October 27, 2015|
|   January 1, 2008|
|     March 2, 2017|
|  October 31, 2015|
|     June 23, 2021|
|  November 1, 2020|
|  February 9, 2020|
| November 28, 2019|
|   October 5, 2019|
+------------------+
only showing top 20 rows



In [12]:
exploded_df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = false)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [13]:
# convert string to date format

df_temp = exploded_df.withColumn("date_added", F.to_date(F.col("date_added"), "LLLL d, yyyy"))

In [14]:
# convert the date format to the desired format

df_date_formatted = df_temp.withColumn("date_added", F.date_format(F.col("date_added"), "yyyy-MM-dd"))

In [15]:
df_date_formatted.select(F.col("date_added")).orderBy("date_added", ascending=False).show()

+----------+
|date_added|
+----------+
|2021-09-25|
|2021-09-24|
|2021-09-24|
|2021-09-24|
|2021-09-24|
|2021-09-24|
|2021-09-24|
|2021-09-24|
|2021-09-24|
|2021-09-24|
|2021-09-24|
|2021-09-23|
|2021-09-23|
|2021-09-22|
|2021-09-22|
|2021-09-22|
|2021-09-21|
|2021-09-21|
|2021-09-20|
|2021-09-19|
+----------+
only showing top 20 rows



In [16]:
df_date_formatted.select(F.col("date_added")).orderBy("date_added", ascending=True).show()

+----------+
|date_added|
+----------+
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
+----------+
only showing top 20 rows



## Getting null value rows in date added column

In [17]:
bad_count = df_date_formatted.filter(F.col("date_added").isNull()).count()
print(f"Bad / non‑date rows: {bad_count}")

Bad / non‑date rows: 126


In [18]:

# Keep only the non‑null dates
valid_df = df_date_formatted.filter(F.col("date_added").isNotNull())

valid_count = print(f" Valid rows: {valid_df.count()}")

 Valid rows: 9890


In [19]:
# Then sort them ascending by your parsed date column
valid_df.select(F.col("date_added")).orderBy(F.col("date_added").asc()).show()

+----------+
|date_added|
+----------+
|2008-01-01|
|2008-02-04|
|2009-05-05|
|2009-11-18|
|2010-11-01|
|2011-05-17|
|2011-05-17|
|2011-05-17|
|2011-09-27|
|2011-09-27|
|2011-09-27|
|2011-10-01|
|2011-10-01|
|2011-10-01|
|2011-10-01|
|2011-10-01|
|2011-10-01|
|2011-10-01|
|2011-10-01|
|2011-10-01|
+----------+
only showing top 20 rows



In [20]:
valid_df.show(5)

+-------+-------+--------------------+---------------+--------------------+-------------+----------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+----------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|2021-09-25|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|2021-09-24|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s5|TV Show|        Kota Factory|           null|Mayur More, Jiten...|        India|2021-09-24|        2021| TV-MA|2 Seasons|Inter

### Applying custom aggregation functions using the agg function

In [21]:
max_release_df = valid_df.groupBy("country").agg(F.max(F.col("date_added")).alias("Latest_release_date"))

max_release_df.show()

+------------+-------------------+
|     country|Latest_release_date|
+------------+-------------------+
|            |         2021-08-27|
| Afghanistan|         2017-01-01|
|     Albania|         2018-05-04|
|     Algeria|         2021-07-30|
|      Angola|         2020-08-28|
|   Argentina|         2021-09-01|
|     Armenia|         2017-08-15|
|   Australia|         2021-09-21|
|     Austria|         2021-05-14|
|  Azerbaijan|         2016-12-26|
|     Bahamas|         2020-08-31|
|  Bangladesh|         2021-02-05|
|     Belarus|         2020-07-15|
|     Belgium|         2021-09-08|
|     Bermuda|         2014-08-15|
|    Botswana|         2020-05-28|
|      Brazil|         2021-08-13|
|    Bulgaria|         2021-08-01|
|Burkina Faso|         2021-09-24|
|    Cambodia|         2021-02-08|
+------------+-------------------+
only showing top 20 rows



## Applying multiple aggregations

In [22]:
release_date_grouped_df = (valid_df.groupBy("country")
                           .agg(
                                F.count("show_id").alias("NumberOfReleases"),
                                F.max("date_added").alias("LastReleaseDate"),
                                F.min("date_added").alias("FirstReleaseDate")
                           )
                          .orderBy(F.desc("NumberOfReleases"))
                          )

release_date_grouped_df.show()       
                                

+--------------+----------------+---------------+----------------+
|       country|NumberOfReleases|LastReleaseDate|FirstReleaseDate|
+--------------+----------------+---------------+----------------+
| United States|            3629|     2021-09-25|      2008-01-01|
|         India|            1045|     2021-09-24|      2016-04-22|
|United Kingdom|             786|     2021-09-24|      2011-09-27|
|        Canada|             432|     2021-09-16|      2013-11-01|
|        France|             388|     2021-09-19|      2011-05-17|
|         Japan|             314|     2021-09-16|      2015-12-01|
|   South Korea|             226|     2021-09-06|      2016-05-23|
|         Spain|             226|     2021-09-09|      2011-05-17|
|       Germany|             223|     2021-09-24|      2015-07-15|
|        Mexico|             169|     2021-09-22|      2011-05-17|
|         China|             162|     2021-09-16|      2015-07-15|
|     Australia|             157|     2021-09-21|      2015-02

In [None]:
spark.stop()