## Aggregation Transformations

In [0]:
%run "../includes/configuration"

In [0]:
# Read the race_results parquet data under presentation_folder_path and keep
# only the 2020 rows; every later cell that starts from race_results_df
# therefore works on 2020 data only.
race_results_df = (
    spark.read
    .parquet(f"{presentation_folder_path}/race_results")
    .filter("race_year == 2020")
)

In [0]:
from pyspark.sql.functions import count, countDistinct, sum, desc, rank
from pyspark.sql.window import Window

In [0]:
# Number of rows in the filtered race results.
(
    race_results_df
    .select(count("*"))
    .show()
)

+--------+
|count(1)|
+--------+
|     340|
+--------+



In [0]:
# Number of distinct race names in the filtered race results.
(
    race_results_df
    .select(countDistinct("race_name"))
    .show()
)

+-------------------------+
|count(DISTINCT race_name)|
+-------------------------+
|                       17|
+-------------------------+



In [0]:
# Sum of the points column over all rows (sum here is the
# pyspark.sql.functions aggregate imported above, not the Python builtin).
(
    race_results_df
    .select(sum("points"))
    .show()
)

+-----------+
|sum(points)|
+-----------+
|     1734.0|
+-----------+



In [0]:
# Season summary for a single driver.
# The aggregates are named with alias() where they are created instead of
# renaming Spark's auto-generated column names afterwards: withColumnRenamed()
# silently does nothing when the name it is given does not exist, and the
# generated name is not stable (the grouped aggregation further down reports
# the same expression as "count(race_name)", not "count(DISTINCT race_name)").
# NOTE(review): "races_won" is the number of distinct races the driver has a
# row for, not a count of victories; the name is kept so the output header
# stays the same -- consider renaming it.
race_results_df.filter("driver_name == 'Lewis Hamilton'") \
    .select(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("races_won"),
    ) \
    .show()

+------------+---------+
|total_points|races_won|
+------------+---------+
|       347.0|       16|
+------------+---------+



## GroupBy

In [0]:
# Points total and distinct race count for each driver.
# show() prints at most the first 20 rows by default.
(
    race_results_df
    .groupBy("driver_name")
    .agg(sum("points"), countDistinct("race_name"))
    .show()
)

+------------------+-----------+----------------+
|       driver_name|sum(points)|count(race_name)|
+------------------+-----------+----------------+
|       Jack Aitken|        0.0|               1|
|      Daniil Kvyat|       32.0|              17|
|   Kevin Magnussen|        1.0|              17|
|      Sergio Pérez|      125.0|              15|
|      Carlos Sainz|      105.0|              17|
|    Kimi Räikkönen|        4.0|              17|
|   Romain Grosjean|        2.0|              15|
|   Charles Leclerc|       98.0|              17|
|   Alexander Albon|      105.0|              17|
|      Lance Stroll|       75.0|              16|
|      Pierre Gasly|       75.0|              17|
|    Lewis Hamilton|      347.0|              16|
|   Nico Hülkenberg|       10.0|               3|
|  Daniel Ricciardo|      119.0|              17|
|   Valtteri Bottas|      223.0|              17|
|Antonio Giovinazzi|        4.0|              17|
|      Lando Norris|       97.0|              17|


## Window Functions

In [0]:
# race_results_df was narrowed to race_year == 2020 when it was loaded, so
# filtering it for 2019 can never match any row. Read the race_results data
# again so that both seasons reach the window-function demo below.
demo_df = spark.read.parquet(f"{presentation_folder_path}/race_results") \
    .filter("race_year in (2019, 2020)")

In [0]:
# Optional: uncomment to preview the rows that feed the window-function demo.
# display(demo_df)

In [0]:
# Per (race_year, driver_name): total points and number of distinct races.
demo_grouped_df = (
    demo_df
    .groupBy("race_year", "driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_races"),
    )
)
display(demo_grouped_df)

race_year,driver_name,total_points,number_races
2020,Daniil Kvyat,32.0,17
2020,Kevin Magnussen,1.0,17
2020,Antonio Giovinazzi,4.0,17
2020,Nico Hülkenberg,10.0,3
2020,Romain Grosjean,2.0,15
2020,Charles Leclerc,98.0,17
2020,Esteban Ocon,62.0,17
2020,Pietro Fittipaldi,0.0,2
2020,Sebastian Vettel,33.0,17
2020,Daniel Ricciardo,119.0,17


In [0]:
# Window covering one race_year at a time, ordered by points total, highest first.
driverRankSpec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"))
)

# rank() gives equal totals the same position and skips the next one
# (e.g. 6, 6, 8 in the output below).
(
    demo_grouped_df
    .withColumn("rank", rank().over(driverRankSpec))
    .show()
)

+---------+------------------+------------+------------+----+
|race_year|       driver_name|total_points|number_races|rank|
+---------+------------------+------------+------------+----+
|     2020|    Lewis Hamilton|       347.0|          16|   1|
|     2020|   Valtteri Bottas|       223.0|          17|   2|
|     2020|    Max Verstappen|       214.0|          17|   3|
|     2020|      Sergio Pérez|       125.0|          15|   4|
|     2020|  Daniel Ricciardo|       119.0|          17|   5|
|     2020|      Carlos Sainz|       105.0|          17|   6|
|     2020|   Alexander Albon|       105.0|          17|   6|
|     2020|   Charles Leclerc|        98.0|          17|   8|
|     2020|      Lando Norris|        97.0|          17|   9|
|     2020|      Pierre Gasly|        75.0|          17|  10|
|     2020|      Lance Stroll|        75.0|          16|  10|
|     2020|      Esteban Ocon|        62.0|          17|  12|
|     2020|  Sebastian Vettel|        33.0|          17|  13|
|     20