In [0]:
%run "../includes/configuration"

## Aggregate functions demo

### Built-in aggregate functions

In [0]:
# Load the race_results Parquet data located under presentation_folder_path.
# presentation_folder_path is not defined in this notebook; presumably it is
# provided by the configuration notebook pulled in via %run above.
race_results_df = (
    spark.read
    .format("parquet")
    .load(f"{presentation_folder_path}/race_results")
)

In [0]:
# Preview the freshly loaded race results before filtering.
display(race_results_df)

In [0]:
# Restrict the demo data to rows whose race_year equals 2020; the aggregate
# examples that follow all operate on this subset.
demo_df = race_results_df.filter("race_year = 2020")

In [0]:
# Inspect the filtered demo frame.
display(demo_df)

In [0]:
from pyspark.sql.functions import count, countDistinct, sum

In [0]:
# Simple aggregate: count every row in demo_df.
(
    demo_df
    .select(count("*"))
    .show()
)

In [0]:
# Number of distinct race_name values present in demo_df.
(
    demo_df
    .select(countDistinct("race_name"))
    .show()
)

In [0]:
# Total of the points column across demo_df.
# `sum` here is pyspark.sql.functions.sum (imported above), which shadows
# Python's built-in sum for the rest of the notebook.
(
    demo_df
    .select(sum("points"))
    .show()
)

In [0]:
# Summary for a single driver in demo_df: total points and the number of
# distinct races.
# The output columns are named with alias() at the point of aggregation.
# Renaming afterwards with withColumnRenamed relies on Spark's auto-generated
# column names ("sum(points)", "count(DISTINCT race_name)") and silently does
# nothing when the generated name does not match exactly.
(
    demo_df
    .where("driver_name = 'Lewis Hamilton'")
    .select(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .show()
)

In [0]:
# Per-driver totals: group by driver_name, aggregate the points and the
# distinct race count, then list the highest total_points first.
(
    demo_df
    .groupBy("driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .sort("total_points", ascending=False)
    .show()
)

## Window functions

In [0]:
# Widen the demo data to two race years (2019 and 2020) for the window examples.
# NOTE: this rebinds demo_df, which earlier held only the 2020 rows, so
# re-running the aggregate cells above after this one will cover both years.
demo_df = race_results_df.filter("race_year in (2019, 2020)")

In [0]:
# Inspect the two-year demo frame.
display(demo_df)

In [0]:
# Total points and distinct-race count per (race_year, driver_name), with the
# highest totals first. This frame is the input to the ranking window below.
demo_grouped_df = (
    demo_df
    .groupBy("race_year", "driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .orderBy("total_points", ascending=False)
)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

# Window definition: one partition per race_year, rows ordered by
# total_points from highest to lowest.
driverRankSpec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"))
)

# Attach each driver's rank within their race_year and print the result.
# rank() assigns equal ranks to ties and leaves a gap after them.
(
    demo_grouped_df
    .withColumn("driver_rank", rank().over(driverRankSpec))
    .show()
)