In [0]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below; getOrCreate() hands back
# an already-active session when there is one instead of building a second.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Location of the raw airline CSV used as the baseline for the format comparison.
csv_path = "/databricks-datasets/airlines/part-00000"

# Read the CSV with the first line as column names and with type inference on.
# NOTE(review): the schema printed later in this notebook reports columns such
# as DepTime, ArrDelay and Distance as string; presumably a non-numeric
# missing-value marker defeats inference. Confirm, then consider `nullValue`
# or an explicit schema.
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_path)
)

# Preview the first five rows.
df_csv.show(n=5)


+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|1987|   10|    

In [0]:
import time

# Time a full row count over the CSV-backed DataFrame.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted, which would corrupt the measurement.
start = time.perf_counter()
df_csv.count()
print("Time to count CSV:", time.perf_counter() - start)


Time to count CSV: 3.4063758850097656


In [0]:
# Destination directory for the unpartitioned Parquet copy of the CSV data.
parquet_path = "/tmp/airlines_parquet"

# Persist df_csv as Parquet, replacing whatever a previous run left behind.
df_csv.write.format("parquet").mode("overwrite").save(parquet_path)

# Load the Parquet copy back and preview the first five rows.
df_parquet = spark.read.format("parquet").load(parquet_path)
df_parquet.show(n=5)


+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|1987|   10|    

In [0]:
# Time a full row count over the Parquet-backed DataFrame, for comparison with
# the CSV count timing.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted, which would corrupt the measurement.
start = time.perf_counter()
df_parquet.count()
print("Time to count Parquet:", time.perf_counter() - start)


Time to count Parquet: 0.6084728240966797


In [0]:
# Destination directory for the Parquet copy that is partitioned by Year.
partitioned_path = "/tmp/airlines_partitioned"

# Write df_csv as Parquet partitioned on the Year column, replacing any
# output left by a previous run.
(
    df_csv.write
    .format("parquet")
    .partitionBy("Year")
    .mode("overwrite")
    .save(partitioned_path)
)

# Read the partitioned dataset back and print its schema.
df_partitioned = spark.read.format("parquet").load(partitioned_path)
df_partitioned.printSchema()


root
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- CarrierDelay: string (nullable = true)
 

In [0]:
from pyspark.sql.functions import col

# Time a count restricted to a single value of the partition column (Year).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted, which would corrupt the measurement.
# NOTE(review): the df_csv.show(5) preview in this notebook displays rows with
# Year 1987; confirm that a Year=2008 partition actually exists, otherwise this
# measures a scan that matches no partition at all.
start = time.perf_counter()
df_partitioned.filter(col("Year") == 2008).count()
print("Time with Partition Pruning:", time.perf_counter() - start)


Time with Partition Pruning: 0.6094934940338135


In [0]:
# Print the extended plan (parsed, analyzed, optimized logical plans plus the
# physical plan) for the Year filter so the handling of the predicate on the
# partition column can be inspected.
df_partitioned.where(col("Year") == 2008).explain(extended=True)

== Parsed Logical Plan ==
'Filter '`==`('Year, 2008)
+- Relation [Month#910,DayofMonth#911,DayOfWeek#912,DepTime#913,CRSDepTime#914,ArrTime#915,CRSArrTime#916,UniqueCarrier#917,FlightNum#918,TailNum#919,ActualElapsedTime#920,CRSElapsedTime#921,AirTime#922,ArrDelay#923,DepDelay#924,Origin#925,Dest#926,Distance#927,TaxiIn#928,TaxiOut#929,Cancelled#930,CancellationCode#931,Diverted#932,CarrierDelay#933,... 7 more fields] parquet

== Analyzed Logical Plan ==
Month: int, DayofMonth: int, DayOfWeek: int, DepTime: string, CRSDepTime: int, ArrTime: string, CRSArrTime: int, UniqueCarrier: string, FlightNum: int, TailNum: string, ActualElapsedTime: string, CRSElapsedTime: int, AirTime: string, ArrDelay: string, DepDelay: string, Origin: string, Dest: string, Distance: string, TaxiIn: string, TaxiOut: string, Cancelled: int, CancellationCode: string, Diverted: int, CarrierDelay: string, ... 7 more fields
Filter (Year#940 = 2008)
+- Relation [Month#910,DayofMonth#911,DayOfWeek#912,DepTime#913,CRSD