In [0]:
# Import batch data from a .csv file into a Spark DataFrame
from pyspark.sql.functions import col

# Declare the schema explicitly instead of relying on inferSchema: inference needs an
# extra pass over the file and lets the column types depend on the data that happens
# to be present. Column names follow the CSV header; Year is the only numeric column.
OLYMPICS_SCHEMA_DDL = (
    "Year INT, City STRING, Sport STRING, Discipline STRING, Athlete STRING, "
    "Country STRING, Gender STRING, Event STRING, Medal STRING"
)

olympics_data = (
    spark.read.format("csv")
    .option("header", "true")  # the first line holds column names, not data
    .schema(OLYMPICS_SCHEMA_DDL)
    .load("/FileStore/datasets/summer.csv")
)

# Render the loaded DataFrame as a table in the notebook.
olympics_data.display()

Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver
1896,Athens,Aquatics,Swimming,"CHOROPHAS, Efstathios",GRE,Men,1200M Freestyle,Bronze
1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,1200M Freestyle,Gold
1896,Athens,Aquatics,Swimming,"ANDREOU, Joannis",GRE,Men,1200M Freestyle,Silver
1896,Athens,Aquatics,Swimming,"CHOROPHAS, Efstathios",GRE,Men,400M Freestyle,Bronze
1896,Athens,Aquatics,Swimming,"NEUMANN, Paul",AUT,Men,400M Freestyle,Gold


In [0]:
# The Spark property "spark.sql.adaptive.enabled" controls adaptive query execution (AQE).
# When it is on, Spark tries to pick the most efficient query execution plan automatically.
# NOTE(review): it is reported to be "true" by default on recent runtimes - the call below
# shows the value actually in effect. Sometimes it is worth disabling this feature.

spark.conf.get("spark.sql.adaptive.enabled")

In [0]:
# Turn adaptive query execution off so that repartitioning a DataFrame by one of its columns
# is carried out as written. NOTE(review): with AQE on, Spark may merge the small shuffle
# partitions of a small DataFrame at runtime, hiding the effect of repartition() - confirm
# on your runtime. The setting applies to the current Spark session.
spark.conf.set("spark.sql.adaptive.enabled", False)

# Read the property back to confirm the new value.
spark.conf.get("spark.sql.adaptive.enabled")

In [0]:
# Filter as early as possible so that later steps handle less data.
# where() is an alias of filter(); the SQL predicate keeps only rows whose Country is "POL".
olympics_data_filtered = olympics_data.where('Country="POL"')

# Print the physical plan Spark will use for the filtered query.
olympics_data_filtered.explain()

In [0]:
# Render the rows selected by the Country = "POL" filter on the CSV-backed DataFrame.
olympics_data_filtered.display()

Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
1924,Paris,Cycling,Cycling Track,"LANGE, Jozef",POL,Men,Team Pursuit (4000M),Silver
1924,Paris,Cycling,Cycling Track,"LAZARSKI, Jan",POL,Men,Team Pursuit (4000M),Silver
1924,Paris,Cycling,Cycling Track,"STANKIEWICZ, Tomasz",POL,Men,Team Pursuit (4000M),Silver
1924,Paris,Cycling,Cycling Track,"SZYMCZYK, Franciszek",POL,Men,Team Pursuit (4000M),Silver
1924,Paris,Equestrian,Jumping,"KROLIKIEWICZ, Adam",POL,Men,Individual,Bronze
1928,Amsterdam,Athletics,Athletics,"KONOPACKA, Halina",POL,Women,Discus Throw,Gold
1928,Amsterdam,Equestrian,Eventing,"ANTONIEWICZ, Michal",POL,Men,Team,Bronze
1928,Amsterdam,Equestrian,Eventing,"DE ROMMEL (BARON), Karol",POL,Men,Team,Bronze
1928,Amsterdam,Equestrian,Eventing,"TRENKWALD, Jozef",POL,Men,Team,Bronze
1928,Amsterdam,Equestrian,Jumping,"ANTONIEWICZ, Michal",POL,Men,Team,Silver


In [0]:
# If we expect many filter operations by country, it is worth shuffling the rows by the
# "Country" column before writing: all rows of a given country then land in the same
# DataFrame partition. repartition() by column distributes rows by hash, so several
# countries may still share one partition/output file.
(
    olympics_data.repartition(col("Country"))
    .write.mode("overwrite")
    .parquet("dbfs:/FileStore/datasets/olympics_data_repartitioned")
)

In [0]:
# List the contents of the output directory of the repartitioned parquet write.
dbutils.fs.ls("/FileStore/datasets/olympics_data_repartitioned")

In [0]:
# Load the repartitioned parquet output back into a new DataFrame.
repartitioned_data = spark.read.format("parquet").load(
    "dbfs:/FileStore/datasets/olympics_data_repartitioned"
)

# Render the re-read data as a table.
repartitioned_data.display()

Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
1896,Athens,Athletics,Athletics,"LANE, Francis",USA,Men,100M,Bronze
1896,Athens,Athletics,Athletics,"BURKE, Thomas",USA,Men,100M,Gold
1896,Athens,Athletics,Athletics,"CURTIS, Thomas",USA,Men,110M Hurdles,Gold
1896,Athens,Athletics,Athletics,"BLAKE, Arthur",USA,Men,1500M,Silver
1896,Athens,Athletics,Athletics,"BURKE, Thomas",USA,Men,400M,Gold
1896,Athens,Athletics,Athletics,"JAMISON, Herbert",USA,Men,400M,Silver
1896,Athens,Athletics,Athletics,"GARRETT, Robert",USA,Men,Discus Throw,Gold
1896,Athens,Athletics,Athletics,"CLARK, Ellery",USA,Men,High Jump,Gold
1896,Athens,Athletics,Athletics,"CONNOLLY, James",USA,Men,High Jump,Silver
1896,Athens,Athletics,Athletics,"GARRETT, Robert",USA,Men,High Jump,Silver


In [0]:
# Filter the repartitioned DataFrame down to a single country value.
# NOTE(review): this dataset was written with repartition() only (no partitionBy), so it has
# no per-country directories for Spark to prune; any skipping would have to come from parquet
# filter pushdown. Confirm what is actually read from the plan printed below.
repartitioned_data_filtered = repartitioned_data.where('Country = "POL"')

# Print the physical plan for the filtered query.
repartitioned_data_filtered.explain()

In [0]:
# Render the rows for Country = "POL" selected from the repartitioned dataset.
repartitioned_data_filtered.display()

Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
1924,Paris,Cycling,Cycling Track,"LANGE, Jozef",POL,Men,Team Pursuit (4000M),Silver
1924,Paris,Cycling,Cycling Track,"LAZARSKI, Jan",POL,Men,Team Pursuit (4000M),Silver
1924,Paris,Cycling,Cycling Track,"STANKIEWICZ, Tomasz",POL,Men,Team Pursuit (4000M),Silver
1924,Paris,Cycling,Cycling Track,"SZYMCZYK, Franciszek",POL,Men,Team Pursuit (4000M),Silver
1924,Paris,Equestrian,Jumping,"KROLIKIEWICZ, Adam",POL,Men,Individual,Bronze
1928,Amsterdam,Athletics,Athletics,"KONOPACKA, Halina",POL,Women,Discus Throw,Gold
1928,Amsterdam,Equestrian,Eventing,"ANTONIEWICZ, Michal",POL,Men,Team,Bronze
1928,Amsterdam,Equestrian,Eventing,"DE ROMMEL (BARON), Karol",POL,Men,Team,Bronze
1928,Amsterdam,Equestrian,Eventing,"TRENKWALD, Jozef",POL,Men,Team,Bronze
1928,Amsterdam,Equestrian,Jumping,"ANTONIEWICZ, Michal",POL,Men,Team,Silver


In [0]:
# Write the data again, this time also partitioning the output on disk by country:
# partitionBy("Country") lays the files out in one sub-directory per Country value, and the
# preceding repartition by the same column groups each country's rows together before the write.
(
    olympics_data.repartition(col("Country"))
    .write.mode("overwrite")
    .partitionBy("Country")
    .parquet("dbfs:/FileStore/datasets/olympics_data_partitioned")
)

In [0]:
# List the contents of the output directory of the partitioned parquet write.
dbutils.fs.ls("/FileStore/datasets/olympics_data_partitioned")

In [0]:
# Now let's re-read the partitioned data into a new DataFrame.
olympics_data_partitioned = spark.read.format("parquet").load(
    "dbfs:/FileStore/datasets/olympics_data_partitioned"
)

In [0]:
# Select a single country from the dataset that was written with partitionBy("Country").
# where() is an alias of filter(); the predicate string is a SQL expression.
partitioned_data_filtered = olympics_data_partitioned.where('Country = "POL"')

# Print the physical plan; NOTE(review): the Country predicate is expected to appear as a
# partition filter here rather than as a data filter - confirm in the output.
partitioned_data_filtered.explain()

In [0]:
# When observing the job details in the Spark UI (SQL tab), the "size of files read" metric
# for this query was just 12.0 KiB.
partitioned_data_filtered.display()

Year,City,Sport,Discipline,Athlete,Gender,Event,Medal,Country
1924,Paris,Cycling,Cycling Track,"LANGE, Jozef",Men,Team Pursuit (4000M),Silver,POL
1924,Paris,Cycling,Cycling Track,"LAZARSKI, Jan",Men,Team Pursuit (4000M),Silver,POL
1924,Paris,Cycling,Cycling Track,"STANKIEWICZ, Tomasz",Men,Team Pursuit (4000M),Silver,POL
1924,Paris,Cycling,Cycling Track,"SZYMCZYK, Franciszek",Men,Team Pursuit (4000M),Silver,POL
1924,Paris,Equestrian,Jumping,"KROLIKIEWICZ, Adam",Men,Individual,Bronze,POL
1928,Amsterdam,Athletics,Athletics,"KONOPACKA, Halina",Women,Discus Throw,Gold,POL
1928,Amsterdam,Equestrian,Eventing,"ANTONIEWICZ, Michal",Men,Team,Bronze,POL
1928,Amsterdam,Equestrian,Eventing,"DE ROMMEL (BARON), Karol",Men,Team,Bronze,POL
1928,Amsterdam,Equestrian,Eventing,"TRENKWALD, Jozef",Men,Team,Bronze,POL
1928,Amsterdam,Equestrian,Jumping,"ANTONIEWICZ, Michal",Men,Team,Silver,POL
