# Improving Performance

Improve data cleaning tasks by increasing performance or reducing resource requirements.

## Preparing the environment

### Importing libraries

In [1]:
import pandas as pd
import random
import time

from typing import List

from pyspark.sql.types import (_parse_datatype_string, StructType, StructField,
                               DoubleType, IntegerType, StringType, FloatType)
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.

### Connect to Spark

In [3]:
spark = (SparkSession.builder
                     .config("spark.sql.repl.eagerEval.enabled", True)  # eval DataFrame in notebooks
                     .getOrCreate())

In [4]:
sc = spark.sparkContext

### Loading data

In [5]:
flights_2014 = spark.read.csv('data-sources/AA_DFW_2014_Departures_Short.csv.gz', header=True, inferSchema=True)

# cast to date
flights_2014 = flights_2014.withColumn("Date (MM/DD/YYYY)", 
                                       F.to_date(flights_2014["Date (MM/DD/YYYY)"], "MM/dd/yyyy"))

flights_2014.createOrReplaceTempView("flights_2014")
flights_2014.printSchema()
flights_2014.limit(2)

root
 |-- Date (MM/DD/YYYY): date (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2014-01-01,5,HNL,519
2014-01-01,7,OGG,505


In [6]:
flights_2015 = spark.read.csv('data-sources/AA_DFW_2015_Departures_Short.csv.gz', header=True, inferSchema=True)

# cast to date
flights_2015 = flights_2015.withColumn("Date (MM/DD/YYYY)", 
                                       F.to_date(flights_2015["Date (MM/DD/YYYY)"], "MM/dd/yyyy"))

flights_2015.createOrReplaceTempView("flights_2015")
flights_2015.printSchema()
flights_2015.limit(2)

root
 |-- Date (MM/DD/YYYY): date (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2015-01-01,5,HNL,526
2015-01-01,7,OGG,517


In [7]:
flights_2017 = spark.read.csv('data-sources/AA_DFW_2017_Departures_Short.csv.gz', header=True, inferSchema=True)

# cast to date
flights_2017 = flights_2017.withColumn("Date (MM/DD/YYYY)", 
                                       F.to_date(flights_2017["Date (MM/DD/YYYY)"], "MM/dd/yyyy"))
flights_2017.createOrReplaceTempView("flights_2017")
flights_2017.printSchema()
flights_2017.limit(2)

root
 |-- Date (MM/DD/YYYY): date (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2017-01-01,5,HNL,537
2017-01-01,7,OGG,498


In [8]:
flights_2018 = spark.read.csv('data-sources/AA_DFW_2018_Departures_Short.csv.gz', header=True, inferSchema=True)

# cast to date
flights_2018 = flights_2018.withColumn("Date (MM/DD/YYYY)", 
                                       F.to_date(flights_2018["Date (MM/DD/YYYY)"], "MM/dd/yyyy"))

# save the file in csv fprmat
(flights_2018.repartition(5)
             .write.format('csv')
             .save('output-files/AA_DFW_2018_Departures_Short.csv', mode='overwrite'))

# Review the data
flights_2018.createOrReplaceTempView("flights_2018")
flights_2018.printSchema()
flights_2018.limit(2)

root
 |-- Date (MM/DD/YYYY): date (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2018-01-01,5,HNL,498
2018-01-01,7,OGG,501


In [9]:
dallas_electors = spark.read.csv('data-sources/DallasCouncilVoters.csv.gz', header=True, inferSchema=True)

# cast to date
dallas_electors = dallas_electors.withColumn("DATE", F.to_date(dallas_electors["DATE"], "MM/dd/yyyy"))

dallas_electors.createOrReplaceTempView("dallas_electors")
dallas_electors.printSchema()
dallas_electors.limit(2)

root
 |-- DATE: date (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- VOTER_NAME: string (nullable = true)



DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston


In [10]:
dallas_votes = spark.read.csv('data-sources/DallasCouncilVotes.csv.gz', header=True, inferSchema=True)

# cast to date
dallas_votes = dallas_votes.withColumn("DATE", F.to_date(dallas_votes["DATE"], "MM/dd/yyyy"))

dallas_votes.createOrReplaceTempView("dallas_votes")
dallas_votes.printSchema()
dallas_votes.limit(2)

root
 |-- DATE: date (nullable = true)
 |-- AGENDA_ITEM_NUMBER: string (nullable = true)
 |-- ITEM_TYPE: string (nullable = true)
 |-- DISTRICT: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- VOTER NAME: string (nullable = true)
 |-- VOTE CAST: string (nullable = true)
 |-- FINAL ACTION TAKEN: string (nullable = true)
 |-- AGENDA ITEM DESCRIPTION: string (nullable = true)
 |-- AGENDA_ID: string (nullable = true)
 |-- VOTE_ID: string (nullable = true)



DATE,AGENDA_ITEM_NUMBER,ITEM_TYPE,DISTRICT,TITLE,VOTER NAME,VOTE CAST,FINAL ACTION TAKEN,AGENDA ITEM DESCRIPTION,AGENDA_ID,VOTE_ID
2017-02-08,1,AGENDA,13,Councilmember,Jennifer S. Gates,,NO ACTION NEEDED,Call to Order,020817__Special__1,020817__Special__...
2017-02-08,1,AGENDA,14,Councilmember,Philip T. Kingston,,NO ACTION NEEDED,Call to Order,020817__Special__1,020817__Special__...


In [11]:
people = spark.read.csv('data-sources/people_data_sample.csv', header=True, inferSchema=True)
people.createOrReplaceTempView("people")
people.printSchema()
people.limit(2)

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



name,age,city
Amy Meyer,3,Kimberlyborough
Amy Jones,10,Davidburgh


In [12]:
flight = spark.read.parquet('data-sources/flight-time.parquet')
flight.createOrReplaceTempView("flight")
flight.printSchema()
flight.limit(2)

root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- ARR_TIME: integer (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)



FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,CANCELLED,DISTANCE
2000-01-01,DL,1451,BOS,"Boston, MA",ATL,"Atlanta, GA",1115,1113,1343,5,1400,1348,0,946
2000-01-01,DL,1479,BOS,"Boston, MA",ATL,"Atlanta, GA",1315,1311,1536,7,1559,1543,0,946


### Tables catalogue

In [13]:
spark.catalog.listTables()

[Table(name='dallas_electors', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='dallas_votes', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flight', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flights_2014', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flights_2015', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flights_2017', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flights_2018', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='people', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

## Caching

### Implementing caching

Call `.cache()` on the DataFrame before Action

In [14]:
voter_df = dallas_electors.select('*')
voter_df.cache().count()

44625

In [15]:
voter_df = voter_df.withColumn('ID', F.monotonically_increasing_id() + 1)
voter_df = voter_df.cache()
voter_df.show(5)

+----------+-------------+-------------------+---+
|      DATE|        TITLE|         VOTER_NAME| ID|
+----------+-------------+-------------------+---+
|2017-02-08|Councilmember|  Jennifer S. Gates|  1|
|2017-02-08|Councilmember| Philip T. Kingston|  2|
|2017-02-08|        Mayor|Michael S. Rawlings|  3|
|2017-02-08|Councilmember|       Adam Medrano|  4|
|2017-02-08|Councilmember|       Casey Thomas|  5|
+----------+-------------+-------------------+---+
only showing top 5 rows



### More cache operations

- Check `.is_cached` to determine cache status
- Call `.unpersist()` when finished with DataFrame

In [16]:
print(voter_df.is_cached)

True


In [17]:
voter_df = voter_df.unpersist()
voter_df.limit(2)

DATE,TITLE,VOTER_NAME,ID
2017-02-08,Councilmember,Jennifer S. Gates,1
2017-02-08,Councilmember,Philip T. Kingston,2


## Ex. 1 - Caching a DataFrame

You've been assigned a task that requires running several analysis operations on a DataFrame. You've learned that caching can improve performance when reusing DataFrames and would like to implement it.

You'll be working with a new dataset consisting of airline departure information. It may have repetitive data and will need to be de-duplicated.

**Instructions:**

1. Cache the unique rows in the `departures_df` DataFrame.
2. Perform a count query on `departures_df`, noting how long the operation takes.
3. Count the rows again, noting the variance in time of a cached DataFrame.

In [18]:
departures_df = flights_2017.select('*')
departures_df.show(2)

+-----------------+-------------+-------------------+-----------------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|
+-----------------+-------------+-------------------+-----------------------------+
|       2017-01-01|            5|                HNL|                          537|
|       2017-01-01|            7|                OGG|                          498|
+-----------------+-------------+-------------------+-----------------------------+
only showing top 2 rows



In [19]:
# Add caching to the unique rows in departures_df
departures_df = departures_df.distinct().cache()

# Count the unique rows in departures_df, noting how long the operation takes
start_time = time.time()
print("Counting %d rows took %f seconds" % (departures_df.count(), time.time() - start_time))

# Count the rows again, noting the variance in time of a cached DataFrame
start_time = time.time()
print("Counting %d rows again took %f seconds" % (departures_df.count(), time.time() - start_time))

Counting 139358 rows took 1.496362 seconds
Counting 139358 rows again took 0.448486 seconds


In [20]:
flights_2017.count()

139358

## Ex.2 - Removing a DataFrame from cache

You've finished the analysis tasks with the `departures_df` DataFrame, but have some other processing to do. You'd like to remove the DataFrame from the cache to prevent any excess memory usage on your cluster.

**Instructions:**

1. Check the caching status on the `departures_df` DataFrame.
2. Remove the `departures_df` DataFrame from the cache.
3. Validate the caching status again.

In [21]:
# Determine if departures_df is in the cache
print("Is departures_df cached?: %s" % departures_df.is_cached)

Is departures_df cached?: True


In [22]:
# Remove departures_df from the cache
print("Removing departures_df from cache")
departures_df.unpersist()

# Check the cache status again
print("Is departures_df cached?: %s" % departures_df.is_cached)

Removing departures_df from cache
Is departures_df cached?: False


## Ex. 3 - File import performance

You've been given a large set of data to import into a Spark DataFrame. You'd like to test the difference in import speed by splitting up the file.

**Instructions:**

1. Import the `AA_DFW_2018_Departures_Short.csv.gz` file and the `AA_DFW_2018_Departures_001.csv.gz` files into separate DataFrames.
2. Run a count on each DataFrame and compare the run times.

In [23]:
# Import the full and split files into DataFrames
full_df = spark.read.csv('data-sources/AA_DFW_2018_Departures_Short.csv.gz')
split_df = spark.read.csv('data-sources/AA_DFW_2018_Departures_001.csv.gz')

In [24]:
# Print the count and run time for each DataFrame
start_time_a = time.time()
print("Total rows in full DataFrame:\t%d" % full_df.count())
print("Time to run: %f" % (time.time() - start_time_a))

Total rows in full DataFrame:	119911
Time to run: 0.073950


In [25]:
start_time_b = time.time()
print("Total rows in split DataFrame:\t%d" % split_df.count())
print("Time to run: %f" % (time.time() - start_time_b))

Total rows in split DataFrame:	23982
Time to run: 0.045212


## Ex. 4 - Reading Spark configurations

You've recently configured a cluster via a cloud provider. Your only access is via the command shell or your python code. You'd like to verify some Spark settings to validate the configuration of the cluster.

**Instructions:**

1. Check the name of the Spark application instance (`'spark.app.name'`).
2. Determine the TCP port the driver runs on (`'spark.driver.port'`).
3. Determine how many partitions are configured for joins.
4. Show the results.

In [26]:
# Name of the Spark application instance
app_name = spark.conf.get('spark.app.name')

# Driver TCP port
driver_tcp_port = spark.conf.get('spark.driver.port')

# Number of join partitions
num_partitions = spark.conf.get('spark.sql.shuffle.partitions')

# Show the results
print("Name: %s" % app_name)
print("Driver TCP port: %s" % driver_tcp_port)
print("Number of partitions: %s" % num_partitions)

Name: pyspark-shell
Driver TCP port: 56183
Number of partitions: 200


## Ex. 5 - Writing Spark configurations

Now that you've reviewed some of the Spark configurations on your cluster, you want to modify some of the settings to tune Spark to your needs. You'll import some data to review that your changes have affected the cluster.

The spark configuration is initially set to the default value of 200 partitions.

**Instructions:**

1. Store the number of partitions in `departures_df` in the variable before.
2. Change the `spark.sql.shuffle.partitions` configuration to 500 partitions.
3. Recreate the `departures_df` DataFrame reading the distinct rows from the departures file.
4. Print the number of partitions from before and after the configuration change.

In [27]:
# Review current partition
print(f"Current number of partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")

# Load the file
file_path = 'data-sources/AA_DFW_2018_Departures_Short.csv.gz'
departures_df = spark.read.csv(file_path).distinct()

# Review the number of partitions in the instance
before = departures_df.rdd.getNumPartitions()
print("Partition count before change: %d" % before)

Current number of partitions: 200
Partition count before change: 2


In [28]:
# Configure Spark to use 500 partitions
spark.conf.set('spark.sql.shuffle.partitions', 500)
print(f"Number of partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")

# Recreate the DataFrame
departures_df = spark.read.csv(file_path).distinct()

# Review the number of partitions in the instance
after = departures_df.rdd.getNumPartitions()
print("Partition count after change: %d" % after)

Number of partitions: 500
Partition count after change: 2


## Ex. 6 - Normal joins

You've been given two DataFrames to combine into a single useful DataFrame. Your first task is to combine the DataFrames normally and view the execution plan.

**Instructions:**

1. Create a new `DataFrame normal_df` by joining `flights_df` with `airports_df`.
2. Determine which type of join is used in the query plan.

In [29]:
# Prepare the data - flights
flights_df = flights_2018.select('*')
flights_df.limit(2)

Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2018-01-01,5,HNL,498
2018-01-01,7,OGG,501


In [30]:
# Prepare the data - airports
airports_df = spark.read.csv('data-sources/airports.csv', header=True)
airports_df.limit(2)

faa,name,lat,lon,alt,tz,dst
04G,Lansdowne Airport,41.1304722,-80.6195833,1044,-5,A
06A,Moton Field Munic...,32.4605722,-85.6800278,264,-5,A


In [31]:
# Join the flights_df and aiports_df DataFrames
normal_df = flights_df.join(airports_df,
                            flights_df["Destination Airport"] == airports_df["faa"])

# Show the query plan
normal_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [Destination Airport#238], [faa#1672], Inner, BuildRight, false
   :- Project [cast(gettimestamp(Date (MM/DD/YYYY)#236, MM/dd/yyyy, TimestampType, Some(America/Regina), false) as date) AS Date (MM/DD/YYYY)#244, Flight Number#237, Destination Airport#238, Actual elapsed time (Minutes)#239]
   :  +- Filter isnotnull(Destination Airport#238)
   :     +- FileScan csv [Date (MM/DD/YYYY)#236,Flight Number#237,Destination Airport#238,Actual elapsed time (Minutes)#239] Batched: false, DataFilters: [isnotnull(Destination Airport#238)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/Jacqueline/Documents/projects/CAMP-PySpark/3-PySparkCle..., PartitionFilters: [], PushedFilters: [IsNotNull(Destination Airport)], ReadSchema: struct<Date (MM/DD/YYYY):string,Flight Number:int,Destination Airport:string,Actual elapsed time ...
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false])

In [32]:
normal_df.limit(2)

Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes),faa,name,lat,lon,alt,tz,dst
2018-01-01,5,HNL,498,HNL,Honolulu Intl,21.318681,-157.922428,13,-10,N
2018-01-01,7,OGG,501,OGG,Kahului,20.89865,-156.430458,54,-10,N


## Ex. 7 - Using broadcasting on Spark joins

Remember that table joins in Spark are split between the cluster workers. If the data is not local, various shuffle operations are required and can have a negative impact on performance. Instead, we're going to use Spark's broadcast operations to give each node a copy of the specified data.

A couple tips:
- Broadcast the smaller DataFrame. The larger the DataFrame, the more time required to transfer to the worker nodes.
- On small DataFrames, it may be better skip broadcasting and let Spark figure out any optimization on its own.
- If you look at the query execution plan, a `broadcastHashJoin` indicates you've successfully configured broadcasting.

**Instructions:**

1. Import the `broadcast()` method from `pyspark.sql.functions`.
2. Create a new DataFrame `broadcast_df` by joining `flights_df` with `airports_df`, using the broadcasting.
3. Show the query plan and consider differences from the original.

In [33]:
# Join the flights_df and airports_df DataFrames using broadcasting
broadcast_df = flights_df.join(F.broadcast(airports_df),
                               flights_df["Destination Airport"] == airports_df["faa"])

# Show the query plan and compare against the original
broadcast_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [Destination Airport#238], [faa#1672], Inner, BuildRight, false
   :- Project [cast(gettimestamp(Date (MM/DD/YYYY)#236, MM/dd/yyyy, TimestampType, Some(America/Regina), false) as date) AS Date (MM/DD/YYYY)#244, Flight Number#237, Destination Airport#238, Actual elapsed time (Minutes)#239]
   :  +- Filter isnotnull(Destination Airport#238)
   :     +- FileScan csv [Date (MM/DD/YYYY)#236,Flight Number#237,Destination Airport#238,Actual elapsed time (Minutes)#239] Batched: false, DataFilters: [isnotnull(Destination Airport#238)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/Jacqueline/Documents/projects/CAMP-PySpark/3-PySparkCle..., PartitionFilters: [], PushedFilters: [IsNotNull(Destination Airport)], ReadSchema: struct<Date (MM/DD/YYYY):string,Flight Number:int,Destination Airport:string,Actual elapsed time ...
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false])

## Ex. 8 - Comparing broadcast vs normal joins

You've created two types of joins, normal and broadcasted. Now your manager would like to know what the performance improvement is by using Spark optimizations. If the results are promising, you'll be given more opportunity to tweak the Spark setup as needed.

**Instructions:**

1. Execute `.count()` on the normal DataFrame.
2. Execute `.count()` on the broadcasted DataFrame.
3. Print the count and duration of the DataFrames noting and differences.

In [34]:
# Count the number of rows in the normal DataFrame
start_time = time.time()
normal_count = normal_df.count()
normal_duration = time.time() - start_time

# Count the number of rows in the broadcast DataFrame
start_time = time.time()
broadcast_count = broadcast_df.count()
broadcast_duration = time.time() - start_time

# Print the counts and the duration of the tests
print("Normal count:\t\t%d\tduration: %f" % (normal_count, normal_duration))
print("Broadcast count:\t%d\tduration: %f" % (broadcast_count, broadcast_duration))

Normal count:		119374	duration: 0.235590
Broadcast count:	119374	duration: 0.184753


## Close session

In [35]:
spark.stop()