In [None]:
"""
  The code here has been inspired by

  PySpark Cookbook:
  Over 60 recipes for implementing big data processing and analytics using
  Apache Spark and Python
"""

# Using Spark RDD

In [None]:
!pip install pyspark




In [None]:
import pyspark
# Show which PySpark release the kernel actually imported.
print(pyspark.__version__)


3.5.4


In [None]:
from pyspark.sql import SparkSession

# Create the session, or reuse the one that already exists in this kernel,
# running Spark in local mode.
spark = (
    SparkSession.builder
    .appName("ColabPySpark")
    .master("local[*]")
    .getOrCreate()
)

print("Spark is running:", spark.version)


Spark is running: 3.5.4


# Spark RDD

In [None]:
# Get the SparkContext behind the session; every RDD below is created through it
sc = spark.sparkContext

print(f"Spark UI: {sc.uiWebUrl}")  # Check Spark UI link
print(f"Spark Version: {sc.version}")

Spark UI: http://4ee654948532:4040
Spark Version: 3.5.4


In [None]:
# Turn a small in-memory list of (name, number) tuples into an RDD.
myRDD = sc.parallelize(
    [
        ('Mike', 19),
        ('June', 18),
        ('Rachel', 16),
        ('Rob', 18),
        ('Scott', 17),
    ]
)

In [None]:
# take(5) returns up to the first 5 elements to the driver as a Python list
myRDD.take(5)

[('Mike', 19), ('June', 18), ('Rachel', 16), ('Rob', 18), ('Scott', 17)]

Load and Read data from files:

In [None]:
# Each line of the text file becomes one string element of the RDD.
aiportRDD = sc.textFile('airport-code.txt')
# collect() returns every element to the driver; tolerable here only because the
# file is small (a few hundred lines).
aiportRDD.collect()

['City\tState\tCountry\tIATA',
 'Abbotsford\tBC\tCanada\tYXX',
 'Aberdeen\tSD\tUSA\tABR',
 'Abilene\tTX\tUSA\tABI',
 'Akron\tOH\tUSA\tCAK',
 'Alamosa\tCO\tUSA\tALS',
 'Albany\tGA\tUSA\tABY',
 'Albany\tNY\tUSA\tALB',
 'Albuquerque\tNM\tUSA\tABQ',
 'Alexandria\tLA\tUSA\tAEX',
 'Allentown\tPA\tUSA\tABE',
 'Alliance\tNE\tUSA\tAIA',
 'Alpena\tMI\tUSA\tAPN',
 'Altoona\tPA\tUSA\tAOO',
 'Amarillo\tTX\tUSA\tAMA',
 'Anahim Lake\tBC\tCanada\tYAA',
 'Anchorage\tAK\tUSA\tANC',
 'Appleton\tWI\tUSA\tATW',
 'Arviat\tNWT\tCanada\tYEK',
 'Asheville\tNC\tUSA\tAVL',
 'Aspen\tCO\tUSA\tASE',
 'Athens\tGA\tUSA\tAHN',
 'Atlanta\tGA\tUSA\tATL',
 'Atlantic City\tNJ\tUSA\tACY',
 'Augusta\tGA\tUSA\tAGS',
 'Augusta\tME\tUSA\tAUG',
 'Austin\tTX\tUSA\tAUS',
 'Bagotville\tPQ\tCanada\tYBG',
 'Baie-Comeau\tPQ\tCanada\tYBC',
 'Bakersfield\tCA\tUSA\tBFL',
 'Baltimore\tMD\tUSA\tBWI',
 'Bangor\tME\tUSA\tBGR',
 'Bar Harbor\tME\tUSA\tBHB',
 'Barrow\tAK\tUSA\tBRW',
 'Baton Rouge\tLA\tUSA\tBTR',
 'Beaumont\tTX\tUSA\tBPT',
 'Be

In [None]:
# Peek at the first 5 raw lines (the header line is one of them)
aiportRDD.take(5)

['City\tState\tCountry\tIATA',
 'Abbotsford\tBC\tCanada\tYXX',
 'Aberdeen\tSD\tUSA\tABR',
 'Abilene\tTX\tUSA\tABI',
 'Akron\tOH\tUSA\tCAK']

In [None]:
# Number of lines in the file, header line included
aiportRDD.count()

527

In [None]:
# Re-read the file asking for at least 4 partitions, and split every
# tab-separated line into a list of fields.
aiportRDD = (
    sc.textFile('airport-code.txt', minPartitions=4, use_unicode=True)
    .map(lambda line: line.split("\t"))
)
aiportRDD.collect()

[['City', 'State', 'Country', 'IATA'],
 ['Abbotsford', 'BC', 'Canada', 'YXX'],
 ['Aberdeen', 'SD', 'USA', 'ABR'],
 ['Abilene', 'TX', 'USA', 'ABI'],
 ['Akron', 'OH', 'USA', 'CAK'],
 ['Alamosa', 'CO', 'USA', 'ALS'],
 ['Albany', 'GA', 'USA', 'ABY'],
 ['Albany', 'NY', 'USA', 'ALB'],
 ['Albuquerque', 'NM', 'USA', 'ABQ'],
 ['Alexandria', 'LA', 'USA', 'AEX'],
 ['Allentown', 'PA', 'USA', 'ABE'],
 ['Alliance', 'NE', 'USA', 'AIA'],
 ['Alpena', 'MI', 'USA', 'APN'],
 ['Altoona', 'PA', 'USA', 'AOO'],
 ['Amarillo', 'TX', 'USA', 'AMA'],
 ['Anahim Lake', 'BC', 'Canada', 'YAA'],
 ['Anchorage', 'AK', 'USA', 'ANC'],
 ['Appleton', 'WI', 'USA', 'ATW'],
 ['Arviat', 'NWT', 'Canada', 'YEK'],
 ['Asheville', 'NC', 'USA', 'AVL'],
 ['Aspen', 'CO', 'USA', 'ASE'],
 ['Athens', 'GA', 'USA', 'AHN'],
 ['Atlanta', 'GA', 'USA', 'ATL'],
 ['Atlantic City', 'NJ', 'USA', 'ACY'],
 ['Augusta', 'GA', 'USA', 'AGS'],
 ['Augusta', 'ME', 'USA', 'AUG'],
 ['Austin', 'TX', 'USA', 'AUS'],
 ['Bagotville', 'PQ', 'Canada', 'YBG'],
 ['Baie

In [None]:
# First 5 rows; each row is now a list of fields instead of one string
aiportRDD.take(5)

[['City', 'State', 'Country', 'IATA'],
 ['Abbotsford', 'BC', 'Canada', 'YXX'],
 ['Aberdeen', 'SD', 'USA', 'ABR'],
 ['Abilene', 'TX', 'USA', 'ABI'],
 ['Akron', 'OH', 'USA', 'CAK']]

In [None]:
# Splitting the lines does not change the row count (header row still included)
aiportRDD.count()

527

In [None]:
# Number of partitions of the airports RDD (it was read with minPartitions=4)
aiportRDD.getNumPartitions()

4

In [None]:
# Departure RDD: split every comma-separated line into a list of fields
departureRDD = sc.textFile('departure_delays.csv').map(lambda element: element.split(","))
# Preview with take() rather than collect(): collect() would ship the whole
# data set (well over a million rows) to the driver just to look at a few rows.
departureRDD.take(5)

[['date', 'delay', 'distance', 'origin', 'destination'],
 ['01011245', '6', '602', 'ABE', 'ATL'],
 ['01020600', '-8', '369', 'ABE', 'DTW'],
 ['01021245', '-2', '602', 'ABE', 'ATL'],
 ['01020605', '-4', '602', 'ABE', 'ATL'],
 ['01031245', '-4', '602', 'ABE', 'ATL'],
 ['01030605', '0', '602', 'ABE', 'ATL'],
 ['01041243', '10', '602', 'ABE', 'ATL'],
 ['01040605', '28', '602', 'ABE', 'ATL'],
 ['01051245', '88', '602', 'ABE', 'ATL'],
 ['01050605', '9', '602', 'ABE', 'ATL'],
 ['01061215', '-6', '602', 'ABE', 'ATL'],
 ['01061725', '69', '602', 'ABE', 'ATL'],
 ['01061230', '0', '369', 'ABE', 'DTW'],
 ['01060625', '-3', '602', 'ABE', 'ATL'],
 ['01070600', '0', '369', 'ABE', 'DTW'],
 ['01071725', '0', '602', 'ABE', 'ATL'],
 ['01071230', '0', '369', 'ABE', 'DTW'],
 ['01070625', '0', '602', 'ABE', 'ATL'],
 ['01071219', '0', '569', 'ABE', 'ORD'],
 ['01080600', '0', '369', 'ABE', 'DTW'],
 ['01081230', '33', '369', 'ABE', 'DTW'],
 ['01080625', '1', '602', 'ABE', 'ATL'],
 ['01080607', '5', '569', 'ABE

In [None]:
# Total number of rows in the departures file, header row included
departureRDD.count()

1391579

In [None]:
# Departure RDD again, this time asking for at least 8 partitions so the
# file is read and processed with more parallelism
departureRDD = (
    sc.textFile('departure_delays.csv', minPartitions=8)
    .map(lambda element: element.split(","))
)
# Preview with take() rather than collect(): collect() would ship the whole
# data set (well over a million rows) to the driver just to look at a few rows.
departureRDD.take(5)

[['date', 'delay', 'distance', 'origin', 'destination'],
 ['01011245', '6', '602', 'ABE', 'ATL'],
 ['01020600', '-8', '369', 'ABE', 'DTW'],
 ['01021245', '-2', '602', 'ABE', 'ATL'],
 ['01020605', '-4', '602', 'ABE', 'ATL'],
 ['01031245', '-4', '602', 'ABE', 'ATL'],
 ['01030605', '0', '602', 'ABE', 'ATL'],
 ['01041243', '10', '602', 'ABE', 'ATL'],
 ['01040605', '28', '602', 'ABE', 'ATL'],
 ['01051245', '88', '602', 'ABE', 'ATL'],
 ['01050605', '9', '602', 'ABE', 'ATL'],
 ['01061215', '-6', '602', 'ABE', 'ATL'],
 ['01061725', '69', '602', 'ABE', 'ATL'],
 ['01061230', '0', '369', 'ABE', 'DTW'],
 ['01060625', '-3', '602', 'ABE', 'ATL'],
 ['01070600', '0', '369', 'ABE', 'DTW'],
 ['01071725', '0', '602', 'ABE', 'ATL'],
 ['01071230', '0', '369', 'ABE', 'DTW'],
 ['01070625', '0', '602', 'ABE', 'ATL'],
 ['01071219', '0', '569', 'ABE', 'ORD'],
 ['01080600', '0', '369', 'ABE', 'DTW'],
 ['01081230', '33', '369', 'ABE', 'DTW'],
 ['01080625', '1', '602', 'ABE', 'ATL'],
 ['01080607', '5', '569', 'ABE

In [None]:
# The row count does not depend on how many partitions the file was read into
departureRDD.count()

1391579

In [None]:
# Use map() to keep only the first two columns of every row
(
    departureRDD
    .map(lambda row: (row[0], row[1]))
    .take(5)
)

[('date', 'delay'),
 ('01011245', '6'),
 ('01020600', '-8'),
 ('01021245', '-2'),
 ('01020605', '-4')]

In [None]:
# Use filter() to keep the rows whose second column == "WA"
(
    aiportRDD
    .map(lambda row: (row[0], row[1]))
    .filter(lambda pair: pair[1] == "WA")
    .take(5)
)

[('Bellingham', 'WA'),
 ('Moses Lake', 'WA'),
 ('Pasco', 'WA'),
 ('Pullman', 'WA'),
 ('Seattle', 'WA')]

## RDD Transformations

In [None]:
# Airports RDD for the transformation examples: at least 4 partitions,
# one list of tab-separated fields per row (the header row is still present).
aiportRDD = (
    sc.textFile('airport-code.txt', minPartitions=4, use_unicode=True)
    .map(lambda line: line.split("\t"))
)
aiportRDD.collect()

[['City', 'State', 'Country', 'IATA'],
 ['Abbotsford', 'BC', 'Canada', 'YXX'],
 ['Aberdeen', 'SD', 'USA', 'ABR'],
 ['Abilene', 'TX', 'USA', 'ABI'],
 ['Akron', 'OH', 'USA', 'CAK'],
 ['Alamosa', 'CO', 'USA', 'ALS'],
 ['Albany', 'GA', 'USA', 'ABY'],
 ['Albany', 'NY', 'USA', 'ALB'],
 ['Albuquerque', 'NM', 'USA', 'ABQ'],
 ['Alexandria', 'LA', 'USA', 'AEX'],
 ['Allentown', 'PA', 'USA', 'ABE'],
 ['Alliance', 'NE', 'USA', 'AIA'],
 ['Alpena', 'MI', 'USA', 'APN'],
 ['Altoona', 'PA', 'USA', 'AOO'],
 ['Amarillo', 'TX', 'USA', 'AMA'],
 ['Anahim Lake', 'BC', 'Canada', 'YAA'],
 ['Anchorage', 'AK', 'USA', 'ANC'],
 ['Appleton', 'WI', 'USA', 'ATW'],
 ['Arviat', 'NWT', 'Canada', 'YEK'],
 ['Asheville', 'NC', 'USA', 'AVL'],
 ['Aspen', 'CO', 'USA', 'ASE'],
 ['Athens', 'GA', 'USA', 'AHN'],
 ['Atlanta', 'GA', 'USA', 'ATL'],
 ['Atlantic City', 'NJ', 'USA', 'ACY'],
 ['Augusta', 'GA', 'USA', 'AGS'],
 ['Augusta', 'ME', 'USA', 'AUG'],
 ['Austin', 'TX', 'USA', 'AUS'],
 ['Bagotville', 'PQ', 'Canada', 'YBG'],
 ['Baie

In [None]:
# Departure RDD for the transformation examples: at least 8 partitions,
# one list of comma-separated fields per row (the header row is still present)
departure_flightRDD = (
    sc.textFile('departure_delays.csv', minPartitions=8)
    .map(lambda element: element.split(","))
)
# Preview with take() rather than collect(): collect() would ship the whole
# data set (well over a million rows) to the driver just to look at a few rows.
departure_flightRDD.take(5)

[['date', 'delay', 'distance', 'origin', 'destination'],
 ['01011245', '6', '602', 'ABE', 'ATL'],
 ['01020600', '-8', '369', 'ABE', 'DTW'],
 ['01021245', '-2', '602', 'ABE', 'ATL'],
 ['01020605', '-4', '602', 'ABE', 'ATL'],
 ['01031245', '-4', '602', 'ABE', 'ATL'],
 ['01030605', '0', '602', 'ABE', 'ATL'],
 ['01041243', '10', '602', 'ABE', 'ATL'],
 ['01040605', '28', '602', 'ABE', 'ATL'],
 ['01051245', '88', '602', 'ABE', 'ATL'],
 ['01050605', '9', '602', 'ABE', 'ATL'],
 ['01061215', '-6', '602', 'ABE', 'ATL'],
 ['01061725', '69', '602', 'ABE', 'ATL'],
 ['01061230', '0', '369', 'ABE', 'DTW'],
 ['01060625', '-3', '602', 'ABE', 'ATL'],
 ['01070600', '0', '369', 'ABE', 'DTW'],
 ['01071725', '0', '602', 'ABE', 'ATL'],
 ['01071230', '0', '369', 'ABE', 'DTW'],
 ['01070625', '0', '602', 'ABE', 'ATL'],
 ['01071219', '0', '569', 'ABE', 'ORD'],
 ['01080600', '0', '369', 'ABE', 'DTW'],
 ['01081230', '33', '369', 'ABE', 'DTW'],
 ['01080625', '1', '602', 'ABE', 'ATL'],
 ['01080607', '5', '569', 'ABE

flat map transformation

In [None]:
# Keep the rows whose second column == "WA", project the first two columns,
# then flatten the resulting pairs into a single stream of values
(
    aiportRDD
    .filter(lambda row: row[1] == "WA")
    .map(lambda row: (row[0], row[1]))
    .flatMap(lambda pair: pair)
    .take(10)
)

['Bellingham',
 'WA',
 'Moses Lake',
 'WA',
 'Pasco',
 'WA',
 'Pullman',
 'WA',
 'Seattle',
 'WA']

In [None]:
# Provide the distinct elements for the third column of airports,
# which holds the countries.
# The header row is dropped first; otherwise the column title 'Country'
# is returned as if it were a country.
airport_header = aiportRDD.first()
(
    aiportRDD
    .filter(lambda row: row != airport_header)
    .map(lambda row: row[2])
    .distinct()
    .take(5)
)

['Canada', 'USA', 'Country']

In [None]:
# Sample the fourth column (origin of the flight) of the flights RDD:
# - fraction=0.001 keeps each element with probability 0.001, i.e. roughly 0.1% of the rows
# - withReplacement=False means an element can be picked at most once
# - seed=123 makes the sample repeatable
(
    departure_flightRDD
    .map(lambda row: row[3])
    .sample(withReplacement=False, fraction=0.001, seed=123)
    .take(5)
)

['ABQ', 'AEX', 'AGS', 'ANC', 'ATL']

Join transformation

In [None]:
# Flights keyed by the fourth column (origin code); value is the first column (date)
flt = departure_flightRDD.map(lambda flight: (flight[3], flight[0]))
# Airports keyed by the fourth column (IATA code); value is the second column (state)
air = aiportRDD.map(lambda airport: (airport[3], airport[1]))
# Inner join on the key: the result has the form (key, (flight value, airport value))
flt.join(air).take(5)

[('ALB', ('01010600', 'NY')),
 ('ALB', ('01021808', 'NY')),
 ('ALB', ('01020600', 'NY')),
 ('ALB', ('01031808', 'NY')),
 ('ALB', ('01030600', 'NY'))]

repartition() transformation

In [None]:
# The flights RDD was read with minPartitions=8, so it already has at least
# 8 partitions (not 2). print() is needed here because a notebook cell only
# displays the value of its last expression.
print("Partitions before repartition():", departure_flightRDD.getNumPartitions())

# repartition(8) returns a new RDD with exactly 8 partitions
flights2 = departure_flightRDD.repartition(8)
# Checking the number of partitions for the flights2 RDD
flights2.getNumPartitions()


8

zipWithIndex transformation

In [None]:
# Look at the first 10 rows; note that the first one is the header row
aiportRDD.take(10)

[['City', 'State', 'Country', 'IATA'],
 ['Abbotsford', 'BC', 'Canada', 'YXX'],
 ['Aberdeen', 'SD', 'USA', 'ABR'],
 ['Abilene', 'TX', 'USA', 'ABI'],
 ['Akron', 'OH', 'USA', 'CAK'],
 ['Alamosa', 'CO', 'USA', 'ALS'],
 ['Albany', 'GA', 'USA', 'ABY'],
 ['Albany', 'NY', 'USA', 'ALB'],
 ['Albuquerque', 'NM', 'USA', 'ABQ'],
 ['Alexandria', 'LA', 'USA', 'AEX']]

In [None]:
# Keep the first and fourth columns of each row (city, IATA code), then pair
# every element with its position in the RDD:
# the output has the form ((city, iata), idx)
ac = aiportRDD.map(lambda row: (row[0], row[3]))
ac.zipWithIndex().take(5)

[(('City', 'IATA'), 0),
 (('Abbotsford', 'YXX'), 1),
 (('Aberdeen', 'ABR'), 2),
 (('Abilene', 'ABI'), 3),
 (('Akron', 'CAK'), 4)]

In [None]:
# To remove the header from the data
# Using zipWithIndex to skip header row
# - filter out row 0
# - extract only row info
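# Python 2 only: tuple parameters in a lambda (`lambda (row, idx): ...`) are a
# syntax error in Python 3, so the line below is kept for reference only.
# (ac.zipWithIndex().filter(lambda (row, idx): idx > 0).map(lambda (row, idx): row).take(5))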


In [None]:
# Python 3 version: zipWithIndex() yields (row, idx) pairs, so index into the pair
(
    ac
    .zipWithIndex()
    .filter(lambda pair: pair[1] > 0)  # idx 0 is the header row
    .map(lambda pair: pair[0])         # keep the row, drop the idx
    .take(5)
)


[('Abbotsford', 'YXX'),
 ('Aberdeen', 'ABR'),
 ('Abilene', 'ABI'),
 ('Akron', 'CAK'),
 ('Alamosa', 'ALS')]

## reduceByKey transformation :

In [None]:
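# Determine delays by originating city
# - remove header row via zipWithIndex() and map()
# NOTE: the draft below is not valid Python 3 (`lambda row_idx[1]: row` is a
# syntax error, and the filter tests the row instead of the index). It is kept
# for reference only; the working version is in the next cell.
# (departure_flightRDD.zipWithIndex().filter(lambda row_idx: row_idx[0] > 0).map(lambda row_idx[1]: row)\
#  .map(lambda c: (c[3], int(c[1]))).reduceByKey(lambda x, y: x + y).take(5))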

In [None]:
# Total delay per origin:
# zipWithIndex() yields (row, idx) pairs; idx 0 is the header row and is dropped
# before the delay column is converted with int().
(
    departure_flightRDD
    .zipWithIndex()
    .filter(lambda pair: pair[1] > 0)
    .map(lambda pair: pair[0])
    .map(lambda row: (row[3], int(row[1])))  # (origin, delay)
    .reduceByKey(lambda total, delay: total + delay)
    .take(5)
)


[('ADQ', -254), ('BHM', 44355), ('BQN', 3943), ('BRO', 4967), ('BTM', -138)]

In [None]:
# Reminder of the raw layout: date, delay, distance, origin, destination
departure_flightRDD.take(10)

[['date', 'delay', 'distance', 'origin', 'destination'],
 ['01011245', '6', '602', 'ABE', 'ATL'],
 ['01020600', '-8', '369', 'ABE', 'DTW'],
 ['01021245', '-2', '602', 'ABE', 'ATL'],
 ['01020605', '-4', '602', 'ABE', 'ATL'],
 ['01031245', '-4', '602', 'ABE', 'ATL'],
 ['01030605', '0', '602', 'ABE', 'ATL'],
 ['01041243', '10', '602', 'ABE', 'ATL'],
 ['01040605', '28', '602', 'ABE', 'ATL'],
 ['01051245', '88', '602', 'ABE', 'ATL']]

## sortByKey transformation

In [None]:
# Total delay per origin, ordered by origin code.
(
    departure_flightRDD
    .zipWithIndex()
    .filter(lambda pair: pair[1] > 0)            # idx 0 is the header row
    .map(lambda pair: pair[0])                   # back to plain rows
    .map(lambda row: (row[3], int(row[1])))      # (origin, delay)
    .reduceByKey(lambda total, delay: total + delay)
    .sortByKey()                                 # ascending by origin code
    .take(50)                                    # first 50 keys in that order
)


[('ABE', 5113),
 ('ABI', 5128),
 ('ABQ', 64422),
 ('ABY', 1554),
 ('ACT', 392),
 ('ACV', 8429),
 ('ADQ', -254),
 ('AEX', 10193),
 ('AGS', 5003),
 ('ALB', 22362),
 ('ALO', 2866),
 ('AMA', 21979),
 ('ANC', 4948),
 ('ATL', 1151087),
 ('ATW', 8151),
 ('AUS', 108638),
 ('AVL', 5727),
 ('AVP', 2946),
 ('AZO', 233),
 ('BDL', 54662),
 ('BET', -645),
 ('BFL', 4022),
 ('BGR', 2852),
 ('BHM', 44355),
 ('BIL', 2616),
 ('BIS', 3825),
 ('BMI', 7817),
 ('BNA', 212243),
 ('BOI', 18004),
 ('BOS', 238602),
 ('BPT', 1936),
 ('BQK', 3952),
 ('BQN', 3943),
 ('BRO', 4967),
 ('BRW', 880),
 ('BTM', -138),
 ('BTR', 21989),
 ('BTV', 14755),
 ('BUF', 54309),
 ('BUR', 42241),
 ('BWI', 362845),
 ('BZN', 7226),
 ('CAE', 25686),
 ('CAK', 14749),
 ('CDC', 51),
 ('CDV', -1024),
 ('CEC', 2832),
 ('CHA', 7586),
 ('CHO', 2421),
 ('CHS', 30789)]

## Union transformation

In [None]:
def airports_in_state(state_code):
    """Return the airport rows (header row excluded) whose second column equals state_code."""
    return (
        aiportRDD.zipWithIndex()
        .filter(lambda row_idx: row_idx[1] > 0)  # Filtering out the header row (index 0)
        .map(lambda row_idx: row_idx[0])  # Extracting the actual row
        .filter(lambda row: row[1] == state_code)
    )


# Create `a` RDD of Washington airports
a = airports_in_state("WA")

# Create `b` RDD of British Columbia airports
b = airports_in_state("BC")

# Union WA and BC airports
a.union(b).collect()


[['Bellingham', 'WA', 'USA', 'BLI'],
 ['Moses Lake', 'WA', 'USA', 'MWH'],
 ['Pasco', 'WA', 'USA', 'PSC'],
 ['Pullman', 'WA', 'USA', 'PUW'],
 ['Seattle', 'WA', 'USA', 'SEA'],
 ['Spokane', 'WA', 'USA', 'GEG'],
 ['Walla Walla', 'WA', 'USA', 'ALW'],
 ['Wenatchee', 'WA', 'USA', 'EAT'],
 ['Yakima', 'WA', 'USA', 'YKM'],
 ['Abbotsford', 'BC', 'Canada', 'YXX'],
 ['Anahim Lake', 'BC', 'Canada', 'YAA'],
 ['Campbell River', 'BC', 'Canada', 'YBL'],
 ['Castlegar', 'BC', 'Canada', 'YCG'],
 ['Cranbrook', 'BC', 'Canada', 'YXC'],
 ['Fort Nelson', 'BC', 'Canada', 'YYE'],
 ['Fort Saint John', 'BC', 'Canada', 'YXJ'],
 ['Kamloops', 'BC', 'Canada', 'YKA'],
 ['Kelowna', 'BC', 'Canada', 'YLW'],
 ['Nanaimo', 'BC', 'Canada', 'YCD'],
 ['Penticton', 'BC', 'Canada', 'YYF'],
 ['Port Hardy', 'BC', 'Canada', 'YZT'],
 ['Powell River', 'BC', 'Canada', 'YPW'],
 ['Prince George', 'BC', 'Canada', 'YXS'],
 ['Prince Rupert', 'BC', 'Canada', 'YPR'],
 ['Quesnel', 'BC', 'Canada', 'YQZ'],
 ['"Sandspit, Queen Charlotte Islands"',

## mapPartitionsWithIndex transformation

In [None]:
# mapPartitionsWithIndex(f) is similar to map(), but it runs the function f
# once per partition (not once per element) and passes f the index of that
# partition together with an iterator over its elements.

In [None]:
# Define the partitionElementCount function
def partitionElementCount(idx, iterator):
    """Count the elements of one partition.

    Args:
        idx: Index of the partition.
        iterator: Iterator over the elements of that partition.

    Returns:
        A one-element list holding the tuple (idx, number_of_elements).
    """
    # Consume the iterator once, adding 1 for every element seen.
    return [(idx, sum(1 for _ in iterator))]

# Use mapPartitionsWithIndex to determine the number of elements in each partition
# (collect() gathers the per-partition (idx, count) pairs on the driver)
departure_flightRDD.mapPartitionsWithIndex(partitionElementCount).collect()


[(0, 174294),
 (1, 174020),
 (2, 173849),
 (3, 174006),
 (4, 173864),
 (5, 174308),
 (6, 173620),
 (7, 173618)]

# RDD Actions


## Collect() action

In [None]:
# Return all airports elements
# filtered by WA state
# (collect() brings every matching row back to the driver as a Python list)
aiportRDD.filter(lambda c: c[1] == "WA").collect()

[['Bellingham', 'WA', 'USA', 'BLI'],
 ['Moses Lake', 'WA', 'USA', 'MWH'],
 ['Pasco', 'WA', 'USA', 'PSC'],
 ['Pullman', 'WA', 'USA', 'PUW'],
 ['Seattle', 'WA', 'USA', 'SEA'],
 ['Spokane', 'WA', 'USA', 'GEG'],
 ['Walla Walla', 'WA', 'USA', 'ALW'],
 ['Wenatchee', 'WA', 'USA', 'EAT'],
 ['Yakima', 'WA', 'USA', 'YKM']]

## Reduce action

In [None]:
# The reduce(f) action aggregates the elements of an RDD by f. The
# f function should be commutative and associative so that it can be
# computed correctly in parallel.



In [None]:
# Calculate the total delay of flights between SEA (origin) and SFO (dest):
# keep the matching rows, convert the delay column to int, and sum it up
(
    departure_flightRDD
    .filter(lambda row: row[3] == 'SEA' and row[4] == 'SFO')
    .map(lambda row: int(row[1]))
    .reduce(lambda total, delay: total + delay)
)

22293

## count() action

In [None]:

# Count the data rows only: zipWithIndex() numbers the rows from 0,
# so dropping index 0 removes the header row before counting
(
    departure_flightRDD
    .zipWithIndex()
    .filter(lambda pair: pair[1] > 0)
    .map(lambda pair: pair[0])
    .count()
)


1391578

## saveAsTextFile() action

In [None]:
import shutil

# Saves airports as a text file
# Note, each partition has their own file inside the "airports" output directory.
# saveAsTextFile() raises an error when the output directory already exists, so
# remove the result of a previous run first to keep this cell re-runnable.
# (Assumes the local filesystem, as with the local[*] master configured in this
# notebook - adjust if the path points at HDFS or another store.)
shutil.rmtree("airports", ignore_errors=True)
aiportRDD.saveAsTextFile("airports")

In [None]:
# Determine delays by originating city
# - remove header row via zipWithIndex() and map()
# - sum the integer delays per origin with reduceByKey()
(
    departure_flightRDD
    .zipWithIndex()
    .filter(lambda pair: pair[1] > 0)
    .map(lambda pair: pair[0])
    .map(lambda row: (row[3], int(row[1])))
    .reduceByKey(lambda total, delay: total + delay)
    .take(5)
)


[('ADQ', -254), ('BHM', 44355), ('BQN', 3943), ('BRO', 4967), ('BTM', -138)]