In [26]:
from pyspark import SparkContext, SparkConf
import time


In [27]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))


In [28]:
populations = sc.textFile("data/statesPopulation.csv")
populations = (
    populations.filter(lambda line: "State" not in line)
    .map(lambda line: line.split(","))
    .map(lambda line: (line[0], int(line[2])))
)


In [29]:
start = time.time()
collected_reduce = populations.reduceByKey(lambda x, y: x + y) # type: ignore
stop = time.time()
print(f'Execution time: {(stop - start) * 1000} ms')

Execution time: 28.453826904296875 ms


In [30]:
collected_reduce.take(5)

[('Arizona', 46518355),
 ('Arkansas', 20703849),
 ('Colorado', 36963486),
 ('Delaware', 6481217),
 ('Florida', 137618322)]

In [31]:
start = time.time()
collected_groupby = populations.groupByKey().mapValues(sum) # type: ignore
stop = time.time()
print(f'Execution time: {(stop - start) * 1000} ms')

Execution time: 25.278806686401367 ms


In [32]:
collected_groupby.take(5)

[('Arizona', 46518355),
 ('Arkansas', 20703849),
 ('Colorado', 36963486),
 ('Delaware', 6481217),
 ('Florida', 137618322)]

In [33]:
start = time.time()
collected_agg = populations.aggregateByKey(0, lambda x, y: x + y, lambda x, y: x + y) # type: ignore
stop = time.time()
print(f'Execution time: {(stop - start) * 1000} ms')

Execution time: 20.406484603881836 ms


In [34]:
collected_agg.take(5)

[('Arizona', 46518355),
 ('Arkansas', 20703849),
 ('Colorado', 36963486),
 ('Delaware', 6481217),
 ('Florida', 137618322)]