## Broadcast Variables
- Read-only variables that are made available to all nodes in a Spark cluster; useful when data spread across all nodes shares a common property that is not properly represented in the data itself (e.g. state abbreviations that should be shown as full state names)


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Each record is (name, SAT score, fraction between 0 and 1, state abbreviation).
# NOTE(review): the meaning of the fraction is not stated in this notebook.
student_data = [
    ("Chris", 1523, 0.72, "CA"),
    ("Jake", 1555, 0.83, "NY"),
    ("Cody", 1439, 0.92, "CA"),
    ("Lisa", 1442, 0.81, "FL"),
    ("Daniel", 1600, 0.88, "TX"),
    ("Kelvin", 1382, 0.99, "FL"),
    ("Nancy", 1442, 0.74, "TX"),
    ("Pavel", 1599, 0.82, "NY"),
    ("Josh", 1482, 0.78, "CA"),
    ("Cynthia", 1582, 0.94, "CA"),
]
student_rdd = spark.sparkContext.parallelize(student_data)

# Turn the fraction into a whole-number percentage. Round before converting to
# int: int() alone truncates, and binary floats can land just below the
# intended value (0.29 * 100 == 28.999999999999996 would become 28, not 29).
rdd_transformation = student_rdd.map(
    lambda x: (x[0], x[1], int(round(x[2] * 100)), x[3])
)

# Lookup table: state abbreviation -> full state name.
states = {"NY": "New York", "CA": "California", "TX": "Texas", "FL": "Florida"}

In [4]:
# Wrap the abbreviation -> name dict in a broadcast variable so that tasks
# read it through `.value` instead of capturing the plain dict.
spark_context = spark.sparkContext
broadcastStates = spark_context.broadcast(states)
type(broadcastStates)

pyspark.broadcast.Broadcast

In [5]:
def expand_state(record):
    """Return the record with its 4th field (state abbreviation) replaced by the full name.

    The name is looked up in the broadcast dictionary; the first three
    fields are passed through unchanged.
    """
    state_names = broadcastStates.value
    return (record[0], record[1], record[2], state_names[record[3]])


rdd_broadcast = rdd_transformation.map(expand_state)
rdd_broadcast.collect()

[('Chris', 1523, 72, 'California'),
 ('Jake', 1555, 83, 'New York'),
 ('Cody', 1439, 92, 'California'),
 ('Lisa', 1442, 81, 'Florida'),
 ('Daniel', 1600, 88, 'Texas'),
 ('Kelvin', 1382, 99, 'Florida'),
 ('Nancy', 1442, 74, 'Texas'),
 ('Pavel', 1599, 82, 'New York'),
 ('Josh', 1482, 78, 'California'),
 ('Cynthia', 1582, 94, 'California')]

## Accumulator Variables
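- Variables that tasks running on the worker nodes can only add to, while only the driver program can read the resulting value; they primarily serve as counters or sums, and are shared amongst nodes to reduce serialization and overhead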

In [12]:
# Counter that starts at zero; it is incremented via `.add()` from inside
# RDD operations and inspected afterwards.
spark_context = spark.sparkContext
sat_1500 = spark_context.accumulator(0)
type(sat_1500)

pyspark.accumulators.Accumulator

In [13]:
def count_high_sat_score(tuple):
    """Add 1 to the ``sat_1500`` accumulator when the record's second field exceeds 1500.

    Args:
        tuple: an indexable record whose element at index 1 is the score to
            check. (The name shadows the built-in ``tuple``; it is kept so
            the function's signature stays the same.)

    Returns:
        None. The only effect is the update of ``sat_1500``.
    """
    is_above_threshold = tuple[1] > 1500  # strictly greater: exactly 1500 is not counted
    if is_above_threshold:
        sat_1500.add(1)

In [14]:
# Reset the counter on the driver first: the accumulator lives in a separate
# cell, so without this every re-run of this cell would add another pass
# over the data to the previous total (5, 10, 15, ...).
sat_1500.value = 0

# The function is passed directly; wrapping it in `lambda x: f(x)` adds nothing.
rdd_broadcast.foreach(count_high_sat_score)

# Read the accumulated total explicitly through `.value`.
sat_1500.value

5
