# Broadcast Variables

In [None]:
# Create Spark Session

from pyspark.sql import SparkSession

# Configure the builder step by step: application name first, then the
# cluster master URL, and finally obtain the session.
session_builder = SparkSession.builder.appName("Broadcast Variable")
session_builder = session_builder.master("spark://spark-master:7077")
spark = session_builder.getOrCreate()

# Leave the session as the cell's last expression so the notebook displays it.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/04 17:15:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Example dataset: one record per student, in the column order given by _cols.

_cols = ["NAME", "DEPT_CODE", "FAV_SUBJECT"]

_data = [
    ["Ramesh", "D001", "Apache Spark"],
    ["Siv", "D001", "C++"],
    ["Imran", "D002", "English"],
    ["Akshay", "D003", "Hindi"],
    ["Somesh", "D002", "Scala"],
    ["Hitesh", "D001", "Physics"],
]

# Only column names are supplied as the schema, so Spark infers the column types.
df_students = spark.createDataFrame(_data, _cols)

# Show the resulting schema first, then the rows themselves.
df_students.printSchema()
df_students.show()

root
 |-- NAME: string (nullable = true)
 |-- DEPT_CODE: string (nullable = true)
 |-- FAV_SUBJECT: string (nullable = true)



                                                                                

+------+---------+------------+
|  NAME|DEPT_CODE| FAV_SUBJECT|
+------+---------+------------+
|Ramesh|     D001|Apache Spark|
|   Siv|     D001|         C++|
| Imran|     D002|     English|
|Akshay|     D003|       Hindi|
|Somesh|     D002|       Scala|
|Hitesh|     D001|     Physics|
+------+---------+------------+



In [4]:
# Broadcast Variable

# Plain Python lookup tables keyed by department code.
# Note that dept_est has no entry for "D002".
dept_names = {
    "D001": "Department 1",
    "D002": "Department 2",
    "D003": "Department 3",
}
dept_est = {
    "D001": 1990,
    "D003": 2001,
}

# Wrap each dictionary in a broadcast variable via the session's SparkContext.
broadcast_dept_names = spark.sparkContext.broadcast(dept_names)
broadcast_dept_est = spark.sparkContext.broadcast(dept_est)

# Check the type of the broadcast wrapper
print(f"The type is: {type(broadcast_dept_est)}")

# The wrapped dictionaries are available through the .value attribute
print(broadcast_dept_names.value)
print(broadcast_dept_est.value)

The type is: <class 'pyspark.broadcast.Broadcast'>
{'D001': 'Department 1', 'D002': 'Department 2', 'D003': 'Department 3'}
{'D001': 1990, 'D003': 2001}


In [None]:
# Let's use the broadcast variables to attach department info to each student
_new_schema = ["NAME", "DEPT_CODE", "FAV_SUBJECT", "DEPT_NAME", "ESTD"]


def _add_dept_info(row):
    """Return the student's fields followed by department name and establishment year.

    Both lookups read the broadcast dictionaries through ``.value`` and use
    ``dict.get``, so a DEPT_CODE missing from a dictionary yields ``None``
    for that field instead of raising ``KeyError``.
    """
    dept_code = row.DEPT_CODE
    return [
        row.NAME,
        dept_code,
        row.FAV_SUBJECT,
        broadcast_dept_names.value.get(dept_code),
        broadcast_dept_est.value.get(dept_code),
    ]


# Map every row through the helper, then turn the resulting RDD back into a
# DataFrame using the extended list of column names.
students_with_dept_rdd = df_students.rdd.map(_add_dept_info)
df = students_with_dept_rdd.toDF(_new_schema)

df.show()

                                                                                

+------+---------+------------+------------+----+
|  NAME|DEPT_CODE| FAV_SUBJECT|   DEPT_NAME|ESTD|
+------+---------+------------+------------+----+
|Ramesh|     D001|Apache Spark|Department 1|1990|
|   Siv|     D001|         C++|Department 1|1990|
| Imran|     D002|     English|Department 2|null|
|Akshay|     D003|       Hindi|Department 3|2001|
|Somesh|     D002|       Scala|Department 2|null|
|Hitesh|     D001|     Physics|Department 1|1990|
+------+---------+------------+------------+----+



In [6]:
# Print the schema of the enriched DataFrame to check the column types
# Spark assigned to the newly added DEPT_NAME and ESTD columns.
df.printSchema()

root
 |-- NAME: string (nullable = true)
 |-- DEPT_CODE: string (nullable = true)
 |-- FAV_SUBJECT: string (nullable = true)
 |-- DEPT_NAME: string (nullable = true)
 |-- ESTD: long (nullable = true)



In [7]:
# Stop the Spark session now that the demonstration is complete.
spark.stop()