In [None]:
# Spark Session
from pyspark.sql import SparkSession

# Configure the session one setting at a time: application name and cluster
# master first, then the core/memory settings, and finally create the session
# (or reuse one that already exists in this kernel).
_builder = SparkSession.builder.appName("Distributed Shared Variables")
_builder = _builder.master("spark://spark-master:7077")
_builder = _builder.config("spark.cores.max", 16)
_builder = _builder.config("spark.executor.cores", 4)
_builder = _builder.config("spark.executor.memory", "512M")
spark = _builder.getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/04 17:19:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Load the employee records CSV from HDFS into the `emp` DataFrame.

# Explicit DDL schema: every column is read as a string except `salary`
# (double) and `department_id` (int).
_schema = (
    "first_name string, last_name string, job_title string, "
    "dob string, email string, phone string, "
    "salary double, department_id int"
)

# The file has a header row; the schema is supplied explicitly rather than inferred.
_csv_reader = spark.read.format("csv").schema(_schema).option("header", True)
emp = _csv_reader.load("hdfs://namenode:9000/input/data/employee_records.csv")

In [8]:
# Preview the data; the arguments spell out show()'s defaults (first 20 rows,
# long cell values truncated for display).
emp.show(n=20, truncate=True)

[Stage 0:>                                                          (0 + 1) / 1]

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            8|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            7|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|
|  Michelle|   Elliott|      Air cabin crew|1975-03-31|tiffanyjohnston@e...|       (705)900-5337|604738.0|            8|
|    Ashley|   Montoya|        C

                                                                                

In [None]:
# Calculate total salary of Department 6

# Import the functions module under an alias instead of
# `from pyspark.sql.functions import sum`, which would rebind the Python
# builtin `sum` for every later cell in the notebook.
from pyspark.sql import functions as F

# Keep only department 6, group on the department key and total the salary
# column; the cast to long displays the total as a whole number.
emp.where("department_id = 6").groupBy("department_id").agg(
    F.sum("salary").cast("long")
).show()



+-------------+---------------------------+
|department_id|CAST(sum(salary) AS BIGINT)|
+-------------+---------------------------+
|            6|                50294510721|
+-------------+---------------------------+



                                                                                

In [4]:
# Accumulators
#
# `dept_sal` is a shared counter starting at 0: the per-row function adds
# salaries to it, and the total is read back later through `dept_sal.value`.
_spark_context = spark.sparkContext
dept_sal = _spark_context.accumulator(0)

In [None]:
# Use foreach


def calculate_salary(department_id, salary, target_department_id=6):
    """Add one employee's salary to the ``dept_sal`` accumulator.

    The salary is added only when the employee belongs to the target
    department and the salary is not null.

    Args:
        department_id: Department of the current row (``None`` for a null).
        salary: Salary of the current row (``None`` for a null).
        target_department_id: Department whose salaries are accumulated.
            Defaults to 6, the department this notebook totals.

    Returns:
        None. The only effect is the update of the ``dept_sal`` accumulator.
    """
    # A null salary arrives as None; adding None to the accumulator would
    # raise a TypeError in the task. Skipping it mirrors the SQL sum()
    # aggregate, which ignores nulls.
    if department_id == target_department_id and salary is not None:
        dept_sal.add(salary)


# Apply calculate_salary to every row of `emp`. Each matching row adds to
# `dept_sal`, so re-running this cell without re-creating the accumulator
# adds the same salaries a second time.
emp.foreach(
    lambda record: calculate_salary(record["department_id"], record["salary"])
)

                                                                                

In [6]:
# View total value
# Read the accumulated department 6 salary total in the notebook after the
# foreach cell has run; it should agree with the DataFrame aggregation above.

dept_sal.value

50294510721.0

In [7]:
# Stop Spark Session
# Ends the session created at the top of the notebook; cells that use `spark`
# or `emp` need the session to be created again before they can be re-run.

spark.stop()