In [6]:
import findspark
findspark.init()


In [9]:
# Databricks notebook source
# accumulator designed to fetch data from executor to notebook
# Databricks notebook source
# accumulator useful to collect data from executor to driver program
rdd = sc.parallelize(range(0, 20))

sumAccum = sc.accumulator(0)

rdd.foreach(lambda n: sumAccum.add(n))  # run inside executor

print("Acc value is ", sumAccum.value) # driver

[Stage 0:>                                                          (0 + 1) / 1]

Acc value is  190


                                                                                

In [8]:
from pyspark import SparkContext
sc = SparkContext("local", "Accumulator")

22/05/07 00:35:41 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/07 00:35:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/07 00:35:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/07 00:35:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/07 00:35:43 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/07 00:35:43 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/05/07 00:35:43 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
2

In [14]:
# Databricks notebook source
# accumulator designed to fetch data from executor to notebook
# Databricks notebook source
# accumulator useful to collect data from executor to driver program
rdd = sc.parallelize(range(0, 20),4)

print("Data", rdd.glom().collect())

sumAccum = sc.accumulator(0)

rdd.foreach(lambda n: sumAccum.add(n))  # run inside executor

print("Acc value is ", sumAccum.value) # driver

Data [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19]]
Acc value is  190


In [15]:
sumFirstValueInPartitionAccum = sc.accumulator(0)

# write a accumulator that picks first element in each partition and sum them up
def sumFirstElement(partitionItr):
  global sumFirstValueInPartitionAccum
  for n in partitionItr: 
    sumFirstValueInPartitionAccum.add(n)
    break # we process only 1st element in the partition and end the loop
    
rdd.foreachPartition(lambda itr: sumFirstElement(itr))

print ("sum of first elements ", sumFirstValueInPartitionAccum.value)

sum of first elements  30


In [16]:
# custom accumulator
# collect first element in each parition [not to sum/count]
# zero, addInPlace are default functions will be invoked by accumulator 
from  pyspark.accumulators import AccumulatorParam
class ListItemParamAccumulator(AccumulatorParam):
  def zero(self, v):
    return [] # return list, empty list used when no seed value given initially
  # variable is list, value is the arg we pass via add function
  # acc.add(value) ==> calls addInPlace()
  def addInPlace(self, variable, value):
    variable.append(value)
    return variable

In [17]:
# create accumulator
# [] is empty list
# ListItemParamAccumulator is custom accumulator defiend above
firstValueAccum = sc.accumulator([], ListItemParamAccumulator())


# write a accumulator that picks first element in each partition and sum them up
def sampleFirstElement(partitionItr):
  global firstValueAccum
  for n in partitionItr: 
    # n is passed as value to addInPlace function
    firstValueAccum.add(n) # this will call ListItemParamAccumulator addInPlace function
    break
    
rdd.foreachPartition(lambda itr: sampleFirstElement(itr))

print (" first elements in each partition ", firstValueAccum.value)

 first elements in each partition  [[0], [5], [10], [15]]


In [18]:
rdd.glom().collect()


[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19]]