# Jonathan Halverson
# Wednesday, September 21, 2016  (Feb 2018 update)
# Accumulators and broadcast variables

In [1]:
from pyspark.sql import SparkSession
# Get the active SparkSession or create one; the application name is "NoApp".
spark = SparkSession.builder.appName("NoApp").getOrCreate()

In [2]:
# Load the file as a DataFrame with one row per line, held in a single
# column named 'value'.
lines = spark.read.text('text_file.md')
lines.show(5)

+--------------------+
|               value|
+--------------------+
|      # Apache Spark|
|                    |
|Spark is a fast a...|
|high-level APIs i...|
|supports general ...|
+--------------------+
only showing top 5 rows



In [3]:
from pyspark import SparkContext

In [4]:
# Shared counters starting at 0; countPython increments pcount for lines
# containing 'python' and tcount for lines containing 'the'.
pcount = spark.sparkContext.accumulator(0)
tcount = spark.sparkContext.accumulator(0)

In [5]:
def countPython(line):
    """Update the shared counters for one row of text.

    Adds 1 to ``pcount`` when the row's ``value`` contains 'python' and 1 to
    ``tcount`` when it contains 'the'. Both checks ignore case and match
    substrings, and a row counts at most once per counter.
    """
    global pcount, tcount
    text = line.value.lower()
    if 'python' in text:
        pcount += 1
    if 'the' in text:
        tcount += 1

In [6]:
# foreach() calls the function on every row for its side effects; this lambda
# has none and its return value is discarded.
lines.foreach(lambda x: x.value)

In [7]:
# Run countPython over every row; this call is what updates the accumulators.
lines.foreach(countPython)
print lines.first()
# The accumulated totals are read through .value.
print pcount.value, tcount.value

Row(value=u'# Apache Spark')
3 21


Note that "grep -ci 'The' text_file.md" gives 21, and the same command with 'Python' gives 3, which matches the accumulator values printed above.

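It is important to carry out an action on the data before evaluating the accumulators. This is because transformations are lazy: accumulator updates made inside a transformation are not executed until an action such as first() is called. In the cell above, foreach() is itself an action, so the counts are already complete when they are printed; the call to first() is not what triggers them.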

In [8]:
# Keywords to search for; the last entry is a complete line of the file.
keywords = ['code', 'hardware', 'causality', 'engine', 'computer', '# Apache Spark']
# Broadcast the list; the returned handle exposes it as ret.value.
ret = spark.sparkContext.broadcast(keywords)

In [9]:
# isin() keeps rows whose value equals one of the keywords exactly; it does
# not do substring matching.
lines.filter(lines.value.isin(keywords)).show()

+--------------+
|         value|
+--------------+
|# Apache Spark|
+--------------+



In [10]:
# contains() is a substring test; | combines the two column conditions with OR.
lines.filter(lines.value.contains('Apache') | lines.value.contains('Python')).show()

+--------------------+
|               value|
+--------------------+
|      # Apache Spark|
|high-level APIs i...|
|Spark is built us...|
|## Interactive Py...|
|Alternatively, if...|
+--------------------+



In [11]:
out = lines.rdd.filter(lambda row: any([keyword in row.value for keyword in keywords]))
print out.count()

2


In [12]:
# Extract the text of each matching row and collect it into a plain list.
print out.map(lambda u: u.value).collect()

[u'# Apache Spark', u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that']


### Per partition basis

In [13]:
# RDD holding the character length of each line's text.
lens = lines.rdd.map(lambda row: len(row.value))
print lens.take(3)

[14, 0, 78]


In [14]:
from pyspark.sql import functions as F
# Same computation with the DataFrame API; withColumn reuses the name 'value',
# so the text column is replaced by its length.
lines.withColumn('value', F.length(lines.value)).show(3)

+-----+
|value|
+-----+
|   14|
|    0|
|   78|
+-----+
only showing top 3 rows



In [15]:
def combineCtrs(c1, c2):
    """Merge two (sum, count) pairs element-wise.

    Each argument is indexable with a running sum at position 0 and a count
    at position 1. The result is always returned as a tuple.
    """
    total = c1[0] + c2[0]
    count = c1[1] + c2[1]
    return (total, count)

In [16]:
def partitionCounters(nums):
    """Reduce one partition's values to a single [sum, count] pair.

    ``nums`` is any iterable of numbers. The pair is wrapped in a
    one-element list, so the result is an iterable holding exactly one
    record.
    """
    total, count = 0, 0
    for value in nums:
        total += value
        count += 1
    return [[total, count]]

In [17]:
def fastAvg(nums):
    """Return the mean of a numeric RDD as a float.

    Each partition is first collapsed to one [sum, count] pair by
    partitionCounters; the pairs are then merged with combineCtrs and the
    total sum is divided by the total count. Empty input is not handled.
    """
    perPartition = nums.mapPartitions(partitionCounters)
    total, count = perPartition.reduce(combineCtrs)
    return total / float(count)

In [18]:
# Average line length, computed from per-partition (sum, count) pairs.
fastAvg(lens)

34.357894736842105

The alternative of using nums.map(lambda num: (num, 1)).reduce(combineCtrs) is slower.

### Numeric RDD operations

In [19]:
# Built-in mean of the numeric RDD; it agrees with fastAvg(lens) up to
# floating-point rounding.
lens.mean()

34.3578947368421

In [20]:
# Pair each length with a count of 1.
pairs = lens.map(lambda x: (x, 1))
# pairs.mean() would fail here: the elements are (length, 1) tuples, so this
# is not a numeric RDD.

In [21]:
# stats() returns a summary object for the RDD; the mean is read from it here.
stats = lens.stats()
mu = stats.mean()
print mu

34.3578947368


In [22]:
# Up to 10 non-empty lines that are shorter than 20 characters.
lines.rdd.filter(lambda row: 0 < len(row.value) < 20).take(10)

[Row(value=u'# Apache Spark'),
 Row(value=u'## Building Spark'),
 Row(value=u'    ./bin/pyspark'),
 Row(value=u'## Example Programs'),
 Row(value=u'## Running Tests'),
 Row(value=u'can be run using:'),
 Row(value=u'    ./dev/run-tests'),
 Row(value=u'## Configuration')]

In [23]:
# NOTE(review): the reducer receives Rows but returns a string, so after the
# first step x is a string and x[0] is only its first character. With the
# left-to-right fold seen here the result is that single character followed by
# the last row's value, not a concatenation of all values. Confirm whether
# this is intended.
lines.rdd.filter(lambda row: len(row.value) > 0 and len(row.value) < 20).reduce(lambda x, y: x[0] + y[0])

u'### Configuration'

### Get the number of partitions

In [24]:
# Number of partitions the DataFrame's underlying RDD is split into.
lines.rdd.getNumPartitions()

1