# Jonathan Halverson
# Wednesday, September 21, 2016
# Accumulators and broadcast variables

In [1]:
# Load the text file as an RDD with one element per line, then show the
# first five elements.
lines = sc.textFile('text_file.md')
lines.take(5)

[u'# Apache Spark',
 u'',
 u'Spark is a fast and general cluster computing system for Big Data. It provides',
 u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
 u'supports general computation graphs for data analysis. It also supports a']

In [2]:
# Accumulators, both starting at 0, that countPython increments:
# pcount for lines containing 'python', tcount for lines containing 'the'.
pcount = sc.accumulator(0)
tcount = sc.accumulator(0)

In [3]:
def countPython(line):
    """Update the global accumulators for one line of text.

    Adds 1 to pcount when the lower-cased line contains 'python' and adds 1
    to tcount when it contains 'the'. Matching is by substring, so each line
    contributes at most 1 to each accumulator. Returns nothing.
    """
    global pcount, tcount
    lowered = line.lower()
    if 'python' in lowered:
        pcount += 1
    if 'the' in lowered:
        tcount += 1

In [4]:
# foreach is an action: it runs countPython on every line, which updates the
# two accumulators. Their totals are then read on the driver through .value.
lines.foreach(countPython)
print lines.first()
print pcount.value, tcount.value

# Apache Spark
3 21


Note that "grep -ci 'The' text_file.md" gives 21, and the same command with 'Python' in place of 'The' gives 3.

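It is important to carry out an action on the RDD before evaluating the accumulators. This is because transformations are lazy: nothing is executed until an action is called. Note that foreach() is itself an action, so the accumulators are already up to date once it returns; the call to first() above is not needed for the counts to be correct.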

In [5]:
# Keywords to search for, wrapped in a broadcast variable (ret); the wrapped
# list is available through ret.value.
keywords = ['code', 'hardware', 'causality', 'engine', 'computer']
ret = sc.broadcast(keywords)

In [6]:
out = lines.filter(lambda x: any([keyword in x for keyword in keywords]))
print out.count()

1


In [7]:
# Bring the matching lines back to the driver and print them.
print out.collect()

[u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that']


### Per partition basis

In [22]:
lens = lines.map(lambda x: len(x))
print lens.take(3)

[14, 0, 78]


In [29]:
def combineCtrs(c1, c2):
    """Merge two (sum, count) counters into one.

    c1 and c2 are indexable pairs; the result is a tuple holding the
    element-wise sums of their first and second entries.
    """
    total = c1[0] + c2[0]
    count = c1[1] + c2[1]
    return (total, count)

In [30]:
def partitionCounters(nums):
    """Reduce one partition's numbers to a single [sum, count] pair.

    nums is any iterable of numbers. The pair is returned wrapped in a
    one-element list, i.e. [[sum, count]]; an empty iterable gives [[0, 0]].
    """
    total = 0
    count = 0
    # enumerate from 1 so that `count` ends up as the number of items seen
    for count, value in enumerate(nums, 1):
        total += value
    return [[total, count]]

In [33]:
def fastAvg(nums):
    """Return the mean of the values in nums as a float.

    Each partition is first collapsed to one [sum, count] pair by
    partitionCounters; the pairs are then merged with combineCtrs and the
    overall sum is divided by the overall count.
    """
    perPartition = nums.mapPartitions(partitionCounters)
    totals = perPartition.reduce(combineCtrs)
    # float() forces true division under Python 2
    return totals[0] / float(totals[1])

In [34]:
# Mean line length computed with the per-partition (sum, count) approach.
fastAvg(lens)

34.357894736842105

The alternative of using nums.map(lambda num: (num, 1)).reduce(combineCtrs) is slower, since it builds a tuple for every element rather than one counter per partition.

### Numeric RDD operations

In [35]:
# Built-in mean of the line lengths.
lens.mean()

34.3578947368421

In [37]:
# Pair every line length with a 1.
pairs = lens.map(lambda x: (x, 1))
# Calling pairs.mean() fails here because the elements are tuples, so this is
# not a numeric RDD.

In [43]:
# Compute the summary statistics once, then read the mean from the result.
stats = lens.stats()
mu = stats.mean()
print mu

34.3578947368


In [49]:
# Drop the empty lines, then reduce the rest with a function that joins the
# first character of x to the second character of y.
# NOTE(review): this reducer is neither associative nor commutative, so the
# result presumably depends on how the lines are partitioned -- confirm that
# this is only meant as a demonstration.
lines.filter(lambda x: len(x) > 0).reduce(lambda x,y: x[0] + y[1])

u'#n'

### Get the number of partitions

In [63]:
# Number of partitions the lines RDD is split into.
lines.getNumPartitions()

2