# Jonathan Halverson
# Tuesday, May 3, 2016
# Working with key/value pairs

In [275]:
# load the text file as an RDD with one element per line
lines = sc.textFile('text_file.md')
print lines.first()  # first line of the file
print lines.count()  # total number of lines

# Apache Spark
95


In [276]:
# word count in a single line: split every line into words, then count the
# occurrences of each distinct word; the result is a local dict-like
# object, not an RDD
wdct = lines.flatMap(lambda line: line.split()).countByValue()
# slicing items() directly relies on Python 2, where items() is a list
print wdct.items()[:10]

[(u'help', 1), (u'storage', 1), (u'Hadoop', 3), (u'not', 1), (u'including', 3), (u'computation', 1), (u'high-level', 1), (u'find', 1), (u'web', 1), (u'Shell', 2)]


In [277]:
# length (in characters) of every line
num_chars = lines.map(len)
# lower-cased first word of every line that has more than two words
first_word = (lines.filter(lambda text: len(text.split()) > 2)
                   .map(lambda text: text.lower().split()[0]))

In [278]:
# make a pair RDD
pairs_num = num_chars.map(lambda x: (x, x**2))
pairs_wds = first_word.map(lambda word: (word, 1))
print pairs_num.take(5)
print pairs_wds.take(5)

[(14, 196), (0, 0), (78, 6084), (75, 5625), (73, 5329)]
[(u'#', 1), (u'spark', 1), (u'high-level', 1), (u'supports', 1), (u'rich', 1)]


### Common transformations

In [279]:
# single-line word count (the lambda function says what to do with the values)
# the value type must the same as original type
wc = pairs_wds.reduceByKey(lambda x, y: x + y)
print wc.filter(lambda p: p[1] > 1).collect()

[(u'and', 3), (u'##', 7), (u'please', 3), (u'you', 2), (u'spark', 4), (u'can', 2), (u'to', 2)]


In [280]:
# group the values by key; groupByKey yields a
# pyspark.resultiterable.ResultIterable per key, which mapValues(list)
# converts to a plain Python list
print pairs_num.groupByKey().mapValues(list).take(10)

[(0, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), (14, [196]), (16, [256, 256]), (22, [484]), (26, [676, 676]), (32, [1024]), (42, [1764, 1764]), (44, [1936]), (52, [2704]), (54, [2916])]


In [281]:
# mapValues applies a function to each value without altering the key;
# it is a transformation (not an action), and the returned RDD keeps the
# same partitioning as the original
pairs_num.mapValues(lambda x: -x).take(5)

[(14, -196), (0, 0), (78, -6084), (75, -5625), (73, -5329)]

In [282]:
# flatMapValues applies the function to each value and emits one (key, item)
# pair for every item it returns, so (14, 196) becomes (14, 0), (14, 1), ...
pairs_num.flatMapValues(lambda x: range(x)).take(5)

[(14, 0), (14, 1), (14, 2), (14, 3), (14, 4)]

In [283]:
# revisit map and flatmap
print 'map', num_chars.map(lambda x: x / 2).take(4)
print 'map', num_chars.map(lambda x: (x, x)).take(4)
print 'flatmap', num_chars.flatMap(lambda x: (x, x)).take(4)

map [7, 0, 39, 37]
map [(14, 14), (0, 0), (78, 78), (75, 75)]
flatmap [14, 14, 0, 0]


In [284]:
# keys() gives an RDD holding only the keys (here, the words)
wc.keys().take(10)

[u'and',
 u'package.',
 u'##',
 u'please',
 u'alternatively,',
 u'will',
 u'#',
 u'for',
 u'storage',
 u'rich']

In [285]:
# values() gives an RDD holding only the values (here, the word counts)
wc.values().take(10)

[3, 1, 7, 3, 1, 1, 1, 1, 1, 1]

In [286]:
# here we create a new collection of pairs using existing data
repeat = sc.parallelize([(w, c) for w, c, in zip(wc.keys().collect(), wc.values().collect())])
print repeat.count()
print repeat.first()

36
(u'and', 3)


In [287]:
# sort the pairs by key (ascending order when no arguments are given)
wc.sortByKey().take(10)

[(u'"yarn"', 1),
 (u'#', 1),
 (u'##', 7),
 (u'(you', 1),
 (u'["specifying', 1),
 (u'[run', 1),
 (u'alternatively,', 1),
 (u'and', 3),
 (u'build/mvn', 1),
 (u'building', 1)]

In [303]:
# sort by the length of the key instead of the key itself, longest first
wc.sortByKey(ascending=False, keyfunc=len).take(10)

[(u'master=spark://host:7077', 1),
 (u'alternatively,', 1),
 (u'["specifying', 1),
 (u'high-level', 1),
 (u'build/mvn', 1),
 (u'package.', 1),
 (u'supports', 1),
 (u'building', 1),
 (u'examples', 1),
 (u'storage', 1)]

In [289]:
# check for duplicates (distinct works on plain RDDs and on pair RDDs):
# the two counts differ when some (word, 1) pairs occur more than once
print pairs_wds.count()
print pairs_wds.distinct().count()

52
36


### Transformations on two pair RDDs

In [290]:
# subtract removes every element of wc that also appears in repeat; this
# should give an empty list since both RDDs hold the same pairs
print wc.subtract(repeat).collect()

[]


In [291]:
# two small pair RDDs for the two-RDD examples; key 3 occurs in both,
# key 1 only in a
a = sc.parallelize([(1, 2), (3, 4), (3, 6)])
b = sc.parallelize([(3, 9)])

In [292]:
# remove the elements of a whose key is present in the 2nd RDD (b);
# only the pair with key 1 survives because key 3 exists in b
a.subtractByKey(b).collect()

[(1, 2)]

In [293]:
# inner join: only keys present in both RDDs are kept; each result value
# is a (value from a, value from b) tuple
a.join(b).collect()

[(3, (4, 9)), (3, (6, 9))]

In [294]:
# inner join with the operands swapped: same keys, but each result value
# is now a (value from b, value from a) tuple
b.join(a).collect()

[(3, (9, 4)), (3, (9, 6))]

In [295]:
# rightOuterJoin keeps every key of the right-hand RDD (b); b's only key, 3,
# also exists in a, so the result equals the inner join
a.rightOuterJoin(b).collect()

[(3, (4, 9)), (3, (6, 9))]

In [296]:
# rightOuterJoin keeps every key of the right-hand RDD (a); key 1 has no
# match in b, so its left value is None
b.rightOuterJoin(a).collect()

[(1, (None, 2)), (3, (9, 4)), (3, (9, 6))]

In [297]:
# leftOuterJoin keeps every key of the left-hand RDD (a); key 1 has no
# match in b, so its right value is None
a.leftOuterJoin(b).collect()

[(1, (2, None)), (3, (4, 9)), (3, (6, 9))]

In [298]:
# cogroup pairs every key with one iterable of values per input RDD;
# flatten those iterables into a single list of values for each key
a.cogroup(b).mapValues(lambda groups: [v for group in groups for v in group]).collect()

[(1, [2]), (3, [4, 6, 9])]

In [299]:
# combineByKey is the most general per-key aggregation function, which most
# other per-key functions are built on; like aggregate, the return type can
# be different from the original value type
# first inspect the data: a sample of pairs, then the total number of keys
# and the number of distinct keys
print pairs_num.take(10)
print pairs_num.keys().count(), pairs_num.keys().distinct().count()

[(14, 196), (0, 0), (78, 6084), (75, 5625), (73, 5329), (74, 5476), (56, 3136), (42, 1764), (0, 0), (26, 676)]
95 42


In [300]:
# per-key (sum of values, number of values), returned as a local dict
pairs_num.combineByKey(
    # start an accumulator from the first value seen for a key
    createCombiner=lambda v: (v, 1),
    # fold one more value into an existing accumulator
    mergeValue=lambda acc, v: (acc[0] + v, acc[1] + 1),
    # merge two accumulators that belong to the same key
    mergeCombiners=lambda p, q: (p[0] + q[0], p[1] + q[1]),
).collectAsMap()

{0: (0, 35),
 14: (196, 1),
 16: (512, 2),
 17: (867, 3),
 19: (722, 2),
 21: (441, 1),
 22: (484, 1),
 23: (529, 1),
 26: (1352, 2),
 27: (729, 1),
 29: (841, 1),
 31: (961, 1),
 32: (1024, 1),
 33: (1089, 1),
 39: (1521, 1),
 41: (1681, 1),
 42: (3528, 2),
 43: (1849, 1),
 44: (1936, 1),
 45: (2025, 1),
 52: (2704, 1),
 54: (2916, 1),
 56: (6272, 2),
 61: (3721, 1),
 62: (7688, 2),
 64: (4096, 1),
 65: (4225, 1),
 66: (8712, 2),
 67: (4489, 1),
 68: (4624, 1),
 69: (4761, 1),
 70: (9800, 2),
 72: (5184, 1),
 73: (10658, 2),
 74: (16428, 3),
 75: (11250, 2),
 76: (17328, 3),
 77: (11858, 2),
 78: (6084, 1),
 84: (14112, 2),
 97: (9409, 1),
 120: (28800, 2)}

In [301]:
# the number of partitions the RDD is split into
pairs_num.getNumPartitions()

2

In [308]:
# countByKey returns the number of elements per key as a local dict-like
# object; slicing items() directly relies on Python 2, where it is a list
pairs_num.countByKey().items()[:10]

[(0, 35),
 (14, 1),
 (16, 2),
 (17, 3),
 (19, 2),
 (21, 1),
 (22, 1),
 (23, 1),
 (26, 2),
 (27, 1)]

In [312]:
# lookup returns a list of all values stored under the given key
print pairs_num.lookup(14)
print pairs_num.lookup(17)  # key 17 occurs three times, hence three values

[196]
[289, 289, 289]
