# Jonathan Halverson
# Tuesday, May 3, 2016
# Working with key/value pairs

In [119]:
# load the text file as an RDD of lines
lines = sc.textFile('text_file.md')
# show the first line, then the total number of lines
print lines.first()
print lines.count()

# Apache Spark
95


In [120]:
# number of characters in every line (blank lines give 0)
num_chars = lines.map(len)
# lower-cased first token of each line that has more than two whitespace-separated tokens
first_word = (lines
              .filter(lambda text: len(text.split()) > 2)
              .map(lambda text: text.lower().split()[0]))

In [121]:
# make a pair RDD
pairs_num = num_chars.map(lambda x: (x, x**2))
pairs_wds = first_word.map(lambda word: (word, 1))
print pairs_num.take(5)
print pairs_wds.take(5)

[(14, 196), (0, 0), (78, 6084), (75, 5625), (73, 5329)]
[(u'#', 1), (u'spark', 1), (u'high-level', 1), (u'supports', 1), (u'rich', 1)]


### Common transformations

In [122]:
# single-line word count (the lambda function says what to do with the values)
# the value type must the same as original type
wc = pairs_wds.reduceByKey(lambda x, y: x + y)
print wc.filter(lambda p: p[1] > 1).collect()

[(u'and', 3), (u'##', 7), (u'please', 3), (u'you', 2), (u'spark', 4), (u'can', 2), (u'to', 2)]


In [123]:
# group the values by key, then use mapValues(list) to convert each
# pyspark.resultiterable.ResultIterable into a plain Python list for printing
print pairs_num.groupByKey().mapValues(list).take(10)

[(0, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), (14, [196]), (16, [256, 256]), (22, [484]), (26, [676, 676]), (32, [1024]), (42, [1764, 1764]), (44, [1936]), (52, [2704]), (54, [2916])]


In [169]:
# mapValues applies a function to each value without altering the key;
# the partitioning of the returned RDD (this is a transformation, not an action)
# will be the same as that of the original RDD
pairs_num.mapValues(lambda x: -x).take(5)

[(14, -196), (0, 0), (78, -6084), (75, -5625), (73, -5329)]

In [168]:
# flatMapValues expands each value into a sequence and pairs every item with the original key
pairs_num.flatMapValues(range).take(5)

[(14, 0), (14, 1), (14, 2), (14, 3), (14, 4)]

In [126]:
# revisit map and flatmap
print 'map', num_chars.map(lambda x: x / 2).take(4)
print 'map', num_chars.map(lambda x: (x, x)).take(4)
print 'flatmap', num_chars.flatMap(lambda x: (x, x)).take(4)

map [7, 0, 39, 37]
map [(14, 14), (0, 0), (78, 78), (75, 75)]
flatmap [14, 14, 0, 0]


In [127]:
# keys: an RDD holding only the key of each pair
wc.keys().take(10)

[u'and',
 u'package.',
 u'##',
 u'please',
 u'alternatively,',
 u'will',
 u'#',
 u'for',
 u'storage',
 u'rich']

In [128]:
# values: an RDD holding only the value of each pair
wc.values().take(10)

[3, 1, 7, 3, 1, 1, 1, 1, 1, 1]

In [129]:
# here we create a new collection of pairs using existing data
repeat = sc.parallelize([(w, c) for w, c, in zip(wc.keys().collect(), wc.values().collect())])
print repeat.count()
print repeat.first()

36
(u'and', 3)


In [130]:
# sort the pairs by key and show the first ten
wc.sortByKey().take(10)

[(u'"yarn"', 1),
 (u'#', 1),
 (u'##', 7),
 (u'(you', 1),
 (u'["specifying', 1),
 (u'[run', 1),
 (u'alternatively,', 1),
 (u'and', 3),
 (u'build/mvn', 1),
 (u'building', 1)]

In [131]:
# check for duplicates (distinct works on RDDs and pair RDDs):
# compare the total number of pairs with the number of distinct pairs
print pairs_wds.count()
print pairs_wds.distinct().count()

52
36


### Transformations on two pair RDDs

In [132]:
# subtract keeps the elements of wc that are not in repeat; this should give
# an empty list since both RDDs contain the same pairs
print wc.subtract(repeat).collect()

[]


In [133]:
# two small pair RDDs: key 3 occurs twice in a and once in b, key 1 only in a
a = sc.parallelize([(1, 2), (3, 4), (3, 6)])
b = sc.parallelize([(3, 9)])

In [147]:
# remove elements with a key present in the 2nd RDD (the values are not compared)
a.subtractByKey(b).collect()

[(1, 2)]

In [137]:
# inner join: only keys present in both RDDs are kept, one result per matching pair of values
a.join(b).collect()

[(3, (4, 9)), (3, (6, 9))]

In [143]:
# inner join with the operands swapped: same keys, but the value order inside each tuple flips
b.join(a).collect()

[(3, (9, 4)), (3, (9, 6))]

In [139]:
# rightOuterJoin: every key of the right RDD (b) is kept; here all of them have a match in a
a.rightOuterJoin(b).collect()

[(3, (4, 9)), (3, (6, 9))]

In [141]:
# rightOuterJoin with the operands swapped: every key of a is kept, and None
# stands in for the missing left value when b has no such key
b.rightOuterJoin(a).collect()

[(1, (None, 2)), (3, (9, 4)), (3, (9, 6))]

In [149]:
# leftOuterJoin: every key of the left RDD (a) is kept, and None stands in
# for the missing right value when b has no such key
a.leftOuterJoin(b).collect()

[(1, (2, None)), (3, (4, 9)), (3, (6, 9))]