# Chapter 13. Advanced RDDs

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this chapter, with the SQL shuffle
# partition count set to 5.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", "5")
    .appName("Advanced RDDs")
    .getOrCreate()
)

24/08/30 14:36:41 WARN Utils: Your hostname, Khanhs-MAC.local resolves to a loopback address: 127.0.0.1; using 192.168.0.103 instead (on interface en0)
24/08/30 14:36:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 14:36:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Split the title on single spaces to get the list of words.
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")

# Distribute the word list as an RDD with two partitions.
words = spark.sparkContext.parallelize(myCollection, numSlices=2)

## Key-Value Basics (Key-Value RDDs)

In [3]:
# Turn every word into a (lowercase word, 1) key-value pair.
# The lambda parameter is named `word` so it does not shadow the `words` RDD,
# and the tuple is built directly: the previous form tried to call the string
# itself, which only went unnoticed because `map` is lazy and no action ran.
words.map(lambda word: (word.lower(), 1))

PythonRDD[1] at RDD at PythonRDD.scala:53

### keyBy

In [4]:
# Key each word by the first character of its lowercase form.
keyword = words.keyBy(lambda text: text.lower()[0])

### Mapping over Values

In [5]:
# Uppercase the value of every pair; the keys are passed through unchanged.
keyword.mapValues(lambda value: value.upper()).collect()

                                                                                

[('s', 'SPARK'),
 ('t', 'THE'),
 ('d', 'DEFINITIVE'),
 ('g', 'GUIDE'),
 (':', ':'),
 ('b', 'BIG'),
 ('d', 'DATA'),
 ('p', 'PROCESSING'),
 ('m', 'MADE'),
 ('s', 'SIMPLE')]

In [6]:
# The uppercase string is iterated character by character, so every value
# expands into one (key, character) pair per letter.
keyword.flatMapValues(lambda value: value.upper()).collect()

[('s', 'S'),
 ('s', 'P'),
 ('s', 'A'),
 ('s', 'R'),
 ('s', 'K'),
 ('t', 'T'),
 ('t', 'H'),
 ('t', 'E'),
 ('d', 'D'),
 ('d', 'E'),
 ('d', 'F'),
 ('d', 'I'),
 ('d', 'N'),
 ('d', 'I'),
 ('d', 'T'),
 ('d', 'I'),
 ('d', 'V'),
 ('d', 'E'),
 ('g', 'G'),
 ('g', 'U'),
 ('g', 'I'),
 ('g', 'D'),
 ('g', 'E'),
 (':', ':'),
 ('b', 'B'),
 ('b', 'I'),
 ('b', 'G'),
 ('d', 'D'),
 ('d', 'A'),
 ('d', 'T'),
 ('d', 'A'),
 ('p', 'P'),
 ('p', 'R'),
 ('p', 'O'),
 ('p', 'C'),
 ('p', 'E'),
 ('p', 'S'),
 ('p', 'S'),
 ('p', 'I'),
 ('p', 'N'),
 ('p', 'G'),
 ('m', 'M'),
 ('m', 'A'),
 ('m', 'D'),
 ('m', 'E'),
 ('s', 'S'),
 ('s', 'I'),
 ('s', 'M'),
 ('s', 'P'),
 ('s', 'L'),
 ('s', 'E')]

### Extracting Keys and Values

In [7]:
# Collect only the key of each (key, value) pair.
keyword.keys().collect()

['s', 't', 'd', 'g', ':', 'b', 'd', 'p', 'm', 's']

In [8]:
# Collect only the value of each (key, value) pair.
keyword.values().collect()

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

### lookup

In [9]:
# Return every value stored under the key "s".
keyword.lookup("s")

['Spark', 'Simple']

### sampleByKey

In [10]:
import random

# Every distinct lowercase character across all words, collected to the driver.
distinctChars = (
    words.flatMap(lambda word: list(word.lower()))
    .distinct()
    .collect()
)
print(distinctChars)

# One random sampling fraction per character (unseeded, so it changes per run).
sampleMap = {char: random.random() for char in distinctChars}
print(sampleMap)

# Key each word by its first lowercase character and sample using those fractions.
(
    words.map(lambda word: (word.lower()[0], word))
    .sampleByKey(withReplacement=True, fractions=sampleMap, seed=6)
    .collect()
)

[Stage 5:>                                                          (0 + 2) / 2]

['p', 't', 'd', 'g', 'b', 'o', 'c', 'l', 's', 'a', 'r', 'k', 'h', 'e', 'f', 'i', 'n', 'v', 'u', ':', 'm']
{'p': 0.8053699136020446, 't': 0.2515320222285219, 'd': 0.42082925556196704, 'g': 0.6314819744921397, 'b': 0.3295621535213801, 'o': 0.7976391422061683, 'c': 0.5217491770914663, 'l': 0.6607649221389149, 's': 0.07936372031147565, 'a': 0.9036858749632595, 'r': 0.10187455947853974, 'k': 0.16997303984431789, 'h': 0.17703403463098788, 'e': 0.8229375057224789, 'f': 0.16040118020146232, 'i': 0.057989469012307504, 'n': 0.8087039416817501, 'v': 0.4128904850796872, 'u': 0.25143925741163864, ':': 0.6237706641260152, 'm': 0.8387484211043137}


                                                                                

[('t', 'The'), ('g', 'Guide'), ('m', 'Made')]

## Aggregations

In [11]:
# Iterating a string yields its characters, so flatMap emits one record per
# lowercase character of every word.
chars = words.flatMap(lambda token: token.lower())
# Pair every character with a count of 1.
KVcharacters = chars.map(lambda char: (char, 1))
def maxFunc(left, right):
    """Return the larger of ``left`` and ``right`` (``left`` when neither is greater)."""
    if right > left:
        return right
    return left
def addFunc(left, right):
    """Return ``left + right``."""
    total = left + right
    return total
# The integers 1 through 30, split across five partitions.
nums = spark.sparkContext.parallelize(range(1, 31), numSlices=5)

### countByKey

In [12]:
# Count how many pairs exist for each key; the counts come back to the driver
# as a dictionary.
KVcharacters.countByKey()

defaultdict(int,
            {'s': 4,
             'p': 3,
             'a': 4,
             'r': 2,
             'k': 1,
             't': 3,
             'h': 1,
             'e': 7,
             'd': 4,
             'f': 1,
             'i': 7,
             'n': 2,
             'v': 1,
             'g': 3,
             'u': 1,
             ':': 1,
             'b': 1,
             'o': 1,
             'c': 1,
             'm': 2,
             'l': 1})

### Understanding Aggregation Implementations

#### groupByKey

In [13]:
from functools import reduce

# Gather all values per key, then fold each group together with addFunc.
(
    KVcharacters.groupByKey()
    .map(lambda pair: (pair[0], reduce(addFunc, pair[1])))
    .collect()
)

[('p', 3),
 ('t', 3),
 ('d', 4),
 ('g', 3),
 ('b', 1),
 ('o', 1),
 ('c', 1),
 ('l', 1),
 ('s', 4),
 ('a', 4),
 ('r', 2),
 ('k', 1),
 ('h', 1),
 ('e', 7),
 ('f', 1),
 ('i', 7),
 ('n', 2),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('m', 2)]

#### reduceByKey

In [14]:
# Merge the values of each key with addFunc, i.e. sum the 1s per character.
KVcharacters.reduceByKey(addFunc).collect()

[('p', 3),
 ('t', 3),
 ('d', 4),
 ('g', 3),
 ('b', 1),
 ('o', 1),
 ('c', 1),
 ('l', 1),
 ('s', 4),
 ('a', 4),
 ('r', 2),
 ('k', 1),
 ('h', 1),
 ('e', 7),
 ('f', 1),
 ('i', 7),
 ('n', 2),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('m', 2)]

### Other Aggregation Methods

#### aggregate

In [15]:
# Start from 0, apply maxFunc within each partition, then combine the
# per-partition results with addFunc.
nums.aggregate(0, maxFunc, addFunc)

90

In [16]:
# Same aggregation as above, but combined through a multi-level tree of the
# given depth.
depth = 3
nums.treeAggregate(0, maxFunc, addFunc, depth=depth)

90

#### aggregateByKey

In [17]:
# Per key: addFunc sums the values inside each partition, then maxFunc keeps
# the largest of those per-partition sums.
KVcharacters.aggregateByKey(0, addFunc, maxFunc).collect()

[('p', 2),
 ('t', 2),
 ('d', 2),
 ('g', 2),
 ('b', 1),
 ('o', 1),
 ('c', 1),
 ('l', 1),
 ('s', 3),
 ('a', 3),
 ('r', 1),
 ('k', 1),
 ('h', 1),
 ('e', 4),
 ('f', 1),
 ('i', 4),
 ('n', 1),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('m', 2)]

#### combineByKey

In [18]:
def valToCombiner(value):
    """Start a new combiner: a one-element list holding ``value``."""
    combiner = []
    combiner.append(value)
    return combiner


def mergeValuesFunc(vals, valToAppend):
    """Add ``valToAppend`` to the end of ``vals`` in place and return ``vals``."""
    # In-place extension: the caller's list object is the one that is returned.
    vals += [valToAppend]
    return vals


def mergeCombinerFunc(vals1, vals2):
    """Return the concatenation ``vals1 + vals2`` as a new object."""
    merged = vals1 + vals2
    return merged


outputPartitions = 6
# Build a list of values per key: create the combiner, append values within a
# partition, then concatenate the combiners across partitions.
(
    KVcharacters.combineByKey(
        valToCombiner,
        mergeValuesFunc,
        mergeCombinerFunc,
        numPartitions=outputPartitions,
    )
    .collect()
)

                                                                                

[('p', [1, 1, 1]),
 ('b', [1]),
 ('l', [1]),
 ('a', [1, 1, 1, 1]),
 ('k', [1]),
 ('h', [1]),
 ('i', [1, 1, 1, 1, 1, 1, 1]),
 ('u', [1]),
 ('t', [1, 1, 1]),
 ('d', [1, 1, 1, 1]),
 ('g', [1, 1, 1]),
 ('o', [1]),
 ('s', [1, 1, 1, 1]),
 ('r', [1, 1]),
 ('f', [1]),
 ('v', [1]),
 ('c', [1]),
 ('e', [1, 1, 1, 1, 1, 1, 1]),
 ('n', [1, 1]),
 (':', [1]),
 ('m', [1, 1])]

#### foldByKey

In [19]:
# Fold the values of each key together with addFunc, starting from 0.
KVcharacters.foldByKey(0, addFunc).collect()

[('p', 3),
 ('t', 3),
 ('d', 4),
 ('g', 3),
 ('b', 1),
 ('o', 1),
 ('c', 1),
 ('l', 1),
 ('s', 4),
 ('a', 4),
 ('r', 2),
 ('k', 1),
 ('h', 1),
 ('e', 7),
 ('f', 1),
 ('i', 7),
 ('n', 2),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('m', 2)]

## CoGroups

In [20]:
# NOTE: this rebinds `distinctChars` to an RDD (it is not collected here),
# whereas the sampleByKey cell earlier bound the same name to a Python list.
distinctChars = words.flatMap(lambda token: token.lower()).distinct()

# Two keyed RDDs over the same characters, each with its own random values.
charRDD = distinctChars.map(lambda char: (char, random.random()))
charRDD2 = distinctChars.map(lambda char: (char, random.random()))

# Group both RDDs by key and show the first five grouped entries.
charRDD.cogroup(charRDD2).take(5)

[('p',
  (<pyspark.resultiterable.ResultIterable at 0x10cfd96a0>,
   <pyspark.resultiterable.ResultIterable at 0x10cfdbc50>)),
 ('t',
  (<pyspark.resultiterable.ResultIterable at 0x10cfd9b80>,
   <pyspark.resultiterable.ResultIterable at 0x10cfd92e0>)),
 ('d',
  (<pyspark.resultiterable.ResultIterable at 0x10cfdbb00>,
   <pyspark.resultiterable.ResultIterable at 0x10cfd9d00>)),
 ('g',
  (<pyspark.resultiterable.ResultIterable at 0x10cfd8fe0>,
   <pyspark.resultiterable.ResultIterable at 0x10cfd8ce0>)),
 ('l',
  (<pyspark.resultiterable.ResultIterable at 0x10cfdbec0>,
   <pyspark.resultiterable.ResultIterable at 0x10cfdbf50>))]

## Joins

### Inner Join

In [21]:
outputPartitions = 10
# Attach a random value to every distinct character.
keyedChars = distinctChars.map(lambda char: (char, random.random()))
# Inner join on the character key and count the joined pairs.
KVcharacters.join(keyedChars).count()

51

In [22]:
# Same inner join, this time with an explicit number of output partitions.
KVcharacters.join(keyedChars, numPartitions=outputPartitions).count()

51

### zips

In [23]:
# Ten integers in two partitions, matching the ten words in two partitions of `words`.
numRange = spark.sparkContext.parallelize(range(10), numSlices=2)
# Pair the elements of both RDDs position by position.
words.zip(numRange).collect()

[('Spark', 0),
 ('The', 1),
 ('Definitive', 2),
 ('Guide', 3),
 (':', 4),
 ('Big', 5),
 ('Data', 6),
 ('Processing', 7),
 ('Made', 8),
 ('Simple', 9)]

## Controlling Partitions

### coalesce

In [34]:
# Collapse to a single partition and confirm the resulting count (expected: 1).
words.coalesce(numPartitions=1).getNumPartitions()

1

### repartition

In [35]:
# Redistribute the words across ten partitions; the new RDD is only displayed,
# not assigned.
words.repartition(numPartitions=10)

MapPartitionsRDD[128] at coalesce at NativeMethodAccessorImpl.java:0

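### repartitionAndSortWithinPartitions

`repartitionAndSortWithinPartitions` repartitions a key-value RDD and sorts the records by key within each resulting partition. No example is run in this notebook.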

### Custom Partitioning

In [36]:
# Load the retail CSV files, reading the header row and inferring column types.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("../data/retail-data/all/")
)
# RDD view of the rows after coalescing the DataFrame to at most 10 partitions.
rdd = df.coalesce(10).rdd
df.printSchema()

                                                                                

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [37]:
def partitionFunc(key):
    """Choose a partition index for ``key``.

    Keys 17850 and 12583 always map to partition 0. Every other key maps to
    partition 1 or 2, picked at random on each call.
    """
    import random

    if key == 17850 or key == 12583:
        return 0
    # NOTE(review): the fallback is random, so repeated calls with the same key
    # can return different partition indices.
    return random.randint(1, 2)
# Key every row by column index 6 (CustomerID in the schema printed above).
keyedRDD = rdd.keyBy(lambda row: row[6])

# Partition into 3 with the custom function, keep only the keys, then count
# the distinct keys that ended up in each partition.
(
    keyedRDD.partitionBy(3, partitionFunc)
    .map(lambda pair: pair[0])
    .glom()
    .map(lambda keys: len(set(keys)))
    .take(5)
)

                                                                                

[2, 4302, 4307]

## Custom Serialization

In [38]:
class SomeClass:
    """Minimal mutable holder for a single attribute, ``someValue``."""

    def __init__(self) -> None:
        # Every new instance starts with someValue set to 0.
        self.someValue = 0

    def setSomeValue(self, i) -> "SomeClass":
        """Store ``i`` in ``someValue`` and return this instance so calls can be chained."""
        self.someValue = i
        return self

# Wrap each integer 1..10 in a SomeClass instance on the executors.
rdd = spark.sparkContext.parallelize(range(1, 11)).map(
    lambda num: SomeClass().setSomeValue(num)
)

# Bring the objects back to the driver.
result = rdd.collect()

# Print the value carried by each returned object.
for obj in result:
    print(obj.someValue)

1
2
3
4
5
6
7
8
9
10
