In [None]:
# Evaluate the `spark` name as the cell's last expression (presumably the
# SparkSession supplied by the environment; it is used below via
# `spark.sparkContext`).
spark

# RDD Transform Methods

In [None]:
# Base RDD used by the following cells: the integers 1 through 19.
rdd = spark.sparkContext.parallelize(range(1, 20))

## map

In [None]:
# Derive two new RDDs from `rdd`: one holding squares, one holding cubes.
rdd_square = rdd.map(lambda value: value * value)
rdd_cube = rdd.map(lambda value: value * value * value)

In [None]:
# collect() returns the RDD's elements as a list; repr() renders it for printing.
print("Square: " + repr(rdd_square.collect()))

In [None]:
# Print the elements of the cubed RDD as a list.
print("Cube: " + repr(rdd_cube.collect()))

Nothing changes in `rdd`: calling a transform method on it generates a new RDD and leaves the original untouched.

In [None]:
# Print the elements of the source RDD to compare them with the mapped results.
print("Original: " + repr(rdd.collect()))

## filter

In [None]:
# Keep only the elements that leave no remainder when divided by 2.
rdd_even = rdd.filter(lambda value: value % 2 == 0)

In [None]:
# Print the elements of the filtered RDD as a list.
print("Even: " + repr(rdd_even.collect()))

In [None]:
import math


def is_prime(n):
    """Return True if ``n`` is a prime number, False otherwise.

    Numbers below 2 (including 0, 1 and negatives) are not prime. For every
    other number, trial division by each integer from 2 up to the integer
    square root of ``n`` is sufficient, because a composite number always has
    a divisor no larger than its square root.
    """
    if n < 2:
        return False
    # Truncate the square root instead of rounding it up: with ceil(), n == 2
    # was tested against itself (2 % 2 == 0) and wrongly rejected.
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True

# Keep only the elements of `rdd` for which is_prime() returns a truthy value.
rdd_prime = rdd.filter(is_prime)

In [None]:
# Print the elements that passed the is_prime filter as a list.
print("Prime: " + repr(rdd_prime.collect()))

## flatMap

`flatMap` applies a function that returns a collection of items for each element, then flattens the resulting list of collections into a single list of items.

In [None]:
def power_cube(n):
    """Return ``n`` together with its square and its cube as a 3-tuple."""
    squared = n**2
    cubed = n**3
    return (n, squared, cubed)

In [None]:
# map() produces one output element per input element, so every element of
# the result is the whole 3-tuple returned by power_cube.
rdd_map = rdd.map(power_cube)
rdd_map.collect()

In [None]:
# flatMap() flattens each tuple returned by power_cube, so the result holds
# the individual numbers instead of tuples.
rdd_flatten_map = rdd.flatMap(power_cube)
rdd_flatten_map.collect()

## mapPartitions

Instead of calling a function once for every value in `rdd`, `mapPartitions` calls the function once per partition, passing it an iterator over all the values in that partition.

In [None]:
def process_partitions(partition_data):
    """Yield the square of every value in one partition.

    Intended for ``mapPartitions``, which passes an iterator over a single
    partition's values; being a generator, this produces results lazily.

    Args:
        partition_data: Iterable of numbers belonging to one partition.

    Yields:
        ``x**2`` for each ``x`` in ``partition_data``, in iteration order.
    """
    for x in partition_data:
        yield x**2


# This function was originally defined as ``cube`` while the call sites use
# ``process_partitions``, which raised a NameError. The old name is kept as an
# alias for backward compatibility, even though the function squares its input.
cube = process_partitions
        
# Apply process_partitions to each partition of `rdd` and collect the results as a list.
rdd.mapPartitions(process_partitions).collect()

In [None]:
# Redistribute the same data across 5 partitions, then apply the same
# per-partition function to the repartitioned RDD.
rdd_five_partitions = rdd.repartition(5)
rdd_five_partitions.mapPartitions(process_partitions).collect()

## reduceByKey

First of all, we must make an RDD of key-value pairs. This is simple to do with `map()`.

In [None]:
# Label for each possible remainder of a division by 2.
keys = {0: "even", 1: "odd"}

# Pair every number with its parity label, producing (label, number) tuples.
rdd_kv = rdd.map(lambda number: (keys[number % 2], number))

In [None]:
# Add up all the values that share a key.
rdd_kv.reduceByKey(lambda left, right: left + right).collect()

## groupByKey

In [None]:
# Gather all the values that share a key into one iterable per key.
grouped_rdd_kv = rdd_kv.groupByKey()
grouped_rdd_kv.collect()

In [None]:
# Count the values grouped under each key; the trailing comment shows the
# same computation spelled with map().
grouped_rdd_kv.mapValues(len).collect()    # equivalent: grouped_rdd_kv.map(lambda x: (x[0], len(x[1]))).collect()

## combineByKey

In some cases we need fine-grained control over the aggregation; `combineByKey` provides that capability. It is not essential for learning the basics, but it is demonstrated here for completeness.

In [None]:
# Small key-value RDD in which the key "b" appears twice, so there is something to combine.
rdd_kv_2 = spark.sparkContext.parallelize([("a", 1), ("b", 3), ("c", 1), ("b", 1)])

In [None]:
def to_list(n):
    """Wrap a single value in a new one-element list."""
    wrapped = [n]
    return wrapped

def append(p, n):
    """Add ``n`` to the end of list ``p`` in place and return ``p``."""
    p += [n]
    return p

def extend(a, b):
    """Add every item of ``b`` to the end of list ``a`` in place and return ``a``."""
    a += b
    return a

# Collect the values of each key into a list:
#   createCombiner turns the first value seen for a key into a one-element list,
#   mergeValue appends a further value to an existing list,
#   mergeCombiners concatenates two lists built for the same key.
rdd_kv_2.combineByKey(
    createCombiner=to_list,
    mergeValue=append,
    mergeCombiners=extend
).collect()