# Transformations

In [3]:
# Based on the Spark 2.3.0 RDD Programming Guide: https://spark.apache.org/docs/2.3.0/rdd-programming-guide.html

from pyspark.sql import SparkSession
from pyspark.sql import Row

# Build (or reuse) the SparkSession that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

# Load the sample book list: the first CSV row supplies the column names
# and the column types are inferred from the data.
data = spark.read.csv("books.csv", header=True, inferSchema=True)
print(data.schema)
data.show()

# RDD view of the DataFrame, used by the RDD examples below.
books = data.rdd

StructType(List(StructField(title,StringType,true),StructField(author,StringType,true),StructField(type,StringType,true),StructField(price,IntegerType,true)))
+-------------------+------------+--------+-----+
|              title|      author|    type|price|
+-------------------+------------+--------+-----+
|       Where's Spot|   Eric Hill|Children|   10|
| The Cat In The Hat|   Dr. Seuss|Children|   15|
|Jamie's 15 Min Meal|Jamie Oliver|Adult NF|   20|
+-------------------+------------+--------+-----+



**Map**
- apply a function to each element and return an RDD

In [2]:
def title_with_author(book):
    """Return the "<title> by <author>" label for one book row."""
    return book.title + " by " + book.author


# map() applies the helper to every row; collect() brings the results
# back to the driver.
results = books.map(title_with_author).collect()
print(type(results))
print(results)

<class 'list'>
["Where's Spot by Eric Hill", 'The Cat In The Hat by Dr. Seuss', "Jamie's 15 Min Meal by Jamie Oliver"]


**FlatMap**
- apply a function that returns an iterable to each element, and flatten all the returned iterables into a single RDD

In [3]:
def title_words(book):
    """Split one book's title on single spaces and return the pieces."""
    return book.title.split(' ')


# flatMap() concatenates the per-book word lists into one flat sequence.
results = books.flatMap(title_words).collect()
print(type(results))
print(results)

<class 'list'>
["Where's", 'Spot', 'The', 'Cat', 'In', 'The', 'Hat', "Jamie's", '15', 'Min', 'Meal']


**Filter**
- return elements that satisfy the specified condition

**Reduce**
- Reduces the elements of this RDD using the specified commutative and associative binary operator.  
- Returns a single value of the same type as the RDD's elements (`reduce` is an action, so the result is not an RDD)

In [4]:
from operator import add

# Distribute the numbers 1..5 and combine them pairwise with `add`;
# the cell displays the value that reduce() returns.
(
    spark.sparkContext
    .parallelize([1, 2, 3, 4, 5])
    .reduce(add)
)

15

In [4]:
def costs_more_than_12(book):
    """Return True when the book's price is strictly greater than 12."""
    return book.price > 12


# filter() keeps only the rows for which the predicate holds.
results = books.filter(costs_more_than_12).collect()
print(type(results))
print(results)

<class 'list'>
[Row(title='The Cat In The Hat', author='Dr. Seuss', type='Children', price=15), Row(title="Jamie's 15 Min Meal", author='Jamie Oliver', type='Adult NF', price=20)]


**Aggregate**
Aggregates the elements of each partition, and then the results for all the partitions, using the given combine functions and a neutral “zero value.”

In [None]:
# Compute (sum, count) over an RDD with aggregate().
# seqOp folds one element `y` into an accumulator `x` = (sum, count).
seqOp = (lambda x, y: (x[0] + y, x[1] + 1))
# combOp merges two (sum, count) accumulators into one.
combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1]))

# `sc` is not defined in the cells shown, so go through the existing
# `spark` session's context instead. The expected results are noted in
# the trailing comments rather than pasted into the cell as REPL output.
print(spark.sparkContext.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp))  # expected: (10, 4)
# An empty RDD yields the zero value unchanged.
print(spark.sparkContext.parallelize([]).aggregate((0, 0), seqOp, combOp))  # expected: (0, 0)

**Persistence**
-  When you persist an RDD, each node stores any partitions of it that it computes in memory and reuses them in other actions on that dataset (or datasets derived from it). This allows future actions to be much faster (often by more than 10x)
- *cache()* is shorthand for persisting with the default storage level, which is StorageLevel.MEMORY_ONLY. Other storage levels include MEMORY_AND_DISK and DISK_ONLY.

In [5]:
def costs_more_than_12(book):
    """Return True when the book's price is strictly greater than 12."""
    return book.price > 12


# Without cache(): a fresh filtered RDD is built and collected on each pass.
for attempt in range(1, 5):
    uncached = books.filter(costs_more_than_12)
    print(uncached.collect())

# Versus #

# With cache(): the filtered RDD is marked for in-memory reuse once,
# then collected repeatedly.
cached = books.filter(costs_more_than_12).cache()
for attempt in range(1, 5):
    print(cached.collect())


[Row(title='The Cat In The Hat', author='Dr. Seuss', type='Children', price=15), Row(title="Jamie's 15 Min Meal", author='Jamie Oliver', type='Adult NF', price=20)]
[Row(title='The Cat In The Hat', author='Dr. Seuss', type='Children', price=15), Row(title="Jamie's 15 Min Meal", author='Jamie Oliver', type='Adult NF', price=20)]
[Row(title='The Cat In The Hat', author='Dr. Seuss', type='Children', price=15), Row(title="Jamie's 15 Min Meal", author='Jamie Oliver', type='Adult NF', price=20)]
[Row(title='The Cat In The Hat', author='Dr. Seuss', type='Children', price=15), Row(title="Jamie's 15 Min Meal", author='Jamie Oliver', type='Adult NF', price=20)]
[Row(title='The Cat In The Hat', author='Dr. Seuss', type='Children', price=15), Row(title="Jamie's 15 Min Meal", author='Jamie Oliver', type='Adult NF', price=20)]
[Row(title='The Cat In The Hat', author='Dr. Seuss', type='Children', price=15), Row(title="Jamie's 15 Min Meal", author='Jamie Oliver', type='Adult NF', price=20)]
[Row(title