# RDD Operations

In [1]:
# Based on Spark 2.3.0 Doc

from pyspark.sql import SparkSession
from pyspark.sql import Row

# Obtain the notebook's SparkSession: reuse an existing one if present, otherwise create it.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)


In [2]:
# Build an RDD by distributing an existing in-memory collection from the driver program

nums = range(1, 4)
distNums = spark.sparkContext.parallelize(nums)
print()
# NOTE: the label reads "distDetails", but the value printed is the type of distNums.
print("distDetails type:", type(distNums), sep="")



distDetails type:<class 'pyspark.rdd.PipelinedRDD'>


In [3]:
# Load the CSV file (first line treated as the header) into a DataFrame,
# then expose the same data as an RDD.
books = spark.read.csv(path="books.csv", header=True)
books.show()
bookRDD = books.rdd
print(type(bookRDD))  # the elements of this RDD are Row objects

+-------------------+------------+--------+-----+
|              title|      author|    type|price|
+-------------------+------------+--------+-----+
|       Where's Spot|   Eric Hill|Children|   10|
| The Cat In The Hat|   Dr. Seuss|Children|   15|
|Jamie's 15 Min Meal|Jamie Oliver|Adult NF|   20|
+-------------------+------------+--------+-----+

<class 'pyspark.rdd.RDD'>


In [4]:
# Apply a function to every element with map, then call collect() to
# materialise the results as a local Python list.
bookDetails = (
    bookRDD
    .map(lambda row: row.title + " by " + row.author)
    .collect()
)
print(bookDetails)

print()
print("bookDetails type:", type(bookDetails), sep="")

["Where's Spot by Eric Hill", 'The Cat In The Hat by Dr. Seuss', "Jamie's 15 Min Meal by Jamie Oliver"]

bookDetails type:<class 'list'>


In [5]:
# Call map by passing in a named function instead of a lambda
def myFunction(book):
    """Return the display string "<title> by <author>" for one book record.

    `book` is any object exposing string attributes `title` and `author`.
    """
    parts = (book.title, book.author)
    return " by ".join(parts)
# Hand the function object itself to map (no call parentheses), then collect the results.
bookDetails = (
    bookRDD
    .map(myFunction)
    .collect()
)
print(bookDetails)

["Where's Spot by Eric Hill", 'The Cat In The Hat by Dr. Seuss', "Jamie's 15 Min Meal by Jamie Oliver"]


In [6]:
# IMPORTANT: If the lambda references an attribute of an object, the entire object will be sent to the cluster.
# Hence, try to access only local variables inside the lambda function.

class ClassA:
    """Minimal holder object used to demonstrate closure capture in a lambda."""

    def __init__(self):
        # Suffix string appended to each book title in the examples that follow.
        self.value = "==="

a = ClassA()
# Avoid this form: the lambda refers to `a`, so the whole object travels with the closure.
bookRDD.map(lambda book: book.title + a.value).collect()

# Preferred form: copy the needed attribute into a plain variable and reference only that.
v = a.value
bookRDD.map(lambda book: book.title + v).collect()

["Where's Spot===", 'The Cat In The Hat===', "Jamie's 15 Min Meal==="]