In [23]:
from pyspark.sql import SparkSession
spark = (SparkSession.builder.appName("TestingRDDs").getOrCreate())

In [41]:
words_list = "Spark makes life a lot easier and puts me into good Spirits, Spark is too Awesome!".split(" ")

In [42]:
type(words_list)

list

In [43]:
print(words_list)

['Spark', 'makes', 'life', 'a', 'lot', 'easier', 'and', 'puts', 'me', 'into', 'good', 'Spirits,', 'Spark', 'is', 'too', 'Awesome!']


In [44]:
# load into RDD
words_rdd = spark.sparkContext.parallelize(words_list)

In [45]:
# to retrieve values in the rdd, we need to collect into a list
words_data = words_rdd.collect()

In [46]:
for word in words_data:
    print(word)

Spark
makes
life
a
lot
easier
and
puts
me
into
good
Spirits,
Spark
is
too
Awesome!


In [47]:
words_rdd.count()

16

In [48]:
# Distinct Transformation
# Does not alter the RDD!
print(words_rdd.distinct().count(), "\n")
words_data = words_rdd.collect()

print("Current RDD: ")
for word in words_data:
    print(word)

15 

Current RDD: 
Spark
makes
life
a
lot
easier
and
puts
me
into
good
Spirits,
Spark
is
too
Awesome!


In [49]:
# Need to assign to new RDD
words_unique_rdd = words_rdd.distinct()

print("New RDD: ")
for word in words_unique_rdd.collect():
    print(word)

New RDD: 
life
and
Awesome!
Spirits,
good
makes
a
Spark
into
easier
too
lot
puts
is
me


In [50]:
def wordStartsWith(word, letter):
    return word.startswith(letter)

In [51]:
words_rdd.filter(lambda word: wordStartsWith(word, "S")).collect()

['Spark', 'Spirits,', 'Spark']

In [52]:
# Sorting Transformations
countries_list = [("India",91),("USA",4),("Greece",13)]
countries_rdd = spark.sparkContext.parallelize(countries_list)

In [53]:
sorted_countries_list = countries_rdd.sortByKey().collect()

In [54]:
for country in sorted_countries_list:
    print(country)

('Greece', 13)
('India', 91)
('USA', 4)


In [58]:
# Map Transformation
# the c represents each tuple, we want to switch the key and value
sorted_countries_list = countries_rdd.map(lambda c: (c[1],c[0])
                                         ).sortByKey(False).collect()

for country in sorted_countries_list:
    print(country)

(91, 'India')
(13, 'Greece')
(4, 'USA')


In [59]:
# Reduce Action, summation example
num_list = [1,5,2,3,4]
result = spark.sparkContext.parallelize(num_list).reduce(lambda x, y: x + y)
print(result)

15


In [60]:
def sumList(x, y): #print x and y as they enter the function
    print(x,y)
    return x + y

In [61]:
result = spark.sparkContext.parallelize(num_list).reduce(lambda x, y: sumList(x,y))
print(result)

1 5
6 2
8 3
11 4
15


In [64]:
# Using reducer to get the largest length word in the list
def wordLengthReducer(leftWord, rightWord):
    print(leftWord, " compared to ", rightWord)
    if len(leftWord) > len(rightWord):
        return leftWord
    else:
        return rightWord

In [65]:
words_rdd.reduce(wordLengthReducer)

Spark  compared to  makes
makes  compared to  life
makes  compared to  lot
makes  compared to  easier
easier  compared to  puts
easier  compared to  me
easier  compared to  into
easier  compared to  Spirits,
Spirits,  compared to  Spark
Spirits,  compared to  is
Spirits,  compared to  Awesome!


'Awesome!'

In [66]:
words_rdd.first()

'Spark'

In [67]:
spark.sparkContext.parallelize(range(1,21)).max()

20

In [69]:
spark.sparkContext.parallelize(range(1,21)).min()

1