# RDDs

In [1]:
import findspark
# init() has to run before `import pyspark`: it makes the local Spark installation importable.
findspark.init()
import pyspark
# Display the Spark home directory that findspark located (shown as the cell output).
findspark.find()

'H:\\Spark\\spark-3.0.0-bin-hadoop2.7'

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
# Spark configuration: application name plus a local master.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
# getOrCreate() reuses an already-running context, so re-running this cell does not
# fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)
# Wrap the context in a SparkSession for the DataFrame/SQL API.
spark = SparkSession(sc)

## Reading from a sample textfile

In [12]:
# Load the text file as an RDD; each element is one line of the file.
words = sc.textFile('sample_file.txt')
# Show the Python type of the object returned by textFile().
type(words)

pyspark.rdd.RDD

In [13]:
# Split every line into its space-separated tokens. The result gets its own name
# instead of overwriting `words`, so this cell can be re-run safely: splitting the
# already-split lists a second time would fail because lists have no .split().
words_split = words.map(lambda line: line.split(' '))
words_split.collect()

[['Hi,', 'How', 'are', 'you?'],
 ['My', 'name', 'is', 'Jeswin', 'George.'],
 ['I', 'am', 'studying', 'spark.']]

## Reading from string data

In [8]:
# Tokenise a literal sentence on spaces, then distribute the tokens over two partitions.
data = 'Hello, How are you?'.split(' ')
mdata = sc.parallelize(data, numSlices=2)

In [7]:
# Action: gather the elements of mdata into a Python list.
mdata.collect()

['Hello,', 'How', 'are', 'you?']

# Transformations
## Filtering operation

In [14]:
words = sc.parallelize([
    'scala', 'java', 'hadoop', 'spark',
    'spark vs hadoop', 'pyspark', 'spark and pyspark',
])
# Transformation (lazy): keep only the entries that contain the substring 'spark'.
words_filter = words.filter(lambda entry: 'spark' in entry)
# Action: run the job and fetch the matching entries as a list.
filtered = words_filter.collect()
print('Filtered RDD is: ', filtered)

Filtered RDD is:  ['spark', 'spark vs hadoop', 'pyspark', 'spark and pyspark']


In [15]:
# Transformation (lazy): keep only the entries that do NOT contain the substring 'spark'.
words_diff_filter = words.filter(lambda entry: not ('spark' in entry))
# Action: run the job and fetch the remaining entries as a list.
filtered_diff = words_diff_filter.collect()
print('Filtered RDD is: ', filtered_diff)

Filtered RDD is:  ['scala', 'java', 'hadoop']


## Other operations

In [17]:
# Transformation (lazy): map() applies the function to every element, here upper-casing it.
# NOTE: the variable names and the printed label still say "filter" although this is a map.
words_diff_filter = words.map(lambda entry: entry.upper())
# Action: run the job and fetch the upper-cased entries as a list.
filtered_diff = words_diff_filter.collect()
print('Filtered RDD is: ', filtered_diff)

Filtered RDD is:  ['SCALA', 'JAVA', 'HADOOP', 'SPARK', 'SPARK VS HADOOP', 'PYSPARK', 'SPARK AND PYSPARK']


## Collected RDD data can be iterated over like any Python list

In [19]:
# Split the sentence on spaces and turn the resulting word list into an RDD.
data = 'Welcome to Edureka Spark Certification training'.split(' ')
rdd = sc.parallelize(data)
# collect() returns the words as a plain Python list.
rdd.collect()

['Welcome', 'to', 'Edureka', 'Spark', 'Certification', 'training']

In [20]:
# find largest word
max_w1 = 0
for word in rdd.collect():
    if len(word)>max_w1:
        max_w1 = len(word)
        largest_word = word

print("Largest word is {} with length {}".format(largest_word, max_w1))

Largest word is Certification with length 13


## Difference between map and flatMap

In [21]:
# Read the sample file; `words` is an RDD whose elements are the raw lines.
words = sc.textFile('sample_file.txt')
words.collect()  # Each line is read as a string

['Hi, How are you?', 'My name is Jeswin George.', 'I am studying spark.']

#### Using map to apply a function to each string element

In [22]:
# map() is one-to-one: each input line yields exactly one output element (its token list).
words_map = words.map(lambda text_line: text_line.split(' '))
words_map.collect()

[['Hi,', 'How', 'are', 'you?'],
 ['My', 'name', 'is', 'Jeswin', 'George.'],
 ['I', 'am', 'studying', 'spark.']]

#### Using flatMap, each element inside the list of lists becomes a separate element of the result

In [23]:
# flatMap() flattens the per-line token lists, so the result is one flat RDD of tokens.
words_flatmap = words.flatMap(lambda text_line: text_line.split(' '))
words_flatmap.collect()

['Hi,',
 'How',
 'are',
 'you?',
 'My',
 'name',
 'is',
 'Jeswin',
 'George.',
 'I',
 'am',
 'studying',
 'spark.']

#### Taking the flatMapped data again and applying flatMap on it

In [24]:
# Each token is a string, and flatMap() iterates over whatever the function returns,
# so returning the token unchanged breaks it up into its individual characters.
w1 = words_flatmap.flatMap(lambda token: token)
w1.collect()

['H',
 'i',
 ',',
 'H',
 'o',
 'w',
 'a',
 'r',
 'e',
 'y',
 'o',
 'u',
 '?',
 'M',
 'y',
 'n',
 'a',
 'm',
 'e',
 'i',
 's',
 'J',
 'e',
 's',
 'w',
 'i',
 'n',
 'G',
 'e',
 'o',
 'r',
 'g',
 'e',
 '.',
 'I',
 'a',
 'm',
 's',
 't',
 'u',
 'd',
 'y',
 'i',
 'n',
 'g',
 's',
 'p',
 'a',
 'r',
 'k',
 '.']

### distinct()

In [25]:
# parallelize() iterates over its argument, so a string becomes an RDD of single characters.
data = sc.parallelize('Welcome to Edureka')
# Count the unique characters (the space and upper/lower-case letters count separately).
data.distinct().count()

14

In [26]:
# List the distinct characters themselves.
data.distinct().collect()

['W', 'e', 'l', 'c', 'o', 'm', ' ', 't', 'E', 'd', 'u', 'r', 'k', 'a']

### sortBy()

In [28]:
# Build an RDD of space-separated tokens, then order them by token length (shortest first).
rdd1 = sc.parallelize('Hello, Welcome to Edureka'.split(' '))
rdd1.sortBy(lambda token: len(token), ascending=True).collect()

['to', 'Hello,', 'Welcome', 'Edureka']

### intersection

In [29]:
# First input for the intersection demo: an RDD whose elements are 3-tuples.
rdd1 = sc.parallelize((
    (1, 'jan', 2016),
    (3, 'nov', 2014),
    (16, 'feb', 2014),
))
rdd1.collect()

[(1, 'jan', 2016), (3, 'nov', 2014), (16, 'feb', 2014)]

In [30]:
# Second input for the intersection demo: shares only the tuple (1, 'jan', 2016) with rdd1.
rdd2 = sc.parallelize((
    (1, 'jan', 2016),
    (5, 'nov', 2014),
    (16, 'mar', 2014),
))
rdd2.collect()

[(1, 'jan', 2016), (5, 'nov', 2014), (16, 'mar', 2014)]

In [31]:
# Keep only the elements present in both RDDs; tuples must match as whole values.
rdd1.intersection(rdd2).collect()

[(1, 'jan', 2016)]

### Union

In [33]:
# Two token RDDs built from space-separated sentences.
input_rdd = sc.parallelize('Hello, Welcome to Edureka'.split(' '))
input2_rdd = sc.parallelize('Hi, Goto to floor 2'.split(' '))
# union() appends the second RDD to the first; duplicates such as 'to' are kept.
input_rdd.union(input2_rdd).collect()

['Hello,', 'Welcome', 'to', 'Edureka', 'Hi,', 'Goto', 'to', 'floor', '2']