# Chapter 12. RDDs

## Creating RDDs

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
    .config("spark.sql.shuffle.partitions", "5")\
    .appName('RDDs')\
    .getOrCreate()

24/08/26 20:20:28 WARN Utils: Your hostname, Khanhs-MAC.local resolves to a loopback address: 127.0.0.1; using 192.168.0.103 instead (on interface en0)
24/08/26 20:20:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 20:20:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Creating RDDs

### Interoperating Between DataFrames, Datasets, and RDDs

In [13]:
spark.range(10).rdd

MapPartitionsRDD[37] at javaToPython at NativeMethodAccessorImpl.java:0

In [14]:
spark.range(10).rdd.toDF()

DataFrame[id: bigint]

### From a Local Collection

In [15]:
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
    .split(" ")

words = spark.sparkContext.parallelize(myCollection, 2)
print(words)

ParallelCollectionRDD[49] at readRDDFromFile at PythonRDD.scala:289


In [16]:
words.setName("myWords")
words.name()

'myWords'

### From Data Sources

In [17]:
rdd = spark.sparkContext.wholeTextFiles("../text")

for line in rdd.collect():
    print(line)

('file:/Users/khanhnn/Developer/DE/spark/practice_spark/text/text1.txt', 'Hello, this is the content of file1.\nIt has multiple lines of text.')
('file:/Users/khanhnn/Developer/DE/spark/practice_spark/text/text2.txt', 'This is the content of file2.\nIt also has multiple lines of text.\nSpark will read this content as a single string.')


## Transformations

### distinct

In [18]:
words.distinct().count()

10

### filter

In [20]:
def startsWithS(individual):
  return individual.startswith("S")

words.filter(lambda word: startsWithS(word)).collect()

['Spark', 'Simple']

### map

In [26]:
words2 = words.map(lambda word: (word, word[0], word.startswith("S")))

words2.filter(lambda record: record[2].take(5))

PythonRDD[62] at RDD at PythonRDD.scala:53

#### flatMap

In [30]:
words.flatMap(lambda word: list(word)).take(5)

['S', 'p', 'a', 'r', 'k']

### sort

In [32]:
words.sortBy(lambda word: len(word) * - 1).take(6)

['Definitive', 'Processing', 'Simple', 'Spark', 'Guide', 'Data']

### Random Splits

In [38]:
fiftyFiftySplit = words.randomSplit([0.5, 0.5])

first_split_result = fiftyFiftySplit[0].take(5)

second_split_result = fiftyFiftySplit[1].take(5)

print("First split results:")
for record in first_split_result:
    print(record)

print("\nSecond split results:")
for record in second_split_result:
    print(record)

First split results:
The
Guide
Big

Second split results:
Spark
Definitive
:
Data
Processing


## Actions

### reduce

In [39]:
spark.sparkContext.parallelize(range(1, 21)).reduce(lambda x, y: x + y)

210

In [45]:
def wordLengthReducer(leftWord, rightWord):
    if len(leftWord) > len(rightWord):
        return leftWord
    else:
        return rightWord

words.reduce(wordLengthReducer)

'Processing'

### count

In [46]:
words.count()

10

### countApprox

In [48]:
confidence = 0.95
timeout_milliseconds = 400

approx_count = words.countApprox(timeout_milliseconds, confidence)

print(approx_count)

10


### countApproxDistinct

In [49]:
words.countApproxDistinct(0.05)

10

### countByValueApprox

### first

In [52]:
words.first()

'Spark'

### max and min

In [56]:
# Create an RDD with numbers from 1 to 20
rdd = spark.sparkContext.parallelize(range(1, 21))

# Find the maximum value
max_value = rdd.max()

# Find the minimum value
min_value = rdd.min()

print(f"Max value: {max_value}")
print(f"Min value: {min_value}")

Max value: 20
Min value: 1


### take

In [57]:
first_five = words.take(5)
print("First 5 elements:", first_five)

first_five_ordered = words.takeOrdered(5)
print("First 5 elements in order:", first_five_ordered)

top_five = words.top(5)
print("Top 5 elements:", top_five)

with_replacement = True
number_to_take = 6
random_seed = 100

sample = words.takeSample(with_replacement, number_to_take, random_seed)
print("Sample of 6 elements:", sample)

First 5 elements: ['Spark', 'The', 'Definitive', 'Guide', ':']
First 5 elements in order: [':', 'Big', 'Data', 'Definitive', 'Guide']
Top 5 elements: ['The', 'Spark', 'Simple', 'Processing', 'Made']
Sample of 6 elements: ['Data', 'Definitive', 'Data', 'The', 'Definitive', 'Spark']


## Saving Files

### saveAsTextFile

In [59]:
words.saveAsTextFile("../tmp/bookTitle")

### SequenceFiles

In [61]:
# Transform the RDD into key-value pairs with None as the key
rdd_with_none_key = words.map(lambda x: (None, x))

# Save the RDD as a SequenceFile
rdd_with_none_key.saveAsSequenceFile("../tmp/my/sequenceFilePath")

                                                                                

## Caching

In [62]:
words.cache()

myWords ParallelCollectionRDD[49] at readRDDFromFile at PythonRDD.scala:289

In [63]:
words.getStorageLevel()

StorageLevel(False, True, False, False, 1)

## Checkpointing

In [65]:
spark.sparkContext.setCheckpointDir("../tmp")
words.checkpoint()

## Pipe RDDs to System Commands

In [66]:
words.pipe("wc -l").collect()

['       5', '       5']

## mapPartitions

In [68]:
words.mapPartitions(lambda part: [1]).sum() # 2

2

In [70]:
def indexedFunc(partitionIndex, withinPartIterator):
  return ["partition: {} => {}".format(partitionIndex, x) for x in withinPartIterator]

words.mapPartitionsWithIndex(indexedFunc).collect()

['partition: 0 => Spark',
 'partition: 0 => The',
 'partition: 0 => Definitive',
 'partition: 0 => Guide',
 'partition: 0 => :',
 'partition: 1 => Big',
 'partition: 1 => Data',
 'partition: 1 => Processing',
 'partition: 1 => Made',
 'partition: 1 => Simple']

### foreachPartition

In [72]:
import os
import random

def save_partition_to_file(iter):
    # Generate a random file name
    random_file_name = random.randint(0, 1000000)
    file_path = f"../tmp/random-file-{random_file_name}.txt"
    
    # Open a file to write the partition data
    with open(file_path, 'w') as file:
        for item in iter:
            file.write(str(item) + '\n')  # Write each item in the partition to the file

# Apply the function to each partition
words.foreachPartition(save_partition_to_file)

### glom

In [75]:
# in Python
spark.sparkContext.parallelize(["Hello", "World"], 2).glom().collect()
# [['Hello'], ['World']]

[['Hello'], ['World']]