In [1]:
# Import PySpark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.\
        builder.\
        appName("RDD-Demo").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        config("spark.eventLog.enabled", "true").\
        config("spark.eventLog.dir", "file:///opt/workspace/events").\
        getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/13 11:40:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
numbers = [1, 2, 3, 4, 5]
rdd = spark.sparkContext.parallelize(numbers)

In [4]:
sc = spark.sparkContext

In [5]:
sc

In [6]:
# Collect action: Retrieve all elements of the RDD
rdd.collect()

                                                                                

[1, 2, 3, 4, 5]

In [7]:
# Create an RDD from a list of tuples
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35), ("Alice", 40)]
rdd = spark.sparkContext.parallelize(data)

In [10]:
# Collect action: Retrieve all elements of the RDD
print("All elements of the rdd: ", rdd.collect())

All elements of the rdd:  [('Alice', 25), ('Bob', 30), ('Charlie', 35), ('Alice', 40)]


In [11]:
# Count action: Count the number of elements in the RDD
count = rdd.count()
print("The total number of elements in rdd: ", count)



The total number of elements in rdd:  4


                                                                                

In [12]:
# First action: Retrieve the first element of the RDD
first_element = rdd.first()
print("The first element of the rdd: ", first_element)

The first element of the rdd:  ('Alice', 25)


                                                                                

In [13]:
# Take action: Retrieve the n elements of the RDD
taken_elements = rdd.take(2)
print("The first two elements of the rdd: ", taken_elements)

The first two elements of the rdd:  [('Alice', 25), ('Bob', 30)]


In [14]:
# Foreach action: Print each element of the RDD
rdd.foreach(lambda x: print(x))

                                                                                

In [15]:
# Map transformation: Convert name to uppercase
mapped_rdd = rdd.map(lambda x: (x[0].upper(), x[1]))

In [16]:
result = mapped_rdd.collect()
print("rdd with uppercease name: ", result)

rdd with uppercease name:  [('ALICE', 25), ('BOB', 30), ('CHARLIE', 35), ('ALICE', 40)]


In [17]:
# Filter transformation: Filter records where age is greater than 30
filtered_rdd = rdd.filter(lambda x: x[1] > 30)
filtered_rdd.collect()

                                                                                

[('Charlie', 35), ('Alice', 40)]

In [18]:
# ReduceByKey transformation: Calculate the total age for each name
reduced_rdd = rdd.reduceByKey(lambda x, y: x + y)
reduced_rdd.collect()

                                                                                

[('Charlie', 35), ('Alice', 65), ('Bob', 30)]

In [19]:
# SortBy transformation: Sort the RDD by age in descending order
sorted_rdd = rdd.sortBy(lambda x: x[1], ascending=False)
sorted_rdd.collect()

                                                                                

[('Alice', 40), ('Charlie', 35), ('Bob', 30), ('Alice', 25)]

In [21]:
# Save action: Save the RDD to a text file
rdd.saveAsTextFile("/opt/workspace/dataout/output.txt")

                                                                                

In [22]:
# create rdd from text file
rdd_text = spark.sparkContext.textFile("/opt/workspace/dataout/output.txt")
rdd_text.collect()

["('Bob', 30)", "('Alice', 25)", "('Charlie', 35)", "('Alice', 40)"]

In [23]:
spark.stop()
