In [0]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session that drives this demo
spark = (
    SparkSession.builder
    .appName("RDDOperations")
    .getOrCreate()
)
spark_context = spark.sparkContext

# Source RDD: a small list of sentences handed to parallelize
data = ["Hello world", "Spark is fun", "Learn PySpark"]
rdd = spark_context.parallelize(data)

# 1. flatMap: every sentence is split on whitespace and the words are flattened
flat_rdd = rdd.flatMap(lambda sentence: sentence.split())

# 2. map: pair each word with the count 1 -> (word, 1)
mapped_rdd = flat_rdd.map(lambda word: (word, 1))

# 3. filter: retain only the pairs whose word is longer than 4 characters
filtered_rdd = mapped_rdd.filter(lambda pair: len(pair[0]) > 4)

# 4. join: match the (word, 1) pairs against a second keyed RDD
data2 = [("Hello", "greeting"), ("Spark", "framework"), ("world", "global")]
rdd2 = spark_context.parallelize(data2)
joined_rdd = mapped_rdd.join(rdd2)

# 5. reduceByKey: add up the 1s per word to get occurrence counts
word_counts = mapped_rdd.reduceByKey(lambda left, right: left + right)

# 6. groupByKey: gather the values of each word into a list
#    (shown for demonstration; less efficient than reduceByKey)
grouped_rdd = mapped_rdd.groupByKey().mapValues(list)

# Collect every RDD to the driver and print it, in the order built above
labelled_results = [
    ("flatMap result:", flat_rdd),
    ("map result:", mapped_rdd),
    ("filter result:", filtered_rdd),
    ("join result:", joined_rdd),
    ("reduceByKey result:", word_counts),
    ("groupByKey result:", grouped_rdd),
]
for label, result_rdd in labelled_results:
    print(label, result_rdd.collect())

'''
Explanation:
parallelize: Distributes the input list across the Spark cluster.
flatMap: Transforms each element into multiple elements (e.g., splits “Hello world” into [“Hello”, “world”]).
map: Transforms each element into a new form (e.g., (word, 1) pairs).
filter: Keeps elements meeting a condition (e.g., word length > 4).
join: Combines two RDDs based on keys.
reduceByKey: Aggregates values for each key (efficient for large data).
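groupByKey: Groups all values for each key (less efficient than reduceByKey, because every value is shuffled across the network without being combined within each partition first).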
'''

flatMap result: ['Hello', 'world', 'Spark', 'is', 'fun', 'Learn', 'PySpark']
map result: [('Hello', 1), ('world', 1), ('Spark', 1), ('is', 1), ('fun', 1), ('Learn', 1), ('PySpark', 1)]
filter result: [('Hello', 1), ('world', 1), ('Spark', 1), ('Learn', 1), ('PySpark', 1)]
join result: [('Hello', (1, 'greeting')), ('Spark', (1, 'framework')), ('world', (1, 'global'))]
reduceByKey result: [('Hello', 1), ('Spark', 1), ('fun', 1), ('Learn', 1), ('world', 1), ('is', 1), ('PySpark', 1)]
groupByKey result: [('Hello', [1]), ('Spark', [1]), ('fun', [1]), ('Learn', [1]), ('world', [1]), ('is', [1]), ('PySpark', [1])]
Out[4]: '\nExplanation : parallelize: Distributes the input list across the Spark cluster.\nflatMap: Transforms each element into multiple elements (e.g., splits “Hello world” into [“Hello”, “world”]).\nmap: Transforms each element into a new form (e.g., (word, 1) pairs).\nfilter: Keeps elements meeting a condition (e.g., word length > 4).\njoin: Combines two RDDs based on keys.\nre