In [82]:
from typing import List

from pyspark import SparkContext, SparkConf

# Initializing Spark

In [2]:
# Application name and master URL used to configure the demo context.
AppName, Mode = "RDD_DEMO", "local"
conf = SparkConf().setAppName(AppName).setMaster(Mode)
# getOrCreate() hands back the already-running context when there is one, so
# re-executing this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [81]:
# Show the context's repr to confirm which master and application name are in use.
print(f"SparkContext: {sc}")

SparkContext: <SparkContext master=local appName=RDD_DEMO>


# Create RDD with Parallelized Collections

In [64]:
# Sample records: nested lists of varying length.
data = [[1, 2, 3], [4, 5], [6], [7, 8, 9, 10]]
# Distribute the records as an RDD and mark it as persisted for the cells below.
persist_data = sc.parallelize(data).persist()
print(f"Type: {type(persist_data)}, Data: {persist_data.collect()}")

Type: <class 'pyspark.rdd.RDD'>, Data: [[1, 2, 3], [4, 5], [6], [7, 8, 9, 10]]


# RDD Operations

In [100]:
# Basic operations (actions that bring results back to the driver)
# first(): the first element of the RDD.
print(f" first: {persist_data.first()}")
# take(2): the first two elements, as a list.
print(f" take: {persist_data.take(2)}")
# count(): the number of elements (inner lists) in the RDD.
print(f" count: {persist_data.count()}")
# collect(): every element of the RDD, as a list.
print(f" collect: {persist_data.collect()}")
# countByKey(): this RDD holds plain lists rather than (key, value) pairs, so
# the first item of each list ends up being treated as the key (1, 4, 6 and 7
# in the output below).
print(f" countByKey: {persist_data.countByKey()}")

 first: [1, 2, 3]
 take: [[1, 2, 3], [4, 5]]
 count: 4
 collect: [[1, 2, 3], [4, 5], [6], [7, 8, 9, 10]]
 countByKey: defaultdict(<class 'int'>, {1: 1, 4: 1, 6: 1, 7: 1})


In [93]:
# map: transform each inner list into a new list with every value incremented,
# keeping one output list per input list.
result = persist_data.map(lambda row: [value + 1 for value in row])
print(f" Result: {result.collect()}")

 Result: [[2, 3, 4], [5, 6], [7], [8, 9, 10, 11]]


In [94]:
# flatMap: same increment as the map example, but the per-list results are
# flattened into a single sequence of numbers.
result = persist_data.flatMap(lambda row: [value + 1 for value in row])
print(f" Result: {result.collect()}")

 Result: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [95]:
# filter: keep only the inner lists that contain the value 7.
result = persist_data.filter(lambda row: 7 in row)
print(f" Result: {result.collect()}")

 Result: [[7, 8, 9, 10]]


In [122]:
# foreach
# foreach() runs the given function on every element purely for its side
# effects. The function executes inside Spark tasks, so incrementing a plain
# driver-side variable from it is not reliable; an accumulator is used instead
# to carry the tally back to the driver.
non_empty_lists = sc.accumulator(0)


def acc(nums: List[int]) -> None:
    """Add one to the ``non_empty_lists`` accumulator when ``nums`` has items."""
    if len(nums) > 0:
        non_empty_lists.add(1)


def f(x):
    """Print a single RDD element."""
    print(x)


persist_data.foreach(f)
persist_data.foreach(acc)
# Read the accumulated total back on the driver once the action has finished.
count = non_empty_lists.value
persist_data.collect()

[[1, 2, 3], [4, 5], [6], [7, 8, 9, 10]]

In [96]:
# Passing Functions to Spark
from typing import Optional, List

class MyComputer:
    """Namespace for small list transformations that can be handed to Spark."""

    @staticmethod
    def plusone(nums: List[int]) -> List[int]:
        """Return a new list with every item of ``nums`` increased by one."""
        return [i+1 for i in nums]

    @staticmethod
    def minusone(nums: List[int]) -> List[int]:
        """Return a new list with every item of ``nums`` decreased by one."""
        return [i-1 for i in nums]

# Hand the class-level functions to map() directly instead of wrapping them in lambdas.
result_1 = persist_data.map(MyComputer.plusone)
result_2 = persist_data.map(MyComputer.minusone)
# collect() brings each transformed RDD back to the driver for display.
print(f" Result_1: {result_1.collect()}")
print(f" Result_2: {result_2.collect()}")

 Result_1: [[2, 3, 4], [5, 6], [7], [8, 9, 10, 11]]
 Result_2: [[0, 1, 2], [3, 4], [5], [6, 7, 8, 9]]
