In [1]:
from pyspark import SparkContext, SparkConf

# Initializing Spark

In [2]:
AppName, Mode = "RDD_DEMO", "local"
conf = SparkConf().setAppName(AppName).setMaster(Mode)
sc = SparkContext(conf=conf)

In [8]:
print(f"SparkContext: {sc}")

SparkContext: <SparkContext master=local appName=RDD_DEMO>


# Create RDD with Parallelized Collections

In [64]:
data = [
    [1, 2, 3], 
    [4, 5],
    [6],
    [7, 8, 9, 10]
]
persist_data = sc.parallelize(data).persist()
print(f"Type: {type(persist_data)}, Data: {persist_data.collect()}")

Type: <class 'pyspark.rdd.RDD'>, Data: [[1, 2, 3], [4, 5], [6], [7, 8, 9, 10]]


# RDD Operations

In [71]:
# Basic Operation 
print(f" first: {persist_data.first()}")
print(f" take: {persist_data.take(2)}")
print(f" count: {persist_data.count()}")
print(f" collect: {persist_data.collect()}")

 first: [1, 2, 3]
 take: [[1, 2, 3], [4, 5]]
 count: 4
 collect: [[1, 2, 3], [4, 5], [6], [7, 8, 9, 10]]


In [73]:
# Passing Functions to Spark with Lambda
PlusOne = persist_data.map(lambda x: [i+1 for i in x])
print(f" PlusOne: {PlusOne.collect()}")

 PlusOne: [[2, 3, 4], [5, 6], [7], [8, 9, 10, 11]]


In [80]:
# Passing Functions to Spark
from typing import Optional, List

def plusone(nums: List[int]) -> list:
    return [i+1 for i in nums]    
def minusone(nums: List[int]) -> list:
    return [i-1 for i in nums]  
 
result_1 = persist_data.map(plusone)
result_2 = persist_data.map(plusone)
print(f" Result_1: {result_1.collect()}")
print(f" Result_2: {result_2.collect()}")

 Result_1: [[2, 3, 4], [5, 6], [7], [8, 9, 10, 11]]
 Result_2: [[2, 3, 4], [5, 6], [7], [8, 9, 10, 11]]


In [78]:
# class MyComputer:
#     def plusone(nums: List[int]) -> list:
#         return [i+1 for i in nums]
#     def minusone(nums: List[int]) -> list:
#         return [i-1 for i in nums]

<bound method MyComputer.plusone of <__main__.MyComputer object at 0xffff6184bdc0>>