In [None]:
#1] Create a Spark RDD using 5 different Functions

#Solution:

import os
import shutil
from os.path import basename
from pyspark.sql import SparkSession

# --- Spark entry points: one local session plus its underlying context ---
spark = (
    SparkSession.builder
    .appName("RDD_Creation_Fixed")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

# --- Scratch directory: every run starts from an empty folder ---
WORK_DIR = "/tmp/spark_rdd_demo_fixed"
if os.path.exists(WORK_DIR):
    # Throw away whatever an earlier run left behind.
    shutil.rmtree(WORK_DIR)
os.makedirs(WORK_DIR)
# Make the scratch folder the process's current directory; the banner below
# reports it through os.getcwd().
os.chdir(WORK_DIR)

print(f"Working directory: {os.getcwd()}\n")

# ==============================================================================
# 1) sc.parallelize
# ==============================================================================
print("=== 1) sc.parallelize ===")
# Distribute an in-memory Python list over two partitions.
rdd1 = sc.parallelize(
    [1, 2, 3, 4, 5],
    numSlices=2,
)
print(f"Partitions: {rdd1.getNumPartitions()}")
# collect() pulls every element back to the driver; fine for five integers.
print(f"Data: {rdd1.collect()}\n")

# ==============================================================================
# 2) sc.textFile
# ==============================================================================
print("=== 2) sc.textFile ===")
log_file_path = os.path.join(WORK_DIR, "logs.txt")

# Build the sample log first: an INFO, WARN and ERROR line for each of the
# ids 1..3 (nine lines in total), then write it out in one call.
log_lines = []
for i in range(1, 4):
    log_lines.append(f"INFO log{i}: hello world\n")
    log_lines.append(f"WARN log{i}: something to note\n")
    log_lines.append(f"ERROR log{i}: something went wrong\n")
with open(log_file_path, "w") as f:
    f.writelines(log_lines)

# Each line of the file becomes one RDD element.
rdd2 = sc.textFile(log_file_path, minPartitions=3)
print(f"Partitions: {rdd2.getNumPartitions()}")
print(f"Line count: {rdd2.count()}")
print(f"Sample lines: {rdd2.take(3)}\n")

# ==============================================================================
# 3) sc.wholeTextFiles
# ==============================================================================
print("=== 3) sc.wholeTextFiles ===")
conf_dir = os.path.join(WORK_DIR, "configs")
os.makedirs(conf_dir)

# Two small config files, written in the order listed.
for conf_name, conf_text in (
    ("db.conf", "user=root\npassword=secret\n"),
    ("app.conf", "threads=4\nport=8080\n"),
):
    with open(os.path.join(conf_dir, conf_name), "w") as f:
        f.write(conf_text)

# Unlike textFile, each element here is a (file path, whole file content) pair.
rdd3 = sc.wholeTextFiles(conf_dir, minPartitions=2)
print(f"Partitions: {rdd3.getNumPartitions()}")
# NOTE: len() on the text counts characters; that equals the byte size only
# because the sample files are pure ASCII.
files_and_sizes = rdd3.map(
    lambda path_and_text: (basename(path_and_text[0]), len(path_and_text[1]))
).collect()
print(f"Files and sizes (bytes): {files_and_sizes}\n")

# ==============================================================================
# 4) sc.range
# ==============================================================================
print("=== 4) sc.range ===")
# Integers starting at 0 in steps of 5; the end (100) is exclusive, so the
# last value is 95. The data is spread over four partitions.
rdd4 = sc.range(0, 100, step=5, numSlices=4)
print(f"Partitions: {rdd4.getNumPartitions()}")
# take(10) fetches only the first ten elements instead of the whole RDD.
print(f"First 10 values: {rdd4.take(10)}\n")

# ==============================================================================
# 5) sc.binaryFiles (FIXED)
# ==============================================================================
print("=== 5) sc.binaryFiles ===")
binary_dir = os.path.join(WORK_DIR, "binaries")
os.makedirs(binary_dir)

# Stand-in binary files filled with random bytes: a 1 KB "image" and a 2 KB
# data blob, written in the order listed.
for blob_name, blob_size in (("icon.png", 1024), ("data.bin", 2048)):
    with open(os.path.join(binary_dir, blob_name), "wb") as f:
        f.write(os.urandom(blob_size))

rdd5 = sc.binaryFiles(binary_dir, minPartitions=1)
print(f"Partitions: {rdd5.getNumPartitions()}")

# --- THE FIX IS APPLIED HERE ---
# The value of each (path, content) pair is already a bytes object, so its
# size is len(content); there is no .read() to call on it.
sizes = rdd5.map(
    lambda path_and_blob: (basename(path_and_blob[0]), len(path_and_blob[1]))
).collect()
print("Binary file sizes (bytes):", sizes)

# ==============================================================================
# --- Clean up ---
# ==============================================================================
# Stop via the session rather than only the context, so the session itself is
# shut down as the message below states (this also stops the SparkContext).
spark.stop()
# The setup step made WORK_DIR the current directory. Step out to its parent
# before deleting it: otherwise the process is left with a dangling working
# directory (later os.getcwd() calls fail), and on platforms that refuse to
# delete the current directory the removal itself fails.
os.chdir(os.path.dirname(WORK_DIR))
shutil.rmtree(WORK_DIR)
print("\nSpark session stopped and temporary directory cleaned up.")

Working directory: /tmp/spark_rdd_demo_fixed

=== 1) sc.parallelize ===
Partitions: 2
Data: [1, 2, 3, 4, 5]

=== 2) sc.textFile ===
Partitions: 3
Line count: 9
Sample lines: ['INFO log1: hello world', 'WARN log1: something to note', 'ERROR log1: something went wrong']

=== 3) sc.wholeTextFiles ===
Partitions: 2
Files and sizes (bytes): [('db.conf', 26), ('app.conf', 20)]

=== 4) sc.range ===
Partitions: 4
First 10 values: [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]

=== 5) sc.binaryFiles ===
Partitions: 1
Binary file sizes (bytes): [('data.bin', 2048), ('icon.png', 1024)]

Spark session stopped and temporary directory cleaned up.


The error you are encountering, `PicklingError: Could not serialize object: IndexError: tuple index out of range`, often happens in Spark when it tries to send a function or object to its workers for execution, but the object cannot be properly serialized (or "pickled").

In interactive environments like notebooks, this can be related to how the Spark context or session is created and how the functions are defined and executed within the notebook's scope. The `IndexError: tuple index out of range` within the `cloudpickle` library suggests an issue during the inspection of the function's code or globals for serialization.

While the exact root cause can be complex and environment-dependent, ensuring the Spark session and context are correctly initialized and managed is crucial.

Here's the code again, structured to minimize potential serialization issues in a notebook environment:

In [None]:
#2] Write example for following Spark RDD Actions:
#a. aggregate  b. treeAggregate c. fold
#d. reduce  e. collect

#Solution:

# Spark RDD Actions Demo:
# a) aggregate      -> aggregate with different input/output types (e.g., (sum, count) to compute avg)
# b) treeAggregate  -> hierarchical/Tree-style aggregate (same API as aggregate + depth)
# c) fold           -> like aggregate but zero value is same type as elements and uses one binary op
# d) reduce         -> combine elements using an associative+commutative function
# e) collect        -> bring all elements to driver (use only for small RDDs)

from pyspark.sql import SparkSession
from operator import add, mul

# Local session for the actions demo; only ERROR-level Spark logs are shown so
# the printed results stay readable.
spark = (
    SparkSession.builder
    .appName("RDD Actions Demo")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Sample RDD: the integers 1..10 spread over four partitions.
sample_values = list(range(1, 11))
rdd = sc.parallelize(sample_values, numSlices=4)
print(f"RDD partitions: {rdd.getNumPartitions()}")
# take(5) previews the first few elements without collecting everything.
print(f"RDD data (small preview via take): {rdd.take(5)}")

# a) aggregate: compute (sum, count) then average
# The starting accumulator is a (running sum, element count) pair. It has to be
# neutral for both seqOp and combOp, so both components start at 0.
SUM_START, COUNT_START = 0, 0
zero = (SUM_START, COUNT_START)

def seq_op(acc, x):
    """Fold one element into a ``(sum, count)`` accumulator.

    Args:
        acc: Two-item ``(running sum, element count)`` pair.
        x: The element to add to the running sum.

    Returns:
        A new ``(sum, count)`` tuple with ``x`` added and the count
        incremented by one.
    """
    running_sum, seen = acc
    return running_sum + x, seen + 1

def comb_op(a, b):
    """Merge two ``(sum, count)`` accumulators into one.

    Args:
        a: Two-item ``(sum, count)`` pair.
        b: Two-item ``(sum, count)`` pair.

    Returns:
        A new ``(sum, count)`` tuple holding the component-wise totals.
    """
    left_sum, left_count = a
    right_sum, right_count = b
    return left_sum + right_sum, left_count + right_count

# seq_op folds elements into the accumulator, comb_op merges the resulting
# accumulators, and both start from `zero`.
sum_count = rdd.aggregate(zeroValue=zero, seqOp=seq_op, combOp=comb_op)
agg_total, agg_count = sum_count
avg_via_aggregate = agg_total / agg_count
print(
    "\n[a] aggregate -> (sum, count):",
    sum_count,
    "average:",
    avg_via_aggregate,
)

# b) treeAggregate: the same (sum, count) computation, but the partial results
# are merged in a multi-level tree pattern; `depth` sets the suggested depth
# of that tree.
sum_count_tree = rdd.treeAggregate(
    zeroValue=zero,
    seqOp=seq_op,
    combOp=comb_op,
    depth=3,
)
tree_total, tree_count = sum_count_tree
avg_via_treeAggregate = tree_total / tree_count
print(
    "[b] treeAggregate -> (sum, count):",
    sum_count_tree,
    "average:",
    avg_via_treeAggregate,
)

# c) fold: one associative operation used for every merge step. The zero value
# has the same type as the elements and must be the operation's identity.
fold_sum = rdd.fold(zeroValue=0, op=add)    # 0 is the identity for +
fold_prod = rdd.fold(zeroValue=1, op=mul)   # 1 is the identity for *
print(
    "[c] fold -> sum:",
    fold_sum,
    "product:",
    fold_prod,
)

# d) reduce: collapse the RDD to a single value with a binary function; the
# function should be associative and commutative. No zero value is supplied.
reduce_sum = rdd.reduce(f=add)
reduce_max = rdd.reduce(f=max)
print(
    "[d] reduce -> sum:",
    reduce_sum,
    "max:",
    reduce_max,
)

# e) collect: materialise every element as a Python list on the driver.
# Only appropriate for small RDDs such as this ten-element sample.
all_data = rdd.collect()
print(
    "[e] collect ->",
    all_data,
)

# Demo finished: shut the session down.
spark.stop()

RDD partitions: 4
RDD data (small preview via take): [1, 2, 3, 4, 5]

[a] aggregate -> (sum, count): (55, 10) average: 5.5
[b] treeAggregate -> (sum, count): (55, 10) average: 5.5
[c] fold -> sum: 55 product: 3628800
[d] reduce -> sum: 55 max: 10
[e] collect -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
