In [10]:
import datetime
# Use reduce to apply combine to numbers
from functools import reduce
from operator import add

from pyspark.sql import SparkSession
from termcolor import cprint

# Start the SparkSession used by the cells below, or reuse one that already
# exists, under the application name 'reduce'.
spark = (
    SparkSession.builder
    .appName('reduce')
    .getOrCreate()
)

### Reduce
Reduce takes a function `f` and a sequence (for example, a list) as input. The function `f` takes two parameters: the result accumulated so far and the next element of the sequence. `reduce` applies `f` repeatedly, folding the whole sequence into a single value. 
The reduce() function accepts a function and a sequence and returns a single value calculated as follows:

1) Initially, the function is called with the first two items from the sequence and the result is returned.
2) The function is then called again with the result obtained in step 1 and the next value in the sequence. This process keeps repeating until there are no items left in the sequence, and the final result is returned.  
Let’s take an example:

In [11]:
# Sample input: a short list of integers to fold.
numbers = [1, 4, 6, 2, 9, 10]


def combine(x, y):
    """Return the string "(x, y)" built from the str() forms of both arguments.

    The result is a string that only looks like a tuple, so passing it back in
    as ``x`` nests the parentheses and makes the pairing order visible.
    """
    return f"({x!s}, {y!s})"

# Echo the input in red, then fold it from the left with combine so the
# nesting of the result shows the order in which reduce pairs the elements.
cprint(numbers, color='red')
reduce(combine, numbers)

[31m[1, 4, 6, 2, 9, 10][0m


'(((((1, 4), 6), 2), 9), 10)'

In [12]:
# Fold the list with the pairing function written inline as a lambda.
numbers = [1, 4, 6, 2, 9, 10]

cprint(numbers, color='red')
reduce(lambda left, right: f"({left!s}, {right!s})", numbers)

[31m[1, 4, 6, 2, 9, 10][0m


'(((((1, 4), 6), 2), 9), 10)'

### reduce(f)
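Reduces the elements of this RDD using the specified commutative and associative binary operator. Currently reduces partitions locally. If the operator is not both commutative and associative, the result depends on how the data is split across partitions.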

In [13]:
# Sum the RDD's elements with operator.add (imported in the notebook's import
# cell); addition is commutative and associative, as RDD.reduce expects.
spark.sparkContext.parallelize([1, 2, 3, 4, 5]).reduce(add)

15

In [14]:
# combine is neither commutative nor associative, so RDD.reduce would nest the
# pairs differently depending on how the data is split across partitions.
# Pin the data to a single partition so the fold runs strictly left to right
# and the result is reproducible on any machine.
spark.sparkContext.parallelize(numbers, numSlices=1).reduce(combine)

'(((((1, 4), 6), 2), 9), 10)'

In [15]:
# Distribute the integers 1..9 over 4 partitions and print wall-clock
# timestamps around the reduce. datetime comes from the notebook's import cell.
data = range(1, 10)
distData = spark.sparkContext.parallelize(data, 4)

cprint(f"start {datetime.datetime.now()}", 'red')
a = distData.reduce(lambda x, y: x + y)
print(f"distData.reduce(lambda x,y:x+y) = {a}")
cprint(f"ends {datetime.datetime.now()}", 'red')


[31mstart 2025-08-08 21:05:16.963551[0m
distData.reduce(lambda x,y:x+y) = 45
[31mends 2025-08-08 21:05:17.273518[0m


### `aggregate`: similar to __`reduce`__, but takes a zero value plus separate `seqOp` and `combOp` functions

In [16]:
# seqOp folds each element into a partition's running total and combOp merges
# the per-partition totals; both are plain addition here, starting from 0.
seqOp = lambda x, y: x + y
combOp = lambda x, y: x + y

cprint(f"start {datetime.datetime.now()}", color='green')
b = distData.aggregate(0, seqOp, combOp)
print(b)
cprint(f"ends {datetime.datetime.now()}", color='green')


[32mstart 2025-08-08 21:05:17.288075[0m
45
[32mends 2025-08-08 21:05:17.553770[0m


In [17]:
# Plain-Python cross-check: range(1, 10) yields 1..9 (the upper bound is
# exclusive), and their sum is n*(n-1)/2 = 45 for n = 10.
data = range(1, 10)
reduce(lambda x, y: x + y, data)

45

### Finding the maximum number in a list with `reduce`

In [18]:

# reduce is already imported in the notebook's first cell.
# Keep the larger value of each pair; folding the whole list leaves its maximum.
items = [1, 24, 17, 14, 9, 32, 2]
all_max = reduce(lambda a, b: a if a > b else b, items)

# Interpolate the list itself so the message cannot drift from the data.
cprint(f"all_max of  {items} = {all_max}", 'cyan')

[36mall_max of  [1, 24, 17, 14, 9, 32, 2] = 32[0m
